xref: /openbmc/qemu/fpu/softfloat.c (revision 41bd3603)
1 /*
2  * QEMU float support
3  *
4  * Derived from SoftFloat.
5  */
6 
7 /*============================================================================
8 
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11 
12 Written by John R. Hauser.  This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704.  Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980.  The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21 
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30 
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35 
36 =============================================================================*/
37 
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39  * target-dependent and needs the TARGET_* macros.
40  */
41 #include "config.h"
42 
43 #include "softfloat.h"
44 
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations.  (Can be specialized to target if
48 | desired.)
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
51 
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine:  (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output.  These details are target-
58 | specific.
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
61 
62 void set_float_rounding_mode(int val STATUS_PARAM)
63 {
64     STATUS(float_rounding_mode) = val;
65 }
66 
67 void set_float_exception_flags(int val STATUS_PARAM)
68 {
69     STATUS(float_exception_flags) = val;
70 }
71 
72 void set_floatx80_rounding_precision(int val STATUS_PARAM)
73 {
74     STATUS(floatx80_rounding_precision) = val;
75 }
76 
77 /*----------------------------------------------------------------------------
78 | Returns the fraction bits of the half-precision floating-point value `a'.
79 *----------------------------------------------------------------------------*/
80 
81 INLINE uint32_t extractFloat16Frac(float16 a)
82 {
83     return float16_val(a) & 0x3ff;
84 }
85 
86 /*----------------------------------------------------------------------------
87 | Returns the exponent bits of the half-precision floating-point value `a'.
88 *----------------------------------------------------------------------------*/
89 
90 INLINE int16 extractFloat16Exp(float16 a)
91 {
92     return (float16_val(a) >> 10) & 0x1f;
93 }
94 
95 /*----------------------------------------------------------------------------
96 | Returns the sign bit of the single-precision floating-point value `a'.
97 *----------------------------------------------------------------------------*/
98 
99 INLINE flag extractFloat16Sign(float16 a)
100 {
101     return float16_val(a)>>15;
102 }
103 
104 /*----------------------------------------------------------------------------
105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106 | and 7, and returns the properly rounded 32-bit integer corresponding to the
107 | input.  If `zSign' is 1, the input is negated before being converted to an
108 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
109 | is simply rounded to an integer, with the inexact exception raised if the
110 | input cannot be represented exactly as an integer.  However, if the fixed-
111 | point input is too large, the invalid exception is raised and the largest
112 | positive or negative integer is returned.
113 *----------------------------------------------------------------------------*/
114 
115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
116 {
117     int8 roundingMode;
118     flag roundNearestEven;
119     int8 roundIncrement, roundBits;
120     int32 z;
121 
122     roundingMode = STATUS(float_rounding_mode);
123     roundNearestEven = ( roundingMode == float_round_nearest_even );
124     roundIncrement = 0x40;
125     if ( ! roundNearestEven ) {
126         if ( roundingMode == float_round_to_zero ) {
127             roundIncrement = 0;
128         }
129         else {
130             roundIncrement = 0x7F;
131             if ( zSign ) {
132                 if ( roundingMode == float_round_up ) roundIncrement = 0;
133             }
134             else {
135                 if ( roundingMode == float_round_down ) roundIncrement = 0;
136             }
137         }
138     }
139     roundBits = absZ & 0x7F;
140     absZ = ( absZ + roundIncrement )>>7;
141     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142     z = absZ;
143     if ( zSign ) z = - z;
144     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145         float_raise( float_flag_invalid STATUS_VAR);
146         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
147     }
148     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149     return z;
150 
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 | `absZ1', with binary point between bits 63 and 64 (between the input words),
156 | and returns the properly rounded 64-bit integer corresponding to the input.
157 | If `zSign' is 1, the input is negated before being converted to an integer.
158 | Ordinarily, the fixed-point input is simply rounded to an integer, with
159 | the inexact exception raised if the input cannot be represented exactly as
160 | an integer.  However, if the fixed-point input is too large, the invalid
161 | exception is raised and the largest positive or negative integer is
162 | returned.
163 *----------------------------------------------------------------------------*/
164 
165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
166 {
167     int8 roundingMode;
168     flag roundNearestEven, increment;
169     int64 z;
170 
171     roundingMode = STATUS(float_rounding_mode);
172     roundNearestEven = ( roundingMode == float_round_nearest_even );
173     increment = ( (int64_t) absZ1 < 0 );
174     if ( ! roundNearestEven ) {
175         if ( roundingMode == float_round_to_zero ) {
176             increment = 0;
177         }
178         else {
179             if ( zSign ) {
180                 increment = ( roundingMode == float_round_down ) && absZ1;
181             }
182             else {
183                 increment = ( roundingMode == float_round_up ) && absZ1;
184             }
185         }
186     }
187     if ( increment ) {
188         ++absZ0;
189         if ( absZ0 == 0 ) goto overflow;
190         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191     }
192     z = absZ0;
193     if ( zSign ) z = - z;
194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
195  overflow:
196         float_raise( float_flag_invalid STATUS_VAR);
197         return
198               zSign ? (int64_t) LIT64( 0x8000000000000000 )
199             : LIT64( 0x7FFFFFFFFFFFFFFF );
200     }
201     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202     return z;
203 
204 }
205 
206 /*----------------------------------------------------------------------------
207 | Returns the fraction bits of the single-precision floating-point value `a'.
208 *----------------------------------------------------------------------------*/
209 
210 INLINE uint32_t extractFloat32Frac( float32 a )
211 {
212 
213     return float32_val(a) & 0x007FFFFF;
214 
215 }
216 
217 /*----------------------------------------------------------------------------
218 | Returns the exponent bits of the single-precision floating-point value `a'.
219 *----------------------------------------------------------------------------*/
220 
221 INLINE int16 extractFloat32Exp( float32 a )
222 {
223 
224     return ( float32_val(a)>>23 ) & 0xFF;
225 
226 }
227 
228 /*----------------------------------------------------------------------------
229 | Returns the sign bit of the single-precision floating-point value `a'.
230 *----------------------------------------------------------------------------*/
231 
232 INLINE flag extractFloat32Sign( float32 a )
233 {
234 
235     return float32_val(a)>>31;
236 
237 }
238 
239 /*----------------------------------------------------------------------------
240 | If `a' is denormal and we are in flush-to-zero mode then set the
241 | input-denormal exception and return zero. Otherwise just return the value.
242 *----------------------------------------------------------------------------*/
243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
244 {
245     if (STATUS(flush_inputs_to_zero)) {
246         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247             float_raise(float_flag_input_denormal STATUS_VAR);
248             return make_float32(float32_val(a) & 0x80000000);
249         }
250     }
251     return a;
252 }
253 
254 /*----------------------------------------------------------------------------
255 | Normalizes the subnormal single-precision floating-point value represented
256 | by the denormalized significand `aSig'.  The normalized exponent and
257 | significand are stored at the locations pointed to by `zExpPtr' and
258 | `zSigPtr', respectively.
259 *----------------------------------------------------------------------------*/
260 
261 static void
262  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
263 {
264     int8 shiftCount;
265 
266     shiftCount = countLeadingZeros32( aSig ) - 8;
267     *zSigPtr = aSig<<shiftCount;
268     *zExpPtr = 1 - shiftCount;
269 
270 }
271 
272 /*----------------------------------------------------------------------------
273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 | single-precision floating-point value, returning the result.  After being
275 | shifted into the proper positions, the three fields are simply added
276 | together to form the result.  This means that any integer portion of `zSig'
277 | will be added into the exponent.  Since a properly normalized significand
278 | will have an integer portion equal to 1, the `zExp' input should be 1 less
279 | than the desired result exponent whenever `zSig' is a complete, normalized
280 | significand.
281 *----------------------------------------------------------------------------*/
282 
283 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
284 {
285 
286     return make_float32(
287           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
288 
289 }
290 
291 /*----------------------------------------------------------------------------
292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 | and significand `zSig', and returns the proper single-precision floating-
294 | point value corresponding to the abstract input.  Ordinarily, the abstract
295 | value is simply rounded and packed into the single-precision format, with
296 | the inexact exception raised if the abstract input cannot be represented
297 | exactly.  However, if the abstract value is too large, the overflow and
298 | inexact exceptions are raised and an infinity or maximal finite value is
299 | returned.  If the abstract value is too small, the input value is rounded to
300 | a subnormal number, and the underflow and inexact exceptions are raised if
301 | the abstract input cannot be represented exactly as a subnormal single-
302 | precision floating-point number.
303 |     The input significand `zSig' has its binary point between bits 30
304 | and 29, which is 7 bits to the left of the usual location.  This shifted
305 | significand must be normalized or smaller.  If `zSig' is not normalized,
306 | `zExp' must be 0; in that case, the result returned is a subnormal number,
307 | and it must not require rounding.  In the usual case that `zSig' is
308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 | The handling of underflow and overflow follows the IEC/IEEE Standard for
310 | Binary Floating-Point Arithmetic.
311 *----------------------------------------------------------------------------*/
312 
313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
314 {
315     int8 roundingMode;
316     flag roundNearestEven;
317     int8 roundIncrement, roundBits;
318     flag isTiny;
319 
320     roundingMode = STATUS(float_rounding_mode);
321     roundNearestEven = ( roundingMode == float_round_nearest_even );
322     roundIncrement = 0x40;
323     if ( ! roundNearestEven ) {
324         if ( roundingMode == float_round_to_zero ) {
325             roundIncrement = 0;
326         }
327         else {
328             roundIncrement = 0x7F;
329             if ( zSign ) {
330                 if ( roundingMode == float_round_up ) roundIncrement = 0;
331             }
332             else {
333                 if ( roundingMode == float_round_down ) roundIncrement = 0;
334             }
335         }
336     }
337     roundBits = zSig & 0x7F;
338     if ( 0xFD <= (uint16_t) zExp ) {
339         if (    ( 0xFD < zExp )
340              || (    ( zExp == 0xFD )
341                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
342            ) {
343             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
344             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
345         }
346         if ( zExp < 0 ) {
347             if (STATUS(flush_to_zero)) {
348                 float_raise(float_flag_output_denormal STATUS_VAR);
349                 return packFloat32(zSign, 0, 0);
350             }
351             isTiny =
352                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353                 || ( zExp < -1 )
354                 || ( zSig + roundIncrement < 0x80000000 );
355             shift32RightJamming( zSig, - zExp, &zSig );
356             zExp = 0;
357             roundBits = zSig & 0x7F;
358             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
359         }
360     }
361     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362     zSig = ( zSig + roundIncrement )>>7;
363     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364     if ( zSig == 0 ) zExp = 0;
365     return packFloat32( zSign, zExp, zSig );
366 
367 }
368 
369 /*----------------------------------------------------------------------------
370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 | and significand `zSig', and returns the proper single-precision floating-
372 | point value corresponding to the abstract input.  This routine is just like
373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375 | floating-point exponent.
376 *----------------------------------------------------------------------------*/
377 
378 static float32
379  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
380 {
381     int8 shiftCount;
382 
383     shiftCount = countLeadingZeros32( zSig ) - 1;
384     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
385 
386 }
387 
388 /*----------------------------------------------------------------------------
389 | Returns the fraction bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
391 
392 INLINE uint64_t extractFloat64Frac( float64 a )
393 {
394 
395     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
396 
397 }
398 
399 /*----------------------------------------------------------------------------
400 | Returns the exponent bits of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
402 
403 INLINE int16 extractFloat64Exp( float64 a )
404 {
405 
406     return ( float64_val(a)>>52 ) & 0x7FF;
407 
408 }
409 
410 /*----------------------------------------------------------------------------
411 | Returns the sign bit of the double-precision floating-point value `a'.
412 *----------------------------------------------------------------------------*/
413 
414 INLINE flag extractFloat64Sign( float64 a )
415 {
416 
417     return float64_val(a)>>63;
418 
419 }
420 
421 /*----------------------------------------------------------------------------
422 | If `a' is denormal and we are in flush-to-zero mode then set the
423 | input-denormal exception and return zero. Otherwise just return the value.
424 *----------------------------------------------------------------------------*/
425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
426 {
427     if (STATUS(flush_inputs_to_zero)) {
428         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429             float_raise(float_flag_input_denormal STATUS_VAR);
430             return make_float64(float64_val(a) & (1ULL << 63));
431         }
432     }
433     return a;
434 }
435 
436 /*----------------------------------------------------------------------------
437 | Normalizes the subnormal double-precision floating-point value represented
438 | by the denormalized significand `aSig'.  The normalized exponent and
439 | significand are stored at the locations pointed to by `zExpPtr' and
440 | `zSigPtr', respectively.
441 *----------------------------------------------------------------------------*/
442 
443 static void
444  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
445 {
446     int8 shiftCount;
447 
448     shiftCount = countLeadingZeros64( aSig ) - 11;
449     *zSigPtr = aSig<<shiftCount;
450     *zExpPtr = 1 - shiftCount;
451 
452 }
453 
454 /*----------------------------------------------------------------------------
455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456 | double-precision floating-point value, returning the result.  After being
457 | shifted into the proper positions, the three fields are simply added
458 | together to form the result.  This means that any integer portion of `zSig'
459 | will be added into the exponent.  Since a properly normalized significand
460 | will have an integer portion equal to 1, the `zExp' input should be 1 less
461 | than the desired result exponent whenever `zSig' is a complete, normalized
462 | significand.
463 *----------------------------------------------------------------------------*/
464 
465 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
466 {
467 
468     return make_float64(
469         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
470 
471 }
472 
473 /*----------------------------------------------------------------------------
474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475 | and significand `zSig', and returns the proper double-precision floating-
476 | point value corresponding to the abstract input.  Ordinarily, the abstract
477 | value is simply rounded and packed into the double-precision format, with
478 | the inexact exception raised if the abstract input cannot be represented
479 | exactly.  However, if the abstract value is too large, the overflow and
480 | inexact exceptions are raised and an infinity or maximal finite value is
481 | returned.  If the abstract value is too small, the input value is rounded
482 | to a subnormal number, and the underflow and inexact exceptions are raised
483 | if the abstract input cannot be represented exactly as a subnormal double-
484 | precision floating-point number.
485 |     The input significand `zSig' has its binary point between bits 62
486 | and 61, which is 10 bits to the left of the usual location.  This shifted
487 | significand must be normalized or smaller.  If `zSig' is not normalized,
488 | `zExp' must be 0; in that case, the result returned is a subnormal number,
489 | and it must not require rounding.  In the usual case that `zSig' is
490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491 | The handling of underflow and overflow follows the IEC/IEEE Standard for
492 | Binary Floating-Point Arithmetic.
493 *----------------------------------------------------------------------------*/
494 
495 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
496 {
497     int8 roundingMode;
498     flag roundNearestEven;
499     int16 roundIncrement, roundBits;
500     flag isTiny;
501 
502     roundingMode = STATUS(float_rounding_mode);
503     roundNearestEven = ( roundingMode == float_round_nearest_even );
504     roundIncrement = 0x200;
505     if ( ! roundNearestEven ) {
506         if ( roundingMode == float_round_to_zero ) {
507             roundIncrement = 0;
508         }
509         else {
510             roundIncrement = 0x3FF;
511             if ( zSign ) {
512                 if ( roundingMode == float_round_up ) roundIncrement = 0;
513             }
514             else {
515                 if ( roundingMode == float_round_down ) roundIncrement = 0;
516             }
517         }
518     }
519     roundBits = zSig & 0x3FF;
520     if ( 0x7FD <= (uint16_t) zExp ) {
521         if (    ( 0x7FD < zExp )
522              || (    ( zExp == 0x7FD )
523                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
524            ) {
525             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
526             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
527         }
528         if ( zExp < 0 ) {
529             if (STATUS(flush_to_zero)) {
530                 float_raise(float_flag_output_denormal STATUS_VAR);
531                 return packFloat64(zSign, 0, 0);
532             }
533             isTiny =
534                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535                 || ( zExp < -1 )
536                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537             shift64RightJamming( zSig, - zExp, &zSig );
538             zExp = 0;
539             roundBits = zSig & 0x3FF;
540             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
541         }
542     }
543     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544     zSig = ( zSig + roundIncrement )>>10;
545     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546     if ( zSig == 0 ) zExp = 0;
547     return packFloat64( zSign, zExp, zSig );
548 
549 }
550 
551 /*----------------------------------------------------------------------------
552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553 | and significand `zSig', and returns the proper double-precision floating-
554 | point value corresponding to the abstract input.  This routine is just like
555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557 | floating-point exponent.
558 *----------------------------------------------------------------------------*/
559 
560 static float64
561  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
562 {
563     int8 shiftCount;
564 
565     shiftCount = countLeadingZeros64( zSig ) - 1;
566     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
567 
568 }
569 
570 /*----------------------------------------------------------------------------
571 | Returns the fraction bits of the extended double-precision floating-point
572 | value `a'.
573 *----------------------------------------------------------------------------*/
574 
575 INLINE uint64_t extractFloatx80Frac( floatx80 a )
576 {
577 
578     return a.low;
579 
580 }
581 
582 /*----------------------------------------------------------------------------
583 | Returns the exponent bits of the extended double-precision floating-point
584 | value `a'.
585 *----------------------------------------------------------------------------*/
586 
587 INLINE int32 extractFloatx80Exp( floatx80 a )
588 {
589 
590     return a.high & 0x7FFF;
591 
592 }
593 
594 /*----------------------------------------------------------------------------
595 | Returns the sign bit of the extended double-precision floating-point value
596 | `a'.
597 *----------------------------------------------------------------------------*/
598 
599 INLINE flag extractFloatx80Sign( floatx80 a )
600 {
601 
602     return a.high>>15;
603 
604 }
605 
606 /*----------------------------------------------------------------------------
607 | Normalizes the subnormal extended double-precision floating-point value
608 | represented by the denormalized significand `aSig'.  The normalized exponent
609 | and significand are stored at the locations pointed to by `zExpPtr' and
610 | `zSigPtr', respectively.
611 *----------------------------------------------------------------------------*/
612 
613 static void
614  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
615 {
616     int8 shiftCount;
617 
618     shiftCount = countLeadingZeros64( aSig );
619     *zSigPtr = aSig<<shiftCount;
620     *zExpPtr = 1 - shiftCount;
621 
622 }
623 
624 /*----------------------------------------------------------------------------
625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626 | extended double-precision floating-point value, returning the result.
627 *----------------------------------------------------------------------------*/
628 
629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
630 {
631     floatx80 z;
632 
633     z.low = zSig;
634     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
635     return z;
636 
637 }
638 
639 /*----------------------------------------------------------------------------
640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
642 | and returns the proper extended double-precision floating-point value
643 | corresponding to the abstract input.  Ordinarily, the abstract value is
644 | rounded and packed into the extended double-precision format, with the
645 | inexact exception raised if the abstract input cannot be represented
646 | exactly.  However, if the abstract value is too large, the overflow and
647 | inexact exceptions are raised and an infinity or maximal finite value is
648 | returned.  If the abstract value is too small, the input value is rounded to
649 | a subnormal number, and the underflow and inexact exceptions are raised if
650 | the abstract input cannot be represented exactly as a subnormal extended
651 | double-precision floating-point number.
652 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
653 | number of bits as single or double precision, respectively.  Otherwise, the
654 | result is rounded to the full precision of the extended double-precision
655 | format.
656 |     The input significand must be normalized or smaller.  If the input
657 | significand is not normalized, `zExp' must be 0; in that case, the result
658 | returned is a subnormal number, and it must not require rounding.  The
659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660 | Floating-Point Arithmetic.
661 *----------------------------------------------------------------------------*/
662 
663 static floatx80
664  roundAndPackFloatx80(
665      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
666  STATUS_PARAM)
667 {
668     int8 roundingMode;
669     flag roundNearestEven, increment, isTiny;
670     int64 roundIncrement, roundMask, roundBits;
671 
672     roundingMode = STATUS(float_rounding_mode);
673     roundNearestEven = ( roundingMode == float_round_nearest_even );
674     if ( roundingPrecision == 80 ) goto precision80;
675     if ( roundingPrecision == 64 ) {
676         roundIncrement = LIT64( 0x0000000000000400 );
677         roundMask = LIT64( 0x00000000000007FF );
678     }
679     else if ( roundingPrecision == 32 ) {
680         roundIncrement = LIT64( 0x0000008000000000 );
681         roundMask = LIT64( 0x000000FFFFFFFFFF );
682     }
683     else {
684         goto precision80;
685     }
686     zSig0 |= ( zSig1 != 0 );
687     if ( ! roundNearestEven ) {
688         if ( roundingMode == float_round_to_zero ) {
689             roundIncrement = 0;
690         }
691         else {
692             roundIncrement = roundMask;
693             if ( zSign ) {
694                 if ( roundingMode == float_round_up ) roundIncrement = 0;
695             }
696             else {
697                 if ( roundingMode == float_round_down ) roundIncrement = 0;
698             }
699         }
700     }
701     roundBits = zSig0 & roundMask;
702     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
703         if (    ( 0x7FFE < zExp )
704              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
705            ) {
706             goto overflow;
707         }
708         if ( zExp <= 0 ) {
709             if (STATUS(flush_to_zero)) {
710                 float_raise(float_flag_output_denormal STATUS_VAR);
711                 return packFloatx80(zSign, 0, 0);
712             }
713             isTiny =
714                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715                 || ( zExp < 0 )
716                 || ( zSig0 <= zSig0 + roundIncrement );
717             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718             zExp = 0;
719             roundBits = zSig0 & roundMask;
720             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722             zSig0 += roundIncrement;
723             if ( (int64_t) zSig0 < 0 ) zExp = 1;
724             roundIncrement = roundMask + 1;
725             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726                 roundMask |= roundIncrement;
727             }
728             zSig0 &= ~ roundMask;
729             return packFloatx80( zSign, zExp, zSig0 );
730         }
731     }
732     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733     zSig0 += roundIncrement;
734     if ( zSig0 < roundIncrement ) {
735         ++zExp;
736         zSig0 = LIT64( 0x8000000000000000 );
737     }
738     roundIncrement = roundMask + 1;
739     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740         roundMask |= roundIncrement;
741     }
742     zSig0 &= ~ roundMask;
743     if ( zSig0 == 0 ) zExp = 0;
744     return packFloatx80( zSign, zExp, zSig0 );
745  precision80:
746     increment = ( (int64_t) zSig1 < 0 );
747     if ( ! roundNearestEven ) {
748         if ( roundingMode == float_round_to_zero ) {
749             increment = 0;
750         }
751         else {
752             if ( zSign ) {
753                 increment = ( roundingMode == float_round_down ) && zSig1;
754             }
755             else {
756                 increment = ( roundingMode == float_round_up ) && zSig1;
757             }
758         }
759     }
760     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
761         if (    ( 0x7FFE < zExp )
762              || (    ( zExp == 0x7FFE )
763                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764                   && increment
765                 )
766            ) {
767             roundMask = 0;
768  overflow:
769             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
770             if (    ( roundingMode == float_round_to_zero )
771                  || ( zSign && ( roundingMode == float_round_up ) )
772                  || ( ! zSign && ( roundingMode == float_round_down ) )
773                ) {
774                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
775             }
776             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
777         }
778         if ( zExp <= 0 ) {
779             isTiny =
780                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781                 || ( zExp < 0 )
782                 || ! increment
783                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785             zExp = 0;
786             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788             if ( roundNearestEven ) {
789                 increment = ( (int64_t) zSig1 < 0 );
790             }
791             else {
792                 if ( zSign ) {
793                     increment = ( roundingMode == float_round_down ) && zSig1;
794                 }
795                 else {
796                     increment = ( roundingMode == float_round_up ) && zSig1;
797                 }
798             }
799             if ( increment ) {
800                 ++zSig0;
801                 zSig0 &=
802                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
804             }
805             return packFloatx80( zSign, zExp, zSig0 );
806         }
807     }
808     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809     if ( increment ) {
810         ++zSig0;
811         if ( zSig0 == 0 ) {
812             ++zExp;
813             zSig0 = LIT64( 0x8000000000000000 );
814         }
815         else {
816             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
817         }
818     }
819     else {
820         if ( zSig0 == 0 ) zExp = 0;
821     }
822     return packFloatx80( zSign, zExp, zSig0 );
823 
824 }
825 
826 /*----------------------------------------------------------------------------
827 | Takes an abstract floating-point value having sign `zSign', exponent
828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829 | and returns the proper extended double-precision floating-point value
830 | corresponding to the abstract input.  This routine is just like
831 | `roundAndPackFloatx80' except that the input significand does not have to be
832 | normalized.
833 *----------------------------------------------------------------------------*/
834 
835 static floatx80
836  normalizeRoundAndPackFloatx80(
837      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
838  STATUS_PARAM)
839 {
840     int8 shiftCount;
841 
842     if ( zSig0 == 0 ) {
843         zSig0 = zSig1;
844         zSig1 = 0;
845         zExp -= 64;
846     }
847     shiftCount = countLeadingZeros64( zSig0 );
848     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849     zExp -= shiftCount;
850     return
851         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
852 
853 }
854 
855 /*----------------------------------------------------------------------------
856 | Returns the least-significant 64 fraction bits of the quadruple-precision
857 | floating-point value `a'.
858 *----------------------------------------------------------------------------*/
859 
860 INLINE uint64_t extractFloat128Frac1( float128 a )
861 {
862 
863     return a.low;
864 
865 }
866 
867 /*----------------------------------------------------------------------------
868 | Returns the most-significant 48 fraction bits of the quadruple-precision
869 | floating-point value `a'.
870 *----------------------------------------------------------------------------*/
871 
872 INLINE uint64_t extractFloat128Frac0( float128 a )
873 {
874 
875     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876 
877 }
878 
879 /*----------------------------------------------------------------------------
880 | Returns the exponent bits of the quadruple-precision floating-point value
881 | `a'.
882 *----------------------------------------------------------------------------*/
883 
884 INLINE int32 extractFloat128Exp( float128 a )
885 {
886 
887     return ( a.high>>48 ) & 0x7FFF;
888 
889 }
890 
891 /*----------------------------------------------------------------------------
892 | Returns the sign bit of the quadruple-precision floating-point value `a'.
893 *----------------------------------------------------------------------------*/
894 
895 INLINE flag extractFloat128Sign( float128 a )
896 {
897 
898     return a.high>>63;
899 
900 }
901 
902 /*----------------------------------------------------------------------------
903 | Normalizes the subnormal quadruple-precision floating-point value
904 | represented by the denormalized significand formed by the concatenation of
905 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
906 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
907 | significand are stored at the location pointed to by `zSig0Ptr', and the
908 | least significant 64 bits of the normalized significand are stored at the
909 | location pointed to by `zSig1Ptr'.
910 *----------------------------------------------------------------------------*/
911 
912 static void
913  normalizeFloat128Subnormal(
914      uint64_t aSig0,
915      uint64_t aSig1,
916      int32 *zExpPtr,
917      uint64_t *zSig0Ptr,
918      uint64_t *zSig1Ptr
919  )
920 {
921     int8 shiftCount;
922 
923     if ( aSig0 == 0 ) {
924         shiftCount = countLeadingZeros64( aSig1 ) - 15;
925         if ( shiftCount < 0 ) {
926             *zSig0Ptr = aSig1>>( - shiftCount );
927             *zSig1Ptr = aSig1<<( shiftCount & 63 );
928         }
929         else {
930             *zSig0Ptr = aSig1<<shiftCount;
931             *zSig1Ptr = 0;
932         }
933         *zExpPtr = - shiftCount - 63;
934     }
935     else {
936         shiftCount = countLeadingZeros64( aSig0 ) - 15;
937         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938         *zExpPtr = 1 - shiftCount;
939     }
940 
941 }
942 
943 /*----------------------------------------------------------------------------
944 | Packs the sign `zSign', the exponent `zExp', and the significand formed
945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946 | floating-point value, returning the result.  After being shifted into the
947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948 | added together to form the most significant 32 bits of the result.  This
949 | means that any integer portion of `zSig0' will be added into the exponent.
950 | Since a properly normalized significand will have an integer portion equal
951 | to 1, the `zExp' input should be 1 less than the desired result exponent
952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953 | significand.
954 *----------------------------------------------------------------------------*/
955 
956 INLINE float128
957  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
958 {
959     float128 z;
960 
961     z.low = zSig1;
962     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
963     return z;
964 
965 }
966 
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969 | and extended significand formed by the concatenation of `zSig0', `zSig1',
970 | and `zSig2', and returns the proper quadruple-precision floating-point value
971 | corresponding to the abstract input.  Ordinarily, the abstract value is
972 | simply rounded and packed into the quadruple-precision format, with the
973 | inexact exception raised if the abstract input cannot be represented
974 | exactly.  However, if the abstract value is too large, the overflow and
975 | inexact exceptions are raised and an infinity or maximal finite value is
976 | returned.  If the abstract value is too small, the input value is rounded to
977 | a subnormal number, and the underflow and inexact exceptions are raised if
978 | the abstract input cannot be represented exactly as a subnormal quadruple-
979 | precision floating-point number.
980 |     The input significand must be normalized or smaller.  If the input
981 | significand is not normalized, `zExp' must be 0; in that case, the result
982 | returned is a subnormal number, and it must not require rounding.  In the
983 | usual case that the input significand is normalized, `zExp' must be 1 less
984 | than the ``true'' floating-point exponent.  The handling of underflow and
985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986 *----------------------------------------------------------------------------*/
987 
988 static float128
989  roundAndPackFloat128(
990      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
991 {
992     int8 roundingMode;
993     flag roundNearestEven, increment, isTiny;
994 
995     roundingMode = STATUS(float_rounding_mode);
996     roundNearestEven = ( roundingMode == float_round_nearest_even );
997     increment = ( (int64_t) zSig2 < 0 );
998     if ( ! roundNearestEven ) {
999         if ( roundingMode == float_round_to_zero ) {
1000             increment = 0;
1001         }
1002         else {
1003             if ( zSign ) {
1004                 increment = ( roundingMode == float_round_down ) && zSig2;
1005             }
1006             else {
1007                 increment = ( roundingMode == float_round_up ) && zSig2;
1008             }
1009         }
1010     }
1011     if ( 0x7FFD <= (uint32_t) zExp ) {
1012         if (    ( 0x7FFD < zExp )
1013              || (    ( zExp == 0x7FFD )
1014                   && eq128(
1015                          LIT64( 0x0001FFFFFFFFFFFF ),
1016                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1017                          zSig0,
1018                          zSig1
1019                      )
1020                   && increment
1021                 )
1022            ) {
1023             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024             if (    ( roundingMode == float_round_to_zero )
1025                  || ( zSign && ( roundingMode == float_round_up ) )
1026                  || ( ! zSign && ( roundingMode == float_round_down ) )
1027                ) {
1028                 return
1029                     packFloat128(
1030                         zSign,
1031                         0x7FFE,
1032                         LIT64( 0x0000FFFFFFFFFFFF ),
1033                         LIT64( 0xFFFFFFFFFFFFFFFF )
1034                     );
1035             }
1036             return packFloat128( zSign, 0x7FFF, 0, 0 );
1037         }
1038         if ( zExp < 0 ) {
1039             if (STATUS(flush_to_zero)) {
1040                 float_raise(float_flag_output_denormal STATUS_VAR);
1041                 return packFloat128(zSign, 0, 0, 0);
1042             }
1043             isTiny =
1044                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045                 || ( zExp < -1 )
1046                 || ! increment
1047                 || lt128(
1048                        zSig0,
1049                        zSig1,
1050                        LIT64( 0x0001FFFFFFFFFFFF ),
1051                        LIT64( 0xFFFFFFFFFFFFFFFF )
1052                    );
1053             shift128ExtraRightJamming(
1054                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055             zExp = 0;
1056             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057             if ( roundNearestEven ) {
1058                 increment = ( (int64_t) zSig2 < 0 );
1059             }
1060             else {
1061                 if ( zSign ) {
1062                     increment = ( roundingMode == float_round_down ) && zSig2;
1063                 }
1064                 else {
1065                     increment = ( roundingMode == float_round_up ) && zSig2;
1066                 }
1067             }
1068         }
1069     }
1070     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071     if ( increment ) {
1072         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074     }
1075     else {
1076         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077     }
1078     return packFloat128( zSign, zExp, zSig0, zSig1 );
1079 
1080 }
1081 
1082 /*----------------------------------------------------------------------------
1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 | returns the proper quadruple-precision floating-point value corresponding
1086 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1087 | except that the input significand has fewer bits and does not have to be
1088 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 | point exponent.
1090 *----------------------------------------------------------------------------*/
1091 
1092 static float128
1093  normalizeRoundAndPackFloat128(
1094      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095 {
1096     int8 shiftCount;
1097     uint64_t zSig2;
1098 
1099     if ( zSig0 == 0 ) {
1100         zSig0 = zSig1;
1101         zSig1 = 0;
1102         zExp -= 64;
1103     }
1104     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105     if ( 0 <= shiftCount ) {
1106         zSig2 = 0;
1107         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108     }
1109     else {
1110         shift128ExtraRightJamming(
1111             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112     }
1113     zExp -= shiftCount;
1114     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115 
1116 }
1117 
1118 /*----------------------------------------------------------------------------
1119 | Returns the result of converting the 32-bit two's complement integer `a'
1120 | to the single-precision floating-point format.  The conversion is performed
1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122 *----------------------------------------------------------------------------*/
1123 
1124 float32 int32_to_float32( int32 a STATUS_PARAM )
1125 {
1126     flag zSign;
1127 
1128     if ( a == 0 ) return float32_zero;
1129     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130     zSign = ( a < 0 );
1131     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132 
1133 }
1134 
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 32-bit two's complement integer `a'
1137 | to the double-precision floating-point format.  The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140 
1141 float64 int32_to_float64( int32 a STATUS_PARAM )
1142 {
1143     flag zSign;
1144     uint32 absA;
1145     int8 shiftCount;
1146     uint64_t zSig;
1147 
1148     if ( a == 0 ) return float64_zero;
1149     zSign = ( a < 0 );
1150     absA = zSign ? - a : a;
1151     shiftCount = countLeadingZeros32( absA ) + 21;
1152     zSig = absA;
1153     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154 
1155 }
1156 
1157 /*----------------------------------------------------------------------------
1158 | Returns the result of converting the 32-bit two's complement integer `a'
1159 | to the extended double-precision floating-point format.  The conversion
1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161 | Arithmetic.
1162 *----------------------------------------------------------------------------*/
1163 
1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165 {
1166     flag zSign;
1167     uint32 absA;
1168     int8 shiftCount;
1169     uint64_t zSig;
1170 
1171     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172     zSign = ( a < 0 );
1173     absA = zSign ? - a : a;
1174     shiftCount = countLeadingZeros32( absA ) + 32;
1175     zSig = absA;
1176     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177 
1178 }
1179 
1180 /*----------------------------------------------------------------------------
1181 | Returns the result of converting the 32-bit two's complement integer `a' to
1182 | the quadruple-precision floating-point format.  The conversion is performed
1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 *----------------------------------------------------------------------------*/
1185 
1186 float128 int32_to_float128( int32 a STATUS_PARAM )
1187 {
1188     flag zSign;
1189     uint32 absA;
1190     int8 shiftCount;
1191     uint64_t zSig0;
1192 
1193     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194     zSign = ( a < 0 );
1195     absA = zSign ? - a : a;
1196     shiftCount = countLeadingZeros32( absA ) + 17;
1197     zSig0 = absA;
1198     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199 
1200 }
1201 
1202 /*----------------------------------------------------------------------------
1203 | Returns the result of converting the 64-bit two's complement integer `a'
1204 | to the single-precision floating-point format.  The conversion is performed
1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206 *----------------------------------------------------------------------------*/
1207 
1208 float32 int64_to_float32( int64 a STATUS_PARAM )
1209 {
1210     flag zSign;
1211     uint64 absA;
1212     int8 shiftCount;
1213 
1214     if ( a == 0 ) return float32_zero;
1215     zSign = ( a < 0 );
1216     absA = zSign ? - a : a;
1217     shiftCount = countLeadingZeros64( absA ) - 40;
1218     if ( 0 <= shiftCount ) {
1219         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220     }
1221     else {
1222         shiftCount += 7;
1223         if ( shiftCount < 0 ) {
1224             shift64RightJamming( absA, - shiftCount, &absA );
1225         }
1226         else {
1227             absA <<= shiftCount;
1228         }
1229         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230     }
1231 
1232 }
1233 
1234 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235 {
1236     int8 shiftCount;
1237 
1238     if ( a == 0 ) return float32_zero;
1239     shiftCount = countLeadingZeros64( a ) - 40;
1240     if ( 0 <= shiftCount ) {
1241         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1242     }
1243     else {
1244         shiftCount += 7;
1245         if ( shiftCount < 0 ) {
1246             shift64RightJamming( a, - shiftCount, &a );
1247         }
1248         else {
1249             a <<= shiftCount;
1250         }
1251         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1252     }
1253 }
1254 
1255 /*----------------------------------------------------------------------------
1256 | Returns the result of converting the 64-bit two's complement integer `a'
1257 | to the double-precision floating-point format.  The conversion is performed
1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 *----------------------------------------------------------------------------*/
1260 
1261 float64 int64_to_float64( int64 a STATUS_PARAM )
1262 {
1263     flag zSign;
1264 
1265     if ( a == 0 ) return float64_zero;
1266     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267         return packFloat64( 1, 0x43E, 0 );
1268     }
1269     zSign = ( a < 0 );
1270     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271 
1272 }
1273 
1274 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1275 {
1276     if ( a == 0 ) return float64_zero;
1277     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1278 
1279 }
1280 
1281 /*----------------------------------------------------------------------------
1282 | Returns the result of converting the 64-bit two's complement integer `a'
1283 | to the extended double-precision floating-point format.  The conversion
1284 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285 | Arithmetic.
1286 *----------------------------------------------------------------------------*/
1287 
1288 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1289 {
1290     flag zSign;
1291     uint64 absA;
1292     int8 shiftCount;
1293 
1294     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295     zSign = ( a < 0 );
1296     absA = zSign ? - a : a;
1297     shiftCount = countLeadingZeros64( absA );
1298     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1299 
1300 }
1301 
1302 /*----------------------------------------------------------------------------
1303 | Returns the result of converting the 64-bit two's complement integer `a' to
1304 | the quadruple-precision floating-point format.  The conversion is performed
1305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 *----------------------------------------------------------------------------*/
1307 
1308 float128 int64_to_float128( int64 a STATUS_PARAM )
1309 {
1310     flag zSign;
1311     uint64 absA;
1312     int8 shiftCount;
1313     int32 zExp;
1314     uint64_t zSig0, zSig1;
1315 
1316     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317     zSign = ( a < 0 );
1318     absA = zSign ? - a : a;
1319     shiftCount = countLeadingZeros64( absA ) + 49;
1320     zExp = 0x406E - shiftCount;
1321     if ( 64 <= shiftCount ) {
1322         zSig1 = 0;
1323         zSig0 = absA;
1324         shiftCount -= 64;
1325     }
1326     else {
1327         zSig1 = absA;
1328         zSig0 = 0;
1329     }
1330     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331     return packFloat128( zSign, zExp, zSig0, zSig1 );
1332 
1333 }
1334 
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the single-precision floating-point value
1337 | `a' to the 32-bit two's complement integer format.  The conversion is
1338 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1339 | Arithmetic---which means in particular that the conversion is rounded
1340 | according to the current rounding mode.  If `a' is a NaN, the largest
1341 | positive integer is returned.  Otherwise, if the conversion overflows, the
1342 | largest integer with the same sign as `a' is returned.
1343 *----------------------------------------------------------------------------*/
1344 
1345 int32 float32_to_int32( float32 a STATUS_PARAM )
1346 {
1347     flag aSign;
1348     int16 aExp, shiftCount;
1349     uint32_t aSig;
1350     uint64_t aSig64;
1351 
1352     a = float32_squash_input_denormal(a STATUS_VAR);
1353     aSig = extractFloat32Frac( a );
1354     aExp = extractFloat32Exp( a );
1355     aSign = extractFloat32Sign( a );
1356     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357     if ( aExp ) aSig |= 0x00800000;
1358     shiftCount = 0xAF - aExp;
1359     aSig64 = aSig;
1360     aSig64 <<= 32;
1361     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1363 
1364 }
1365 
1366 /*----------------------------------------------------------------------------
1367 | Returns the result of converting the single-precision floating-point value
1368 | `a' to the 32-bit two's complement integer format.  The conversion is
1369 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1370 | Arithmetic, except that the conversion is always rounded toward zero.
1371 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1372 | the conversion overflows, the largest integer with the same sign as `a' is
1373 | returned.
1374 *----------------------------------------------------------------------------*/
1375 
1376 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1377 {
1378     flag aSign;
1379     int16 aExp, shiftCount;
1380     uint32_t aSig;
1381     int32 z;
1382     a = float32_squash_input_denormal(a STATUS_VAR);
1383 
1384     aSig = extractFloat32Frac( a );
1385     aExp = extractFloat32Exp( a );
1386     aSign = extractFloat32Sign( a );
1387     shiftCount = aExp - 0x9E;
1388     if ( 0 <= shiftCount ) {
1389         if ( float32_val(a) != 0xCF000000 ) {
1390             float_raise( float_flag_invalid STATUS_VAR);
1391             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1392         }
1393         return (int32_t) 0x80000000;
1394     }
1395     else if ( aExp <= 0x7E ) {
1396         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397         return 0;
1398     }
1399     aSig = ( aSig | 0x00800000 )<<8;
1400     z = aSig>>( - shiftCount );
1401     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1402         STATUS(float_exception_flags) |= float_flag_inexact;
1403     }
1404     if ( aSign ) z = - z;
1405     return z;
1406 
1407 }
1408 
1409 /*----------------------------------------------------------------------------
1410 | Returns the result of converting the single-precision floating-point value
1411 | `a' to the 16-bit two's complement integer format.  The conversion is
1412 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1413 | Arithmetic, except that the conversion is always rounded toward zero.
1414 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1415 | the conversion overflows, the largest integer with the same sign as `a' is
1416 | returned.
1417 *----------------------------------------------------------------------------*/
1418 
1419 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1420 {
1421     flag aSign;
1422     int16 aExp, shiftCount;
1423     uint32_t aSig;
1424     int32 z;
1425 
1426     aSig = extractFloat32Frac( a );
1427     aExp = extractFloat32Exp( a );
1428     aSign = extractFloat32Sign( a );
1429     shiftCount = aExp - 0x8E;
1430     if ( 0 <= shiftCount ) {
1431         if ( float32_val(a) != 0xC7000000 ) {
1432             float_raise( float_flag_invalid STATUS_VAR);
1433             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434                 return 0x7FFF;
1435             }
1436         }
1437         return (int32_t) 0xffff8000;
1438     }
1439     else if ( aExp <= 0x7E ) {
1440         if ( aExp | aSig ) {
1441             STATUS(float_exception_flags) |= float_flag_inexact;
1442         }
1443         return 0;
1444     }
1445     shiftCount -= 0x10;
1446     aSig = ( aSig | 0x00800000 )<<8;
1447     z = aSig>>( - shiftCount );
1448     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1449         STATUS(float_exception_flags) |= float_flag_inexact;
1450     }
1451     if ( aSign ) {
1452         z = - z;
1453     }
1454     return z;
1455 
1456 }
1457 
1458 /*----------------------------------------------------------------------------
1459 | Returns the result of converting the single-precision floating-point value
1460 | `a' to the 64-bit two's complement integer format.  The conversion is
1461 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1462 | Arithmetic---which means in particular that the conversion is rounded
1463 | according to the current rounding mode.  If `a' is a NaN, the largest
1464 | positive integer is returned.  Otherwise, if the conversion overflows, the
1465 | largest integer with the same sign as `a' is returned.
1466 *----------------------------------------------------------------------------*/
1467 
1468 int64 float32_to_int64( float32 a STATUS_PARAM )
1469 {
1470     flag aSign;
1471     int16 aExp, shiftCount;
1472     uint32_t aSig;
1473     uint64_t aSig64, aSigExtra;
1474     a = float32_squash_input_denormal(a STATUS_VAR);
1475 
1476     aSig = extractFloat32Frac( a );
1477     aExp = extractFloat32Exp( a );
1478     aSign = extractFloat32Sign( a );
1479     shiftCount = 0xBE - aExp;
1480     if ( shiftCount < 0 ) {
1481         float_raise( float_flag_invalid STATUS_VAR);
1482         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483             return LIT64( 0x7FFFFFFFFFFFFFFF );
1484         }
1485         return (int64_t) LIT64( 0x8000000000000000 );
1486     }
1487     if ( aExp ) aSig |= 0x00800000;
1488     aSig64 = aSig;
1489     aSig64 <<= 40;
1490     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1492 
1493 }
1494 
1495 /*----------------------------------------------------------------------------
1496 | Returns the result of converting the single-precision floating-point value
1497 | `a' to the 64-bit two's complement integer format.  The conversion is
1498 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 | Arithmetic, except that the conversion is always rounded toward zero.  If
1500 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1501 | conversion overflows, the largest integer with the same sign as `a' is
1502 | returned.
1503 *----------------------------------------------------------------------------*/
1504 
1505 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1506 {
1507     flag aSign;
1508     int16 aExp, shiftCount;
1509     uint32_t aSig;
1510     uint64_t aSig64;
1511     int64 z;
1512     a = float32_squash_input_denormal(a STATUS_VAR);
1513 
1514     aSig = extractFloat32Frac( a );
1515     aExp = extractFloat32Exp( a );
1516     aSign = extractFloat32Sign( a );
1517     shiftCount = aExp - 0xBE;
1518     if ( 0 <= shiftCount ) {
1519         if ( float32_val(a) != 0xDF000000 ) {
1520             float_raise( float_flag_invalid STATUS_VAR);
1521             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1523             }
1524         }
1525         return (int64_t) LIT64( 0x8000000000000000 );
1526     }
1527     else if ( aExp <= 0x7E ) {
1528         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529         return 0;
1530     }
1531     aSig64 = aSig | 0x00800000;
1532     aSig64 <<= 40;
1533     z = aSig64>>( - shiftCount );
1534     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1535         STATUS(float_exception_flags) |= float_flag_inexact;
1536     }
1537     if ( aSign ) z = - z;
1538     return z;
1539 
1540 }
1541 
1542 /*----------------------------------------------------------------------------
1543 | Returns the result of converting the single-precision floating-point value
1544 | `a' to the double-precision floating-point format.  The conversion is
1545 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1546 | Arithmetic.
1547 *----------------------------------------------------------------------------*/
1548 
1549 float64 float32_to_float64( float32 a STATUS_PARAM )
1550 {
1551     flag aSign;
1552     int16 aExp;
1553     uint32_t aSig;
1554     a = float32_squash_input_denormal(a STATUS_VAR);
1555 
1556     aSig = extractFloat32Frac( a );
1557     aExp = extractFloat32Exp( a );
1558     aSign = extractFloat32Sign( a );
1559     if ( aExp == 0xFF ) {
1560         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1561         return packFloat64( aSign, 0x7FF, 0 );
1562     }
1563     if ( aExp == 0 ) {
1564         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566         --aExp;
1567     }
1568     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1569 
1570 }
1571 
1572 /*----------------------------------------------------------------------------
1573 | Returns the result of converting the single-precision floating-point value
1574 | `a' to the extended double-precision floating-point format.  The conversion
1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 | Arithmetic.
1577 *----------------------------------------------------------------------------*/
1578 
1579 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1580 {
1581     flag aSign;
1582     int16 aExp;
1583     uint32_t aSig;
1584 
1585     a = float32_squash_input_denormal(a STATUS_VAR);
1586     aSig = extractFloat32Frac( a );
1587     aExp = extractFloat32Exp( a );
1588     aSign = extractFloat32Sign( a );
1589     if ( aExp == 0xFF ) {
1590         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1591         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592     }
1593     if ( aExp == 0 ) {
1594         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596     }
1597     aSig |= 0x00800000;
1598     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1599 
1600 }
1601 
1602 /*----------------------------------------------------------------------------
1603 | Returns the result of converting the single-precision floating-point value
1604 | `a' to the double-precision floating-point format.  The conversion is
1605 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1606 | Arithmetic.
1607 *----------------------------------------------------------------------------*/
1608 
1609 float128 float32_to_float128( float32 a STATUS_PARAM )
1610 {
1611     flag aSign;
1612     int16 aExp;
1613     uint32_t aSig;
1614 
1615     a = float32_squash_input_denormal(a STATUS_VAR);
1616     aSig = extractFloat32Frac( a );
1617     aExp = extractFloat32Exp( a );
1618     aSign = extractFloat32Sign( a );
1619     if ( aExp == 0xFF ) {
1620         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1621         return packFloat128( aSign, 0x7FFF, 0, 0 );
1622     }
1623     if ( aExp == 0 ) {
1624         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626         --aExp;
1627     }
1628     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1629 
1630 }
1631 
1632 /*----------------------------------------------------------------------------
1633 | Rounds the single-precision floating-point value `a' to an integer, and
1634 | returns the result as a single-precision floating-point value.  The
1635 | operation is performed according to the IEC/IEEE Standard for Binary
1636 | Floating-Point Arithmetic.
1637 *----------------------------------------------------------------------------*/
1638 
1639 float32 float32_round_to_int( float32 a STATUS_PARAM)
1640 {
1641     flag aSign;
1642     int16 aExp;
1643     uint32_t lastBitMask, roundBitsMask;
1644     int8 roundingMode;
1645     uint32_t z;
1646     a = float32_squash_input_denormal(a STATUS_VAR);
1647 
1648     aExp = extractFloat32Exp( a );
1649     if ( 0x96 <= aExp ) {
1650         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651             return propagateFloat32NaN( a, a STATUS_VAR );
1652         }
1653         return a;
1654     }
1655     if ( aExp <= 0x7E ) {
1656         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1657         STATUS(float_exception_flags) |= float_flag_inexact;
1658         aSign = extractFloat32Sign( a );
1659         switch ( STATUS(float_rounding_mode) ) {
1660          case float_round_nearest_even:
1661             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662                 return packFloat32( aSign, 0x7F, 0 );
1663             }
1664             break;
1665          case float_round_down:
1666             return make_float32(aSign ? 0xBF800000 : 0);
1667          case float_round_up:
1668             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1669         }
1670         return packFloat32( aSign, 0, 0 );
1671     }
1672     lastBitMask = 1;
1673     lastBitMask <<= 0x96 - aExp;
1674     roundBitsMask = lastBitMask - 1;
1675     z = float32_val(a);
1676     roundingMode = STATUS(float_rounding_mode);
1677     if ( roundingMode == float_round_nearest_even ) {
1678         z += lastBitMask>>1;
1679         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680     }
1681     else if ( roundingMode != float_round_to_zero ) {
1682         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1683             z += roundBitsMask;
1684         }
1685     }
1686     z &= ~ roundBitsMask;
1687     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688     return make_float32(z);
1689 
1690 }
1691 
1692 /*----------------------------------------------------------------------------
1693 | Returns the result of adding the absolute values of the single-precision
1694 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1695 | before being returned.  `zSign' is ignored if the result is a NaN.
1696 | The addition is performed according to the IEC/IEEE Standard for Binary
1697 | Floating-Point Arithmetic.
1698 *----------------------------------------------------------------------------*/
1699 
1700 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1701 {
1702     int16 aExp, bExp, zExp;
1703     uint32_t aSig, bSig, zSig;
1704     int16 expDiff;
1705 
1706     aSig = extractFloat32Frac( a );
1707     aExp = extractFloat32Exp( a );
1708     bSig = extractFloat32Frac( b );
1709     bExp = extractFloat32Exp( b );
1710     expDiff = aExp - bExp;
1711     aSig <<= 6;
1712     bSig <<= 6;
1713     if ( 0 < expDiff ) {
1714         if ( aExp == 0xFF ) {
1715             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716             return a;
1717         }
1718         if ( bExp == 0 ) {
1719             --expDiff;
1720         }
1721         else {
1722             bSig |= 0x20000000;
1723         }
1724         shift32RightJamming( bSig, expDiff, &bSig );
1725         zExp = aExp;
1726     }
1727     else if ( expDiff < 0 ) {
1728         if ( bExp == 0xFF ) {
1729             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730             return packFloat32( zSign, 0xFF, 0 );
1731         }
1732         if ( aExp == 0 ) {
1733             ++expDiff;
1734         }
1735         else {
1736             aSig |= 0x20000000;
1737         }
1738         shift32RightJamming( aSig, - expDiff, &aSig );
1739         zExp = bExp;
1740     }
1741     else {
1742         if ( aExp == 0xFF ) {
1743             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744             return a;
1745         }
1746         if ( aExp == 0 ) {
1747             if (STATUS(flush_to_zero)) {
1748                 if (aSig | bSig) {
1749                     float_raise(float_flag_output_denormal STATUS_VAR);
1750                 }
1751                 return packFloat32(zSign, 0, 0);
1752             }
1753             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1754         }
1755         zSig = 0x40000000 + aSig + bSig;
1756         zExp = aExp;
1757         goto roundAndPack;
1758     }
1759     aSig |= 0x20000000;
1760     zSig = ( aSig + bSig )<<1;
1761     --zExp;
1762     if ( (int32_t) zSig < 0 ) {
1763         zSig = aSig + bSig;
1764         ++zExp;
1765     }
1766  roundAndPack:
1767     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1768 
1769 }
1770 
1771 /*----------------------------------------------------------------------------
1772 | Returns the result of subtracting the absolute values of the single-
1773 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1774 | difference is negated before being returned.  `zSign' is ignored if the
1775 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1776 | Standard for Binary Floating-Point Arithmetic.
1777 *----------------------------------------------------------------------------*/
1778 
1779 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1780 {
1781     int16 aExp, bExp, zExp;
1782     uint32_t aSig, bSig, zSig;
1783     int16 expDiff;
1784 
1785     aSig = extractFloat32Frac( a );
1786     aExp = extractFloat32Exp( a );
1787     bSig = extractFloat32Frac( b );
1788     bExp = extractFloat32Exp( b );
1789     expDiff = aExp - bExp;
1790     aSig <<= 7;
1791     bSig <<= 7;
1792     if ( 0 < expDiff ) goto aExpBigger;
1793     if ( expDiff < 0 ) goto bExpBigger;
1794     if ( aExp == 0xFF ) {
1795         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796         float_raise( float_flag_invalid STATUS_VAR);
1797         return float32_default_nan;
1798     }
1799     if ( aExp == 0 ) {
1800         aExp = 1;
1801         bExp = 1;
1802     }
1803     if ( bSig < aSig ) goto aBigger;
1804     if ( aSig < bSig ) goto bBigger;
1805     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806  bExpBigger:
1807     if ( bExp == 0xFF ) {
1808         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809         return packFloat32( zSign ^ 1, 0xFF, 0 );
1810     }
1811     if ( aExp == 0 ) {
1812         ++expDiff;
1813     }
1814     else {
1815         aSig |= 0x40000000;
1816     }
1817     shift32RightJamming( aSig, - expDiff, &aSig );
1818     bSig |= 0x40000000;
1819  bBigger:
1820     zSig = bSig - aSig;
1821     zExp = bExp;
1822     zSign ^= 1;
1823     goto normalizeRoundAndPack;
1824  aExpBigger:
1825     if ( aExp == 0xFF ) {
1826         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827         return a;
1828     }
1829     if ( bExp == 0 ) {
1830         --expDiff;
1831     }
1832     else {
1833         bSig |= 0x40000000;
1834     }
1835     shift32RightJamming( bSig, expDiff, &bSig );
1836     aSig |= 0x40000000;
1837  aBigger:
1838     zSig = aSig - bSig;
1839     zExp = aExp;
1840  normalizeRoundAndPack:
1841     --zExp;
1842     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843 
1844 }
1845 
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of adding the single-precision floating-point values `a'
1848 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1849 | Binary Floating-Point Arithmetic.
1850 *----------------------------------------------------------------------------*/
1851 
1852 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1853 {
1854     flag aSign, bSign;
1855     a = float32_squash_input_denormal(a STATUS_VAR);
1856     b = float32_squash_input_denormal(b STATUS_VAR);
1857 
1858     aSign = extractFloat32Sign( a );
1859     bSign = extractFloat32Sign( b );
1860     if ( aSign == bSign ) {
1861         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1862     }
1863     else {
1864         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865     }
1866 
1867 }
1868 
1869 /*----------------------------------------------------------------------------
1870 | Returns the result of subtracting the single-precision floating-point values
1871 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1872 | for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874 
1875 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1876 {
1877     flag aSign, bSign;
1878     a = float32_squash_input_denormal(a STATUS_VAR);
1879     b = float32_squash_input_denormal(b STATUS_VAR);
1880 
1881     aSign = extractFloat32Sign( a );
1882     bSign = extractFloat32Sign( b );
1883     if ( aSign == bSign ) {
1884         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1885     }
1886     else {
1887         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1888     }
1889 
1890 }
1891 
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of multiplying the single-precision floating-point values
1894 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1895 | for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1897 
1898 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1899 {
1900     flag aSign, bSign, zSign;
1901     int16 aExp, bExp, zExp;
1902     uint32_t aSig, bSig;
1903     uint64_t zSig64;
1904     uint32_t zSig;
1905 
1906     a = float32_squash_input_denormal(a STATUS_VAR);
1907     b = float32_squash_input_denormal(b STATUS_VAR);
1908 
1909     aSig = extractFloat32Frac( a );
1910     aExp = extractFloat32Exp( a );
1911     aSign = extractFloat32Sign( a );
1912     bSig = extractFloat32Frac( b );
1913     bExp = extractFloat32Exp( b );
1914     bSign = extractFloat32Sign( b );
1915     zSign = aSign ^ bSign;
1916     if ( aExp == 0xFF ) {
1917         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918             return propagateFloat32NaN( a, b STATUS_VAR );
1919         }
1920         if ( ( bExp | bSig ) == 0 ) {
1921             float_raise( float_flag_invalid STATUS_VAR);
1922             return float32_default_nan;
1923         }
1924         return packFloat32( zSign, 0xFF, 0 );
1925     }
1926     if ( bExp == 0xFF ) {
1927         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928         if ( ( aExp | aSig ) == 0 ) {
1929             float_raise( float_flag_invalid STATUS_VAR);
1930             return float32_default_nan;
1931         }
1932         return packFloat32( zSign, 0xFF, 0 );
1933     }
1934     if ( aExp == 0 ) {
1935         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937     }
1938     if ( bExp == 0 ) {
1939         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941     }
1942     zExp = aExp + bExp - 0x7F;
1943     aSig = ( aSig | 0x00800000 )<<7;
1944     bSig = ( bSig | 0x00800000 )<<8;
1945     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1946     zSig = zSig64;
1947     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1948         zSig <<= 1;
1949         --zExp;
1950     }
1951     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1952 
1953 }
1954 
1955 /*----------------------------------------------------------------------------
1956 | Returns the result of dividing the single-precision floating-point value `a'
1957 | by the corresponding value `b'.  The operation is performed according to the
1958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 *----------------------------------------------------------------------------*/
1960 
1961 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1962 {
1963     flag aSign, bSign, zSign;
1964     int16 aExp, bExp, zExp;
1965     uint32_t aSig, bSig, zSig;
1966     a = float32_squash_input_denormal(a STATUS_VAR);
1967     b = float32_squash_input_denormal(b STATUS_VAR);
1968 
1969     aSig = extractFloat32Frac( a );
1970     aExp = extractFloat32Exp( a );
1971     aSign = extractFloat32Sign( a );
1972     bSig = extractFloat32Frac( b );
1973     bExp = extractFloat32Exp( b );
1974     bSign = extractFloat32Sign( b );
1975     zSign = aSign ^ bSign;
1976     if ( aExp == 0xFF ) {
1977         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978         if ( bExp == 0xFF ) {
1979             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980             float_raise( float_flag_invalid STATUS_VAR);
1981             return float32_default_nan;
1982         }
1983         return packFloat32( zSign, 0xFF, 0 );
1984     }
1985     if ( bExp == 0xFF ) {
1986         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987         return packFloat32( zSign, 0, 0 );
1988     }
1989     if ( bExp == 0 ) {
1990         if ( bSig == 0 ) {
1991             if ( ( aExp | aSig ) == 0 ) {
1992                 float_raise( float_flag_invalid STATUS_VAR);
1993                 return float32_default_nan;
1994             }
1995             float_raise( float_flag_divbyzero STATUS_VAR);
1996             return packFloat32( zSign, 0xFF, 0 );
1997         }
1998         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999     }
2000     if ( aExp == 0 ) {
2001         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003     }
2004     zExp = aExp - bExp + 0x7D;
2005     aSig = ( aSig | 0x00800000 )<<7;
2006     bSig = ( bSig | 0x00800000 )<<8;
2007     if ( bSig <= ( aSig + aSig ) ) {
2008         aSig >>= 1;
2009         ++zExp;
2010     }
2011     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2012     if ( ( zSig & 0x3F ) == 0 ) {
2013         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2014     }
2015     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016 
2017 }
2018 
2019 /*----------------------------------------------------------------------------
2020 | Returns the remainder of the single-precision floating-point value `a'
2021 | with respect to the corresponding value `b'.  The operation is performed
2022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2024 
2025 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2026 {
2027     flag aSign, zSign;
2028     int16 aExp, bExp, expDiff;
2029     uint32_t aSig, bSig;
2030     uint32_t q;
2031     uint64_t aSig64, bSig64, q64;
2032     uint32_t alternateASig;
2033     int32_t sigMean;
2034     a = float32_squash_input_denormal(a STATUS_VAR);
2035     b = float32_squash_input_denormal(b STATUS_VAR);
2036 
2037     aSig = extractFloat32Frac( a );
2038     aExp = extractFloat32Exp( a );
2039     aSign = extractFloat32Sign( a );
2040     bSig = extractFloat32Frac( b );
2041     bExp = extractFloat32Exp( b );
2042     if ( aExp == 0xFF ) {
2043         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044             return propagateFloat32NaN( a, b STATUS_VAR );
2045         }
2046         float_raise( float_flag_invalid STATUS_VAR);
2047         return float32_default_nan;
2048     }
2049     if ( bExp == 0xFF ) {
2050         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051         return a;
2052     }
2053     if ( bExp == 0 ) {
2054         if ( bSig == 0 ) {
2055             float_raise( float_flag_invalid STATUS_VAR);
2056             return float32_default_nan;
2057         }
2058         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059     }
2060     if ( aExp == 0 ) {
2061         if ( aSig == 0 ) return a;
2062         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063     }
2064     expDiff = aExp - bExp;
2065     aSig |= 0x00800000;
2066     bSig |= 0x00800000;
2067     if ( expDiff < 32 ) {
2068         aSig <<= 8;
2069         bSig <<= 8;
2070         if ( expDiff < 0 ) {
2071             if ( expDiff < -1 ) return a;
2072             aSig >>= 1;
2073         }
2074         q = ( bSig <= aSig );
2075         if ( q ) aSig -= bSig;
2076         if ( 0 < expDiff ) {
2077             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2078             q >>= 32 - expDiff;
2079             bSig >>= 2;
2080             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081         }
2082         else {
2083             aSig >>= 2;
2084             bSig >>= 2;
2085         }
2086     }
2087     else {
2088         if ( bSig <= aSig ) aSig -= bSig;
2089         aSig64 = ( (uint64_t) aSig )<<40;
2090         bSig64 = ( (uint64_t) bSig )<<40;
2091         expDiff -= 64;
2092         while ( 0 < expDiff ) {
2093             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095             aSig64 = - ( ( bSig * q64 )<<38 );
2096             expDiff -= 62;
2097         }
2098         expDiff += 64;
2099         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101         q = q64>>( 64 - expDiff );
2102         bSig <<= 6;
2103         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104     }
2105     do {
2106         alternateASig = aSig;
2107         ++q;
2108         aSig -= bSig;
2109     } while ( 0 <= (int32_t) aSig );
2110     sigMean = aSig + alternateASig;
2111     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112         aSig = alternateASig;
2113     }
2114     zSign = ( (int32_t) aSig < 0 );
2115     if ( zSign ) aSig = - aSig;
2116     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2117 
2118 }
2119 
2120 /*----------------------------------------------------------------------------
2121 | Returns the result of multiplying the single-precision floating-point values
2122 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2123 | multiplication.  The operation is performed according to the IEC/IEEE
2124 | Standard for Binary Floating-Point Arithmetic 754-2008.
2125 | The flags argument allows the caller to select negation of the
2126 | addend, the intermediate product, or the final result. (The difference
2127 | between this and having the caller do a separate negation is that negating
2128 | externally will flip the sign bit on NaNs.)
2129 *----------------------------------------------------------------------------*/
2130 
2131 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2132 {
2133     flag aSign, bSign, cSign, zSign;
2134     int aExp, bExp, cExp, pExp, zExp, expDiff;
2135     uint32_t aSig, bSig, cSig;
2136     flag pInf, pZero, pSign;
2137     uint64_t pSig64, cSig64, zSig64;
2138     uint32_t pSig;
2139     int shiftcount;
2140     flag signflip, infzero;
2141 
2142     a = float32_squash_input_denormal(a STATUS_VAR);
2143     b = float32_squash_input_denormal(b STATUS_VAR);
2144     c = float32_squash_input_denormal(c STATUS_VAR);
2145     aSig = extractFloat32Frac(a);
2146     aExp = extractFloat32Exp(a);
2147     aSign = extractFloat32Sign(a);
2148     bSig = extractFloat32Frac(b);
2149     bExp = extractFloat32Exp(b);
2150     bSign = extractFloat32Sign(b);
2151     cSig = extractFloat32Frac(c);
2152     cExp = extractFloat32Exp(c);
2153     cSign = extractFloat32Sign(c);
2154 
2155     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2156                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2157 
2158     /* It is implementation-defined whether the cases of (0,inf,qnan)
2159      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2160      * they return if they do), so we have to hand this information
2161      * off to the target-specific pick-a-NaN routine.
2162      */
2163     if (((aExp == 0xff) && aSig) ||
2164         ((bExp == 0xff) && bSig) ||
2165         ((cExp == 0xff) && cSig)) {
2166         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2167     }
2168 
2169     if (infzero) {
2170         float_raise(float_flag_invalid STATUS_VAR);
2171         return float32_default_nan;
2172     }
2173 
2174     if (flags & float_muladd_negate_c) {
2175         cSign ^= 1;
2176     }
2177 
2178     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2179 
2180     /* Work out the sign and type of the product */
2181     pSign = aSign ^ bSign;
2182     if (flags & float_muladd_negate_product) {
2183         pSign ^= 1;
2184     }
2185     pInf = (aExp == 0xff) || (bExp == 0xff);
2186     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2187 
2188     if (cExp == 0xff) {
2189         if (pInf && (pSign ^ cSign)) {
2190             /* addition of opposite-signed infinities => InvalidOperation */
2191             float_raise(float_flag_invalid STATUS_VAR);
2192             return float32_default_nan;
2193         }
2194         /* Otherwise generate an infinity of the same sign */
2195         return packFloat32(cSign ^ signflip, 0xff, 0);
2196     }
2197 
2198     if (pInf) {
2199         return packFloat32(pSign ^ signflip, 0xff, 0);
2200     }
2201 
2202     if (pZero) {
2203         if (cExp == 0) {
2204             if (cSig == 0) {
2205                 /* Adding two exact zeroes */
2206                 if (pSign == cSign) {
2207                     zSign = pSign;
2208                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2209                     zSign = 1;
2210                 } else {
2211                     zSign = 0;
2212                 }
2213                 return packFloat32(zSign ^ signflip, 0, 0);
2214             }
2215             /* Exact zero plus a denorm */
2216             if (STATUS(flush_to_zero)) {
2217                 float_raise(float_flag_output_denormal STATUS_VAR);
2218                 return packFloat32(cSign ^ signflip, 0, 0);
2219             }
2220         }
2221         /* Zero plus something non-zero : just return the something */
2222         return c ^ (signflip << 31);
2223     }
2224 
2225     if (aExp == 0) {
2226         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2227     }
2228     if (bExp == 0) {
2229         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2230     }
2231 
2232     /* Calculate the actual result a * b + c */
2233 
2234     /* Multiply first; this is easy. */
2235     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2236      * because we want the true exponent, not the "one-less-than"
2237      * flavour that roundAndPackFloat32() takes.
2238      */
2239     pExp = aExp + bExp - 0x7e;
2240     aSig = (aSig | 0x00800000) << 7;
2241     bSig = (bSig | 0x00800000) << 8;
2242     pSig64 = (uint64_t)aSig * bSig;
2243     if ((int64_t)(pSig64 << 1) >= 0) {
2244         pSig64 <<= 1;
2245         pExp--;
2246     }
2247 
2248     zSign = pSign ^ signflip;
2249 
2250     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2251      * position 62.
2252      */
2253     if (cExp == 0) {
2254         if (!cSig) {
2255             /* Throw out the special case of c being an exact zero now */
2256             shift64RightJamming(pSig64, 32, &pSig64);
2257             pSig = pSig64;
2258             return roundAndPackFloat32(zSign, pExp - 1,
2259                                        pSig STATUS_VAR);
2260         }
2261         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2262     }
2263 
2264     cSig64 = (uint64_t)cSig << (62 - 23);
2265     cSig64 |= LIT64(0x4000000000000000);
2266     expDiff = pExp - cExp;
2267 
2268     if (pSign == cSign) {
2269         /* Addition */
2270         if (expDiff > 0) {
2271             /* scale c to match p */
2272             shift64RightJamming(cSig64, expDiff, &cSig64);
2273             zExp = pExp;
2274         } else if (expDiff < 0) {
2275             /* scale p to match c */
2276             shift64RightJamming(pSig64, -expDiff, &pSig64);
2277             zExp = cExp;
2278         } else {
2279             /* no scaling needed */
2280             zExp = cExp;
2281         }
2282         /* Add significands and make sure explicit bit ends up in posn 62 */
2283         zSig64 = pSig64 + cSig64;
2284         if ((int64_t)zSig64 < 0) {
2285             shift64RightJamming(zSig64, 1, &zSig64);
2286         } else {
2287             zExp--;
2288         }
2289     } else {
2290         /* Subtraction */
2291         if (expDiff > 0) {
2292             shift64RightJamming(cSig64, expDiff, &cSig64);
2293             zSig64 = pSig64 - cSig64;
2294             zExp = pExp;
2295         } else if (expDiff < 0) {
2296             shift64RightJamming(pSig64, -expDiff, &pSig64);
2297             zSig64 = cSig64 - pSig64;
2298             zExp = cExp;
2299             zSign ^= 1;
2300         } else {
2301             zExp = pExp;
2302             if (cSig64 < pSig64) {
2303                 zSig64 = pSig64 - cSig64;
2304             } else if (pSig64 < cSig64) {
2305                 zSig64 = cSig64 - pSig64;
2306                 zSign ^= 1;
2307             } else {
2308                 /* Exact zero */
2309                 zSign = signflip;
2310                 if (STATUS(float_rounding_mode) == float_round_down) {
2311                     zSign ^= 1;
2312                 }
2313                 return packFloat32(zSign, 0, 0);
2314             }
2315         }
2316         --zExp;
2317         /* Normalize to put the explicit bit back into bit 62. */
2318         shiftcount = countLeadingZeros64(zSig64) - 1;
2319         zSig64 <<= shiftcount;
2320         zExp -= shiftcount;
2321     }
2322     shift64RightJamming(zSig64, 32, &zSig64);
2323     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2324 }
2325 
2326 
2327 /*----------------------------------------------------------------------------
2328 | Returns the square root of the single-precision floating-point value `a'.
2329 | The operation is performed according to the IEC/IEEE Standard for Binary
2330 | Floating-Point Arithmetic.
2331 *----------------------------------------------------------------------------*/
2332 
2333 float32 float32_sqrt( float32 a STATUS_PARAM )
2334 {
2335     flag aSign;
2336     int16 aExp, zExp;
2337     uint32_t aSig, zSig;
2338     uint64_t rem, term;
2339     a = float32_squash_input_denormal(a STATUS_VAR);
2340 
2341     aSig = extractFloat32Frac( a );
2342     aExp = extractFloat32Exp( a );
2343     aSign = extractFloat32Sign( a );
2344     if ( aExp == 0xFF ) {
2345         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2346         if ( ! aSign ) return a;
2347         float_raise( float_flag_invalid STATUS_VAR);
2348         return float32_default_nan;
2349     }
2350     if ( aSign ) {
2351         if ( ( aExp | aSig ) == 0 ) return a;
2352         float_raise( float_flag_invalid STATUS_VAR);
2353         return float32_default_nan;
2354     }
2355     if ( aExp == 0 ) {
2356         if ( aSig == 0 ) return float32_zero;
2357         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2358     }
2359     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2360     aSig = ( aSig | 0x00800000 )<<8;
2361     zSig = estimateSqrt32( aExp, aSig ) + 2;
2362     if ( ( zSig & 0x7F ) <= 5 ) {
2363         if ( zSig < 2 ) {
2364             zSig = 0x7FFFFFFF;
2365             goto roundAndPack;
2366         }
2367         aSig >>= aExp & 1;
2368         term = ( (uint64_t) zSig ) * zSig;
2369         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2370         while ( (int64_t) rem < 0 ) {
2371             --zSig;
2372             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2373         }
2374         zSig |= ( rem != 0 );
2375     }
2376     shift32RightJamming( zSig, 1, &zSig );
2377  roundAndPack:
2378     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2379 
2380 }
2381 
2382 /*----------------------------------------------------------------------------
2383 | Returns the binary exponential of the single-precision floating-point value
2384 | `a'. The operation is performed according to the IEC/IEEE Standard for
2385 | Binary Floating-Point Arithmetic.
2386 |
2387 | Uses the following identities:
2388 |
2389 | 1. -------------------------------------------------------------------------
2390 |      x    x*ln(2)
2391 |     2  = e
2392 |
2393 | 2. -------------------------------------------------------------------------
2394 |                      2     3     4     5           n
2395 |      x        x     x     x     x     x           x
2396 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2397 |               1!    2!    3!    4!    5!          n!
2398 *----------------------------------------------------------------------------*/
2399 
2400 static const float64 float32_exp2_coefficients[15] =
2401 {
2402     const_float64( 0x3ff0000000000000ll ), /*  1 */
2403     const_float64( 0x3fe0000000000000ll ), /*  2 */
2404     const_float64( 0x3fc5555555555555ll ), /*  3 */
2405     const_float64( 0x3fa5555555555555ll ), /*  4 */
2406     const_float64( 0x3f81111111111111ll ), /*  5 */
2407     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2408     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2409     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2410     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2411     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2412     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2413     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2414     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2415     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2416     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2417 };
2418 
2419 float32 float32_exp2( float32 a STATUS_PARAM )
2420 {
2421     flag aSign;
2422     int16 aExp;
2423     uint32_t aSig;
2424     float64 r, x, xn;
2425     int i;
2426     a = float32_squash_input_denormal(a STATUS_VAR);
2427 
2428     aSig = extractFloat32Frac( a );
2429     aExp = extractFloat32Exp( a );
2430     aSign = extractFloat32Sign( a );
2431 
2432     if ( aExp == 0xFF) {
2433         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2434         return (aSign) ? float32_zero : a;
2435     }
2436     if (aExp == 0) {
2437         if (aSig == 0) return float32_one;
2438     }
2439 
2440     float_raise( float_flag_inexact STATUS_VAR);
2441 
2442     /* ******************************* */
2443     /* using float64 for approximation */
2444     /* ******************************* */
2445     x = float32_to_float64(a STATUS_VAR);
2446     x = float64_mul(x, float64_ln2 STATUS_VAR);
2447 
2448     xn = x;
2449     r = float64_one;
2450     for (i = 0 ; i < 15 ; i++) {
2451         float64 f;
2452 
2453         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2454         r = float64_add(r, f STATUS_VAR);
2455 
2456         xn = float64_mul(xn, x STATUS_VAR);
2457     }
2458 
2459     return float64_to_float32(r, status);
2460 }
2461 
2462 /*----------------------------------------------------------------------------
2463 | Returns the binary log of the single-precision floating-point value `a'.
2464 | The operation is performed according to the IEC/IEEE Standard for Binary
2465 | Floating-Point Arithmetic.
2466 *----------------------------------------------------------------------------*/
2467 float32 float32_log2( float32 a STATUS_PARAM )
2468 {
2469     flag aSign, zSign;
2470     int16 aExp;
2471     uint32_t aSig, zSig, i;
2472 
2473     a = float32_squash_input_denormal(a STATUS_VAR);
2474     aSig = extractFloat32Frac( a );
2475     aExp = extractFloat32Exp( a );
2476     aSign = extractFloat32Sign( a );
2477 
2478     if ( aExp == 0 ) {
2479         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2480         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2481     }
2482     if ( aSign ) {
2483         float_raise( float_flag_invalid STATUS_VAR);
2484         return float32_default_nan;
2485     }
2486     if ( aExp == 0xFF ) {
2487         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2488         return a;
2489     }
2490 
2491     aExp -= 0x7F;
2492     aSig |= 0x00800000;
2493     zSign = aExp < 0;
2494     zSig = aExp << 23;
2495 
2496     for (i = 1 << 22; i > 0; i >>= 1) {
2497         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2498         if ( aSig & 0x01000000 ) {
2499             aSig >>= 1;
2500             zSig |= i;
2501         }
2502     }
2503 
2504     if ( zSign )
2505         zSig = -zSig;
2506 
2507     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2508 }
2509 
2510 /*----------------------------------------------------------------------------
2511 | Returns 1 if the single-precision floating-point value `a' is equal to
2512 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2513 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2514 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2515 *----------------------------------------------------------------------------*/
2516 
2517 int float32_eq( float32 a, float32 b STATUS_PARAM )
2518 {
2519     uint32_t av, bv;
2520     a = float32_squash_input_denormal(a STATUS_VAR);
2521     b = float32_squash_input_denormal(b STATUS_VAR);
2522 
2523     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2524          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2525        ) {
2526         float_raise( float_flag_invalid STATUS_VAR);
2527         return 0;
2528     }
2529     av = float32_val(a);
2530     bv = float32_val(b);
2531     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2532 }
2533 
2534 /*----------------------------------------------------------------------------
2535 | Returns 1 if the single-precision floating-point value `a' is less than
2536 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2537 | exception is raised if either operand is a NaN.  The comparison is performed
2538 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2539 *----------------------------------------------------------------------------*/
2540 
2541 int float32_le( float32 a, float32 b STATUS_PARAM )
2542 {
2543     flag aSign, bSign;
2544     uint32_t av, bv;
2545     a = float32_squash_input_denormal(a STATUS_VAR);
2546     b = float32_squash_input_denormal(b STATUS_VAR);
2547 
2548     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2549          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2550        ) {
2551         float_raise( float_flag_invalid STATUS_VAR);
2552         return 0;
2553     }
2554     aSign = extractFloat32Sign( a );
2555     bSign = extractFloat32Sign( b );
2556     av = float32_val(a);
2557     bv = float32_val(b);
2558     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2559     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2560 
2561 }
2562 
2563 /*----------------------------------------------------------------------------
2564 | Returns 1 if the single-precision floating-point value `a' is less than
2565 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2566 | raised if either operand is a NaN.  The comparison is performed according
2567 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2568 *----------------------------------------------------------------------------*/
2569 
2570 int float32_lt( float32 a, float32 b STATUS_PARAM )
2571 {
2572     flag aSign, bSign;
2573     uint32_t av, bv;
2574     a = float32_squash_input_denormal(a STATUS_VAR);
2575     b = float32_squash_input_denormal(b STATUS_VAR);
2576 
2577     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2578          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2579        ) {
2580         float_raise( float_flag_invalid STATUS_VAR);
2581         return 0;
2582     }
2583     aSign = extractFloat32Sign( a );
2584     bSign = extractFloat32Sign( b );
2585     av = float32_val(a);
2586     bv = float32_val(b);
2587     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2588     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2589 
2590 }
2591 
2592 /*----------------------------------------------------------------------------
2593 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2594 | be compared, and 0 otherwise.  The invalid exception is raised if either
2595 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2596 | Standard for Binary Floating-Point Arithmetic.
2597 *----------------------------------------------------------------------------*/
2598 
2599 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2600 {
2601     a = float32_squash_input_denormal(a STATUS_VAR);
2602     b = float32_squash_input_denormal(b STATUS_VAR);
2603 
2604     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2605          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2606        ) {
2607         float_raise( float_flag_invalid STATUS_VAR);
2608         return 1;
2609     }
2610     return 0;
2611 }
2612 
2613 /*----------------------------------------------------------------------------
2614 | Returns 1 if the single-precision floating-point value `a' is equal to
2615 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2616 | exception.  The comparison is performed according to the IEC/IEEE Standard
2617 | for Binary Floating-Point Arithmetic.
2618 *----------------------------------------------------------------------------*/
2619 
2620 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2621 {
2622     a = float32_squash_input_denormal(a STATUS_VAR);
2623     b = float32_squash_input_denormal(b STATUS_VAR);
2624 
2625     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2626          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2627        ) {
2628         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2629             float_raise( float_flag_invalid STATUS_VAR);
2630         }
2631         return 0;
2632     }
2633     return ( float32_val(a) == float32_val(b) ) ||
2634             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2635 }
2636 
2637 /*----------------------------------------------------------------------------
2638 | Returns 1 if the single-precision floating-point value `a' is less than or
2639 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2640 | cause an exception.  Otherwise, the comparison is performed according to the
2641 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2642 *----------------------------------------------------------------------------*/
2643 
2644 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2645 {
2646     flag aSign, bSign;
2647     uint32_t av, bv;
2648     a = float32_squash_input_denormal(a STATUS_VAR);
2649     b = float32_squash_input_denormal(b STATUS_VAR);
2650 
2651     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2652          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2653        ) {
2654         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2655             float_raise( float_flag_invalid STATUS_VAR);
2656         }
2657         return 0;
2658     }
2659     aSign = extractFloat32Sign( a );
2660     bSign = extractFloat32Sign( b );
2661     av = float32_val(a);
2662     bv = float32_val(b);
2663     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2664     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2665 
2666 }
2667 
2668 /*----------------------------------------------------------------------------
2669 | Returns 1 if the single-precision floating-point value `a' is less than
2670 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2671 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2672 | Standard for Binary Floating-Point Arithmetic.
2673 *----------------------------------------------------------------------------*/
2674 
2675 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2676 {
2677     flag aSign, bSign;
2678     uint32_t av, bv;
2679     a = float32_squash_input_denormal(a STATUS_VAR);
2680     b = float32_squash_input_denormal(b STATUS_VAR);
2681 
2682     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2683          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2684        ) {
2685         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2686             float_raise( float_flag_invalid STATUS_VAR);
2687         }
2688         return 0;
2689     }
2690     aSign = extractFloat32Sign( a );
2691     bSign = extractFloat32Sign( b );
2692     av = float32_val(a);
2693     bv = float32_val(b);
2694     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2695     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2696 
2697 }
2698 
2699 /*----------------------------------------------------------------------------
2700 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2701 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2702 | comparison is performed according to the IEC/IEEE Standard for Binary
2703 | Floating-Point Arithmetic.
2704 *----------------------------------------------------------------------------*/
2705 
2706 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2707 {
2708     a = float32_squash_input_denormal(a STATUS_VAR);
2709     b = float32_squash_input_denormal(b STATUS_VAR);
2710 
2711     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2712          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2713        ) {
2714         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2715             float_raise( float_flag_invalid STATUS_VAR);
2716         }
2717         return 1;
2718     }
2719     return 0;
2720 }
2721 
2722 /*----------------------------------------------------------------------------
2723 | Returns the result of converting the double-precision floating-point value
2724 | `a' to the 32-bit two's complement integer format.  The conversion is
2725 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2726 | Arithmetic---which means in particular that the conversion is rounded
2727 | according to the current rounding mode.  If `a' is a NaN, the largest
2728 | positive integer is returned.  Otherwise, if the conversion overflows, the
2729 | largest integer with the same sign as `a' is returned.
2730 *----------------------------------------------------------------------------*/
2731 
2732 int32 float64_to_int32( float64 a STATUS_PARAM )
2733 {
2734     flag aSign;
2735     int16 aExp, shiftCount;
2736     uint64_t aSig;
2737     a = float64_squash_input_denormal(a STATUS_VAR);
2738 
2739     aSig = extractFloat64Frac( a );
2740     aExp = extractFloat64Exp( a );
2741     aSign = extractFloat64Sign( a );
2742     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2743     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2744     shiftCount = 0x42C - aExp;
2745     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2746     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2747 
2748 }
2749 
2750 /*----------------------------------------------------------------------------
2751 | Returns the result of converting the double-precision floating-point value
2752 | `a' to the 32-bit two's complement integer format.  The conversion is
2753 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2754 | Arithmetic, except that the conversion is always rounded toward zero.
2755 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2756 | the conversion overflows, the largest integer with the same sign as `a' is
2757 | returned.
2758 *----------------------------------------------------------------------------*/
2759 
2760 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2761 {
2762     flag aSign;
2763     int16 aExp, shiftCount;
2764     uint64_t aSig, savedASig;
2765     int32 z;
2766     a = float64_squash_input_denormal(a STATUS_VAR);
2767 
2768     aSig = extractFloat64Frac( a );
2769     aExp = extractFloat64Exp( a );
2770     aSign = extractFloat64Sign( a );
2771     if ( 0x41E < aExp ) {
2772         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2773         goto invalid;
2774     }
2775     else if ( aExp < 0x3FF ) {
2776         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2777         return 0;
2778     }
2779     aSig |= LIT64( 0x0010000000000000 );
2780     shiftCount = 0x433 - aExp;
2781     savedASig = aSig;
2782     aSig >>= shiftCount;
2783     z = aSig;
2784     if ( aSign ) z = - z;
2785     if ( ( z < 0 ) ^ aSign ) {
2786  invalid:
2787         float_raise( float_flag_invalid STATUS_VAR);
2788         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2789     }
2790     if ( ( aSig<<shiftCount ) != savedASig ) {
2791         STATUS(float_exception_flags) |= float_flag_inexact;
2792     }
2793     return z;
2794 
2795 }
2796 
2797 /*----------------------------------------------------------------------------
2798 | Returns the result of converting the double-precision floating-point value
2799 | `a' to the 16-bit two's complement integer format.  The conversion is
2800 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2801 | Arithmetic, except that the conversion is always rounded toward zero.
2802 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2803 | the conversion overflows, the largest integer with the same sign as `a' is
2804 | returned.
2805 *----------------------------------------------------------------------------*/
2806 
2807 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2808 {
2809     flag aSign;
2810     int16 aExp, shiftCount;
2811     uint64_t aSig, savedASig;
2812     int32 z;
2813 
2814     aSig = extractFloat64Frac( a );
2815     aExp = extractFloat64Exp( a );
2816     aSign = extractFloat64Sign( a );
2817     if ( 0x40E < aExp ) {
2818         if ( ( aExp == 0x7FF ) && aSig ) {
2819             aSign = 0;
2820         }
2821         goto invalid;
2822     }
2823     else if ( aExp < 0x3FF ) {
2824         if ( aExp || aSig ) {
2825             STATUS(float_exception_flags) |= float_flag_inexact;
2826         }
2827         return 0;
2828     }
2829     aSig |= LIT64( 0x0010000000000000 );
2830     shiftCount = 0x433 - aExp;
2831     savedASig = aSig;
2832     aSig >>= shiftCount;
2833     z = aSig;
2834     if ( aSign ) {
2835         z = - z;
2836     }
2837     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2838  invalid:
2839         float_raise( float_flag_invalid STATUS_VAR);
2840         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2841     }
2842     if ( ( aSig<<shiftCount ) != savedASig ) {
2843         STATUS(float_exception_flags) |= float_flag_inexact;
2844     }
2845     return z;
2846 }
2847 
2848 /*----------------------------------------------------------------------------
2849 | Returns the result of converting the double-precision floating-point value
2850 | `a' to the 64-bit two's complement integer format.  The conversion is
2851 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2852 | Arithmetic---which means in particular that the conversion is rounded
2853 | according to the current rounding mode.  If `a' is a NaN, the largest
2854 | positive integer is returned.  Otherwise, if the conversion overflows, the
2855 | largest integer with the same sign as `a' is returned.
2856 *----------------------------------------------------------------------------*/
2857 
2858 int64 float64_to_int64( float64 a STATUS_PARAM )
2859 {
2860     flag aSign;
2861     int16 aExp, shiftCount;
2862     uint64_t aSig, aSigExtra;
2863     a = float64_squash_input_denormal(a STATUS_VAR);
2864 
2865     aSig = extractFloat64Frac( a );
2866     aExp = extractFloat64Exp( a );
2867     aSign = extractFloat64Sign( a );
2868     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2869     shiftCount = 0x433 - aExp;
2870     if ( shiftCount <= 0 ) {
2871         if ( 0x43E < aExp ) {
2872             float_raise( float_flag_invalid STATUS_VAR);
2873             if (    ! aSign
2874                  || (    ( aExp == 0x7FF )
2875                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2876                ) {
2877                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2878             }
2879             return (int64_t) LIT64( 0x8000000000000000 );
2880         }
2881         aSigExtra = 0;
2882         aSig <<= - shiftCount;
2883     }
2884     else {
2885         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2886     }
2887     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2888 
2889 }
2890 
2891 /*----------------------------------------------------------------------------
2892 | Returns the result of converting the double-precision floating-point value
2893 | `a' to the 64-bit two's complement integer format.  The conversion is
2894 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2895 | Arithmetic, except that the conversion is always rounded toward zero.
2896 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2897 | the conversion overflows, the largest integer with the same sign as `a' is
2898 | returned.
2899 *----------------------------------------------------------------------------*/
2900 
2901 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2902 {
2903     flag aSign;
2904     int16 aExp, shiftCount;
2905     uint64_t aSig;
2906     int64 z;
2907     a = float64_squash_input_denormal(a STATUS_VAR);
2908 
2909     aSig = extractFloat64Frac( a );
2910     aExp = extractFloat64Exp( a );
2911     aSign = extractFloat64Sign( a );
2912     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2913     shiftCount = aExp - 0x433;
2914     if ( 0 <= shiftCount ) {
2915         if ( 0x43E <= aExp ) {
2916             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2917                 float_raise( float_flag_invalid STATUS_VAR);
2918                 if (    ! aSign
2919                      || (    ( aExp == 0x7FF )
2920                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2921                    ) {
2922                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2923                 }
2924             }
2925             return (int64_t) LIT64( 0x8000000000000000 );
2926         }
2927         z = aSig<<shiftCount;
2928     }
2929     else {
2930         if ( aExp < 0x3FE ) {
2931             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2932             return 0;
2933         }
2934         z = aSig>>( - shiftCount );
2935         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2936             STATUS(float_exception_flags) |= float_flag_inexact;
2937         }
2938     }
2939     if ( aSign ) z = - z;
2940     return z;
2941 
2942 }
2943 
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the single-precision floating-point format.  The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic.
2949 *----------------------------------------------------------------------------*/
2950 
2951 float32 float64_to_float32( float64 a STATUS_PARAM )
2952 {
2953     flag aSign;
2954     int16 aExp;
2955     uint64_t aSig;
2956     uint32_t zSig;
2957     a = float64_squash_input_denormal(a STATUS_VAR);
2958 
2959     aSig = extractFloat64Frac( a );
2960     aExp = extractFloat64Exp( a );
2961     aSign = extractFloat64Sign( a );
2962     if ( aExp == 0x7FF ) {
2963         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2964         return packFloat32( aSign, 0xFF, 0 );
2965     }
2966     shift64RightJamming( aSig, 22, &aSig );
2967     zSig = aSig;
2968     if ( aExp || zSig ) {
2969         zSig |= 0x40000000;
2970         aExp -= 0x381;
2971     }
2972     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2973 
2974 }
2975 
2976 
2977 /*----------------------------------------------------------------------------
2978 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2979 | half-precision floating-point value, returning the result.  After being
2980 | shifted into the proper positions, the three fields are simply added
2981 | together to form the result.  This means that any integer portion of `zSig'
2982 | will be added into the exponent.  Since a properly normalized significand
2983 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2984 | than the desired result exponent whenever `zSig' is a complete, normalized
2985 | significand.
2986 *----------------------------------------------------------------------------*/
2987 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2988 {
2989     return make_float16(
2990         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2991 }
2992 
2993 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2994    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2995 
2996 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2997 {
2998     flag aSign;
2999     int16 aExp;
3000     uint32_t aSig;
3001 
3002     aSign = extractFloat16Sign(a);
3003     aExp = extractFloat16Exp(a);
3004     aSig = extractFloat16Frac(a);
3005 
3006     if (aExp == 0x1f && ieee) {
3007         if (aSig) {
3008             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3009         }
3010         return packFloat32(aSign, 0xff, aSig << 13);
3011     }
3012     if (aExp == 0) {
3013         int8 shiftCount;
3014 
3015         if (aSig == 0) {
3016             return packFloat32(aSign, 0, 0);
3017         }
3018 
3019         shiftCount = countLeadingZeros32( aSig ) - 21;
3020         aSig = aSig << shiftCount;
3021         aExp = -shiftCount;
3022     }
3023     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3024 }
3025 
3026 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3027 {
3028     flag aSign;
3029     int16 aExp;
3030     uint32_t aSig;
3031     uint32_t mask;
3032     uint32_t increment;
3033     int8 roundingMode;
3034     a = float32_squash_input_denormal(a STATUS_VAR);
3035 
3036     aSig = extractFloat32Frac( a );
3037     aExp = extractFloat32Exp( a );
3038     aSign = extractFloat32Sign( a );
3039     if ( aExp == 0xFF ) {
3040         if (aSig) {
3041             /* Input is a NaN */
3042             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3043             if (!ieee) {
3044                 return packFloat16(aSign, 0, 0);
3045             }
3046             return r;
3047         }
3048         /* Infinity */
3049         if (!ieee) {
3050             float_raise(float_flag_invalid STATUS_VAR);
3051             return packFloat16(aSign, 0x1f, 0x3ff);
3052         }
3053         return packFloat16(aSign, 0x1f, 0);
3054     }
3055     if (aExp == 0 && aSig == 0) {
3056         return packFloat16(aSign, 0, 0);
3057     }
3058     /* Decimal point between bits 22 and 23.  */
3059     aSig |= 0x00800000;
3060     aExp -= 0x7f;
3061     if (aExp < -14) {
3062         mask = 0x00ffffff;
3063         if (aExp >= -24) {
3064             mask >>= 25 + aExp;
3065         }
3066     } else {
3067         mask = 0x00001fff;
3068     }
3069     if (aSig & mask) {
3070         float_raise( float_flag_underflow STATUS_VAR );
3071         roundingMode = STATUS(float_rounding_mode);
3072         switch (roundingMode) {
3073         case float_round_nearest_even:
3074             increment = (mask + 1) >> 1;
3075             if ((aSig & mask) == increment) {
3076                 increment = aSig & (increment << 1);
3077             }
3078             break;
3079         case float_round_up:
3080             increment = aSign ? 0 : mask;
3081             break;
3082         case float_round_down:
3083             increment = aSign ? mask : 0;
3084             break;
3085         default: /* round_to_zero */
3086             increment = 0;
3087             break;
3088         }
3089         aSig += increment;
3090         if (aSig >= 0x01000000) {
3091             aSig >>= 1;
3092             aExp++;
3093         }
3094     } else if (aExp < -14
3095           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3096         float_raise( float_flag_underflow STATUS_VAR);
3097     }
3098 
3099     if (ieee) {
3100         if (aExp > 15) {
3101             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3102             return packFloat16(aSign, 0x1f, 0);
3103         }
3104     } else {
3105         if (aExp > 16) {
3106             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3107             return packFloat16(aSign, 0x1f, 0x3ff);
3108         }
3109     }
3110     if (aExp < -24) {
3111         return packFloat16(aSign, 0, 0);
3112     }
3113     if (aExp < -14) {
3114         aSig >>= -14 - aExp;
3115         aExp = -14;
3116     }
3117     return packFloat16(aSign, aExp + 14, aSig >> 13);
3118 }
3119 
3120 /*----------------------------------------------------------------------------
3121 | Returns the result of converting the double-precision floating-point value
3122 | `a' to the extended double-precision floating-point format.  The conversion
3123 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3124 | Arithmetic.
3125 *----------------------------------------------------------------------------*/
3126 
3127 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3128 {
3129     flag aSign;
3130     int16 aExp;
3131     uint64_t aSig;
3132 
3133     a = float64_squash_input_denormal(a STATUS_VAR);
3134     aSig = extractFloat64Frac( a );
3135     aExp = extractFloat64Exp( a );
3136     aSign = extractFloat64Sign( a );
3137     if ( aExp == 0x7FF ) {
3138         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3139         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3140     }
3141     if ( aExp == 0 ) {
3142         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3143         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3144     }
3145     return
3146         packFloatx80(
3147             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3148 
3149 }
3150 
3151 /*----------------------------------------------------------------------------
3152 | Returns the result of converting the double-precision floating-point value
3153 | `a' to the quadruple-precision floating-point format.  The conversion is
3154 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3155 | Arithmetic.
3156 *----------------------------------------------------------------------------*/
3157 
3158 float128 float64_to_float128( float64 a STATUS_PARAM )
3159 {
3160     flag aSign;
3161     int16 aExp;
3162     uint64_t aSig, zSig0, zSig1;
3163 
3164     a = float64_squash_input_denormal(a STATUS_VAR);
3165     aSig = extractFloat64Frac( a );
3166     aExp = extractFloat64Exp( a );
3167     aSign = extractFloat64Sign( a );
3168     if ( aExp == 0x7FF ) {
3169         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3170         return packFloat128( aSign, 0x7FFF, 0, 0 );
3171     }
3172     if ( aExp == 0 ) {
3173         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3174         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3175         --aExp;
3176     }
3177     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3178     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3179 
3180 }
3181 
3182 /*----------------------------------------------------------------------------
3183 | Rounds the double-precision floating-point value `a' to an integer, and
3184 | returns the result as a double-precision floating-point value.  The
3185 | operation is performed according to the IEC/IEEE Standard for Binary
3186 | Floating-Point Arithmetic.
3187 *----------------------------------------------------------------------------*/
3188 
3189 float64 float64_round_to_int( float64 a STATUS_PARAM )
3190 {
3191     flag aSign;
3192     int16 aExp;
3193     uint64_t lastBitMask, roundBitsMask;
3194     int8 roundingMode;
3195     uint64_t z;
3196     a = float64_squash_input_denormal(a STATUS_VAR);
3197 
3198     aExp = extractFloat64Exp( a );
3199     if ( 0x433 <= aExp ) {
3200         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3201             return propagateFloat64NaN( a, a STATUS_VAR );
3202         }
3203         return a;
3204     }
3205     if ( aExp < 0x3FF ) {
3206         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3207         STATUS(float_exception_flags) |= float_flag_inexact;
3208         aSign = extractFloat64Sign( a );
3209         switch ( STATUS(float_rounding_mode) ) {
3210          case float_round_nearest_even:
3211             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3212                 return packFloat64( aSign, 0x3FF, 0 );
3213             }
3214             break;
3215          case float_round_down:
3216             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3217          case float_round_up:
3218             return make_float64(
3219             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3220         }
3221         return packFloat64( aSign, 0, 0 );
3222     }
3223     lastBitMask = 1;
3224     lastBitMask <<= 0x433 - aExp;
3225     roundBitsMask = lastBitMask - 1;
3226     z = float64_val(a);
3227     roundingMode = STATUS(float_rounding_mode);
3228     if ( roundingMode == float_round_nearest_even ) {
3229         z += lastBitMask>>1;
3230         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3231     }
3232     else if ( roundingMode != float_round_to_zero ) {
3233         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3234             z += roundBitsMask;
3235         }
3236     }
3237     z &= ~ roundBitsMask;
3238     if ( z != float64_val(a) )
3239         STATUS(float_exception_flags) |= float_flag_inexact;
3240     return make_float64(z);
3241 
3242 }
3243 
3244 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3245 {
3246     int oldmode;
3247     float64 res;
3248     oldmode = STATUS(float_rounding_mode);
3249     STATUS(float_rounding_mode) = float_round_to_zero;
3250     res = float64_round_to_int(a STATUS_VAR);
3251     STATUS(float_rounding_mode) = oldmode;
3252     return res;
3253 }
3254 
3255 /*----------------------------------------------------------------------------
3256 | Returns the result of adding the absolute values of the double-precision
3257 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3258 | before being returned.  `zSign' is ignored if the result is a NaN.
3259 | The addition is performed according to the IEC/IEEE Standard for Binary
3260 | Floating-Point Arithmetic.
3261 *----------------------------------------------------------------------------*/
3262 
3263 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3264 {
3265     int16 aExp, bExp, zExp;
3266     uint64_t aSig, bSig, zSig;
3267     int16 expDiff;
3268 
3269     aSig = extractFloat64Frac( a );
3270     aExp = extractFloat64Exp( a );
3271     bSig = extractFloat64Frac( b );
3272     bExp = extractFloat64Exp( b );
3273     expDiff = aExp - bExp;
3274     aSig <<= 9;
3275     bSig <<= 9;
3276     if ( 0 < expDiff ) {
3277         if ( aExp == 0x7FF ) {
3278             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3279             return a;
3280         }
3281         if ( bExp == 0 ) {
3282             --expDiff;
3283         }
3284         else {
3285             bSig |= LIT64( 0x2000000000000000 );
3286         }
3287         shift64RightJamming( bSig, expDiff, &bSig );
3288         zExp = aExp;
3289     }
3290     else if ( expDiff < 0 ) {
3291         if ( bExp == 0x7FF ) {
3292             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3293             return packFloat64( zSign, 0x7FF, 0 );
3294         }
3295         if ( aExp == 0 ) {
3296             ++expDiff;
3297         }
3298         else {
3299             aSig |= LIT64( 0x2000000000000000 );
3300         }
3301         shift64RightJamming( aSig, - expDiff, &aSig );
3302         zExp = bExp;
3303     }
3304     else {
3305         if ( aExp == 0x7FF ) {
3306             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307             return a;
3308         }
3309         if ( aExp == 0 ) {
3310             if (STATUS(flush_to_zero)) {
3311                 if (aSig | bSig) {
3312                     float_raise(float_flag_output_denormal STATUS_VAR);
3313                 }
3314                 return packFloat64(zSign, 0, 0);
3315             }
3316             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3317         }
3318         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3319         zExp = aExp;
3320         goto roundAndPack;
3321     }
3322     aSig |= LIT64( 0x2000000000000000 );
3323     zSig = ( aSig + bSig )<<1;
3324     --zExp;
3325     if ( (int64_t) zSig < 0 ) {
3326         zSig = aSig + bSig;
3327         ++zExp;
3328     }
3329  roundAndPack:
3330     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3331 
3332 }
3333 
3334 /*----------------------------------------------------------------------------
3335 | Returns the result of subtracting the absolute values of the double-
3336 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3337 | difference is negated before being returned.  `zSign' is ignored if the
3338 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3339 | Standard for Binary Floating-Point Arithmetic.
3340 *----------------------------------------------------------------------------*/
3341 
3342 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3343 {
3344     int16 aExp, bExp, zExp;
3345     uint64_t aSig, bSig, zSig;
3346     int16 expDiff;
3347 
3348     aSig = extractFloat64Frac( a );
3349     aExp = extractFloat64Exp( a );
3350     bSig = extractFloat64Frac( b );
3351     bExp = extractFloat64Exp( b );
3352     expDiff = aExp - bExp;
3353     aSig <<= 10;
3354     bSig <<= 10;
3355     if ( 0 < expDiff ) goto aExpBigger;
3356     if ( expDiff < 0 ) goto bExpBigger;
3357     if ( aExp == 0x7FF ) {
3358         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3359         float_raise( float_flag_invalid STATUS_VAR);
3360         return float64_default_nan;
3361     }
3362     if ( aExp == 0 ) {
3363         aExp = 1;
3364         bExp = 1;
3365     }
3366     if ( bSig < aSig ) goto aBigger;
3367     if ( aSig < bSig ) goto bBigger;
3368     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3369  bExpBigger:
3370     if ( bExp == 0x7FF ) {
3371         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3372         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3373     }
3374     if ( aExp == 0 ) {
3375         ++expDiff;
3376     }
3377     else {
3378         aSig |= LIT64( 0x4000000000000000 );
3379     }
3380     shift64RightJamming( aSig, - expDiff, &aSig );
3381     bSig |= LIT64( 0x4000000000000000 );
3382  bBigger:
3383     zSig = bSig - aSig;
3384     zExp = bExp;
3385     zSign ^= 1;
3386     goto normalizeRoundAndPack;
3387  aExpBigger:
3388     if ( aExp == 0x7FF ) {
3389         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3390         return a;
3391     }
3392     if ( bExp == 0 ) {
3393         --expDiff;
3394     }
3395     else {
3396         bSig |= LIT64( 0x4000000000000000 );
3397     }
3398     shift64RightJamming( bSig, expDiff, &bSig );
3399     aSig |= LIT64( 0x4000000000000000 );
3400  aBigger:
3401     zSig = aSig - bSig;
3402     zExp = aExp;
3403  normalizeRoundAndPack:
3404     --zExp;
3405     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3406 
3407 }
3408 
3409 /*----------------------------------------------------------------------------
3410 | Returns the result of adding the double-precision floating-point values `a'
3411 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3412 | Binary Floating-Point Arithmetic.
3413 *----------------------------------------------------------------------------*/
3414 
3415 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3416 {
3417     flag aSign, bSign;
3418     a = float64_squash_input_denormal(a STATUS_VAR);
3419     b = float64_squash_input_denormal(b STATUS_VAR);
3420 
3421     aSign = extractFloat64Sign( a );
3422     bSign = extractFloat64Sign( b );
3423     if ( aSign == bSign ) {
3424         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3425     }
3426     else {
3427         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3428     }
3429 
3430 }
3431 
3432 /*----------------------------------------------------------------------------
3433 | Returns the result of subtracting the double-precision floating-point values
3434 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3435 | for Binary Floating-Point Arithmetic.
3436 *----------------------------------------------------------------------------*/
3437 
3438 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3439 {
3440     flag aSign, bSign;
3441     a = float64_squash_input_denormal(a STATUS_VAR);
3442     b = float64_squash_input_denormal(b STATUS_VAR);
3443 
3444     aSign = extractFloat64Sign( a );
3445     bSign = extractFloat64Sign( b );
3446     if ( aSign == bSign ) {
3447         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3448     }
3449     else {
3450         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3451     }
3452 
3453 }
3454 
3455 /*----------------------------------------------------------------------------
3456 | Returns the result of multiplying the double-precision floating-point values
3457 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3458 | for Binary Floating-Point Arithmetic.
3459 *----------------------------------------------------------------------------*/
3460 
3461 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3462 {
3463     flag aSign, bSign, zSign;
3464     int16 aExp, bExp, zExp;
3465     uint64_t aSig, bSig, zSig0, zSig1;
3466 
3467     a = float64_squash_input_denormal(a STATUS_VAR);
3468     b = float64_squash_input_denormal(b STATUS_VAR);
3469 
3470     aSig = extractFloat64Frac( a );
3471     aExp = extractFloat64Exp( a );
3472     aSign = extractFloat64Sign( a );
3473     bSig = extractFloat64Frac( b );
3474     bExp = extractFloat64Exp( b );
3475     bSign = extractFloat64Sign( b );
3476     zSign = aSign ^ bSign;
3477     if ( aExp == 0x7FF ) {
3478         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3479             return propagateFloat64NaN( a, b STATUS_VAR );
3480         }
3481         if ( ( bExp | bSig ) == 0 ) {
3482             float_raise( float_flag_invalid STATUS_VAR);
3483             return float64_default_nan;
3484         }
3485         return packFloat64( zSign, 0x7FF, 0 );
3486     }
3487     if ( bExp == 0x7FF ) {
3488         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3489         if ( ( aExp | aSig ) == 0 ) {
3490             float_raise( float_flag_invalid STATUS_VAR);
3491             return float64_default_nan;
3492         }
3493         return packFloat64( zSign, 0x7FF, 0 );
3494     }
3495     if ( aExp == 0 ) {
3496         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3497         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3498     }
3499     if ( bExp == 0 ) {
3500         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3501         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3502     }
3503     zExp = aExp + bExp - 0x3FF;
3504     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3505     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3506     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3507     zSig0 |= ( zSig1 != 0 );
3508     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3509         zSig0 <<= 1;
3510         --zExp;
3511     }
3512     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3513 
3514 }
3515 
3516 /*----------------------------------------------------------------------------
3517 | Returns the result of dividing the double-precision floating-point value `a'
3518 | by the corresponding value `b'.  The operation is performed according to
3519 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3520 *----------------------------------------------------------------------------*/
3521 
3522 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3523 {
3524     flag aSign, bSign, zSign;
3525     int16 aExp, bExp, zExp;
3526     uint64_t aSig, bSig, zSig;
3527     uint64_t rem0, rem1;
3528     uint64_t term0, term1;
3529     a = float64_squash_input_denormal(a STATUS_VAR);
3530     b = float64_squash_input_denormal(b STATUS_VAR);
3531 
3532     aSig = extractFloat64Frac( a );
3533     aExp = extractFloat64Exp( a );
3534     aSign = extractFloat64Sign( a );
3535     bSig = extractFloat64Frac( b );
3536     bExp = extractFloat64Exp( b );
3537     bSign = extractFloat64Sign( b );
3538     zSign = aSign ^ bSign;
3539     if ( aExp == 0x7FF ) {
3540         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3541         if ( bExp == 0x7FF ) {
3542             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3543             float_raise( float_flag_invalid STATUS_VAR);
3544             return float64_default_nan;
3545         }
3546         return packFloat64( zSign, 0x7FF, 0 );
3547     }
3548     if ( bExp == 0x7FF ) {
3549         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3550         return packFloat64( zSign, 0, 0 );
3551     }
3552     if ( bExp == 0 ) {
3553         if ( bSig == 0 ) {
3554             if ( ( aExp | aSig ) == 0 ) {
3555                 float_raise( float_flag_invalid STATUS_VAR);
3556                 return float64_default_nan;
3557             }
3558             float_raise( float_flag_divbyzero STATUS_VAR);
3559             return packFloat64( zSign, 0x7FF, 0 );
3560         }
3561         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3562     }
3563     if ( aExp == 0 ) {
3564         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3565         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3566     }
3567     zExp = aExp - bExp + 0x3FD;
3568     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3569     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3570     if ( bSig <= ( aSig + aSig ) ) {
3571         aSig >>= 1;
3572         ++zExp;
3573     }
3574     zSig = estimateDiv128To64( aSig, 0, bSig );
3575     if ( ( zSig & 0x1FF ) <= 2 ) {
3576         mul64To128( bSig, zSig, &term0, &term1 );
3577         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3578         while ( (int64_t) rem0 < 0 ) {
3579             --zSig;
3580             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3581         }
3582         zSig |= ( rem1 != 0 );
3583     }
3584     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3585 
3586 }
3587 
3588 /*----------------------------------------------------------------------------
3589 | Returns the remainder of the double-precision floating-point value `a'
3590 | with respect to the corresponding value `b'.  The operation is performed
3591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3592 *----------------------------------------------------------------------------*/
3593 
3594 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3595 {
3596     flag aSign, zSign;
3597     int16 aExp, bExp, expDiff;
3598     uint64_t aSig, bSig;
3599     uint64_t q, alternateASig;
3600     int64_t sigMean;
3601 
3602     a = float64_squash_input_denormal(a STATUS_VAR);
3603     b = float64_squash_input_denormal(b STATUS_VAR);
3604     aSig = extractFloat64Frac( a );
3605     aExp = extractFloat64Exp( a );
3606     aSign = extractFloat64Sign( a );
3607     bSig = extractFloat64Frac( b );
3608     bExp = extractFloat64Exp( b );
3609     if ( aExp == 0x7FF ) {
3610         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3611             return propagateFloat64NaN( a, b STATUS_VAR );
3612         }
3613         float_raise( float_flag_invalid STATUS_VAR);
3614         return float64_default_nan;
3615     }
3616     if ( bExp == 0x7FF ) {
3617         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3618         return a;
3619     }
3620     if ( bExp == 0 ) {
3621         if ( bSig == 0 ) {
3622             float_raise( float_flag_invalid STATUS_VAR);
3623             return float64_default_nan;
3624         }
3625         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3626     }
3627     if ( aExp == 0 ) {
3628         if ( aSig == 0 ) return a;
3629         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3630     }
3631     expDiff = aExp - bExp;
3632     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3633     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3634     if ( expDiff < 0 ) {
3635         if ( expDiff < -1 ) return a;
3636         aSig >>= 1;
3637     }
3638     q = ( bSig <= aSig );
3639     if ( q ) aSig -= bSig;
3640     expDiff -= 64;
3641     while ( 0 < expDiff ) {
3642         q = estimateDiv128To64( aSig, 0, bSig );
3643         q = ( 2 < q ) ? q - 2 : 0;
3644         aSig = - ( ( bSig>>2 ) * q );
3645         expDiff -= 62;
3646     }
3647     expDiff += 64;
3648     if ( 0 < expDiff ) {
3649         q = estimateDiv128To64( aSig, 0, bSig );
3650         q = ( 2 < q ) ? q - 2 : 0;
3651         q >>= 64 - expDiff;
3652         bSig >>= 2;
3653         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3654     }
3655     else {
3656         aSig >>= 2;
3657         bSig >>= 2;
3658     }
3659     do {
3660         alternateASig = aSig;
3661         ++q;
3662         aSig -= bSig;
3663     } while ( 0 <= (int64_t) aSig );
3664     sigMean = aSig + alternateASig;
3665     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3666         aSig = alternateASig;
3667     }
3668     zSign = ( (int64_t) aSig < 0 );
3669     if ( zSign ) aSig = - aSig;
3670     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3671 
3672 }
3673 
3674 /*----------------------------------------------------------------------------
3675 | Returns the result of multiplying the double-precision floating-point values
3676 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3677 | multiplication.  The operation is performed according to the IEC/IEEE
3678 | Standard for Binary Floating-Point Arithmetic 754-2008.
3679 | The flags argument allows the caller to select negation of the
3680 | addend, the intermediate product, or the final result. (The difference
3681 | between this and having the caller do a separate negation is that negating
3682 | externally will flip the sign bit on NaNs.)
3683 *----------------------------------------------------------------------------*/
3684 
3685 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3686 {
3687     flag aSign, bSign, cSign, zSign;
3688     int aExp, bExp, cExp, pExp, zExp, expDiff;
3689     uint64_t aSig, bSig, cSig;
3690     flag pInf, pZero, pSign;
3691     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3692     int shiftcount;
3693     flag signflip, infzero;
3694 
3695     a = float64_squash_input_denormal(a STATUS_VAR);
3696     b = float64_squash_input_denormal(b STATUS_VAR);
3697     c = float64_squash_input_denormal(c STATUS_VAR);
3698     aSig = extractFloat64Frac(a);
3699     aExp = extractFloat64Exp(a);
3700     aSign = extractFloat64Sign(a);
3701     bSig = extractFloat64Frac(b);
3702     bExp = extractFloat64Exp(b);
3703     bSign = extractFloat64Sign(b);
3704     cSig = extractFloat64Frac(c);
3705     cExp = extractFloat64Exp(c);
3706     cSign = extractFloat64Sign(c);
3707 
3708     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3709                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3710 
3711     /* It is implementation-defined whether the cases of (0,inf,qnan)
3712      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3713      * they return if they do), so we have to hand this information
3714      * off to the target-specific pick-a-NaN routine.
3715      */
3716     if (((aExp == 0x7ff) && aSig) ||
3717         ((bExp == 0x7ff) && bSig) ||
3718         ((cExp == 0x7ff) && cSig)) {
3719         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3720     }
3721 
3722     if (infzero) {
3723         float_raise(float_flag_invalid STATUS_VAR);
3724         return float64_default_nan;
3725     }
3726 
3727     if (flags & float_muladd_negate_c) {
3728         cSign ^= 1;
3729     }
3730 
3731     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3732 
3733     /* Work out the sign and type of the product */
3734     pSign = aSign ^ bSign;
3735     if (flags & float_muladd_negate_product) {
3736         pSign ^= 1;
3737     }
3738     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3739     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3740 
3741     if (cExp == 0x7ff) {
3742         if (pInf && (pSign ^ cSign)) {
3743             /* addition of opposite-signed infinities => InvalidOperation */
3744             float_raise(float_flag_invalid STATUS_VAR);
3745             return float64_default_nan;
3746         }
3747         /* Otherwise generate an infinity of the same sign */
3748         return packFloat64(cSign ^ signflip, 0x7ff, 0);
3749     }
3750 
3751     if (pInf) {
3752         return packFloat64(pSign ^ signflip, 0x7ff, 0);
3753     }
3754 
3755     if (pZero) {
3756         if (cExp == 0) {
3757             if (cSig == 0) {
3758                 /* Adding two exact zeroes */
3759                 if (pSign == cSign) {
3760                     zSign = pSign;
3761                 } else if (STATUS(float_rounding_mode) == float_round_down) {
3762                     zSign = 1;
3763                 } else {
3764                     zSign = 0;
3765                 }
3766                 return packFloat64(zSign ^ signflip, 0, 0);
3767             }
3768             /* Exact zero plus a denorm */
3769             if (STATUS(flush_to_zero)) {
3770                 float_raise(float_flag_output_denormal STATUS_VAR);
3771                 return packFloat64(cSign ^ signflip, 0, 0);
3772             }
3773         }
3774         /* Zero plus something non-zero : just return the something */
3775         return c ^ ((uint64_t)signflip << 63);
3776     }
3777 
3778     if (aExp == 0) {
3779         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3780     }
3781     if (bExp == 0) {
3782         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3783     }
3784 
3785     /* Calculate the actual result a * b + c */
3786 
3787     /* Multiply first; this is easy. */
3788     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3789      * because we want the true exponent, not the "one-less-than"
3790      * flavour that roundAndPackFloat64() takes.
3791      */
3792     pExp = aExp + bExp - 0x3fe;
3793     aSig = (aSig | LIT64(0x0010000000000000))<<10;
3794     bSig = (bSig | LIT64(0x0010000000000000))<<11;
3795     mul64To128(aSig, bSig, &pSig0, &pSig1);
3796     if ((int64_t)(pSig0 << 1) >= 0) {
3797         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3798         pExp--;
3799     }
3800 
3801     zSign = pSign ^ signflip;
3802 
3803     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3804      * bit in position 126.
3805      */
3806     if (cExp == 0) {
3807         if (!cSig) {
3808             /* Throw out the special case of c being an exact zero now */
3809             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3810             return roundAndPackFloat64(zSign, pExp - 1,
3811                                        pSig1 STATUS_VAR);
3812         }
3813         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3814     }
3815 
3816     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3817      * significand of the addend, with the explicit bit in position 126.
3818      */
3819     cSig0 = cSig << (126 - 64 - 52);
3820     cSig1 = 0;
3821     cSig0 |= LIT64(0x4000000000000000);
3822     expDiff = pExp - cExp;
3823 
3824     if (pSign == cSign) {
3825         /* Addition */
3826         if (expDiff > 0) {
3827             /* scale c to match p */
3828             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3829             zExp = pExp;
3830         } else if (expDiff < 0) {
3831             /* scale p to match c */
3832             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3833             zExp = cExp;
3834         } else {
3835             /* no scaling needed */
3836             zExp = cExp;
3837         }
3838         /* Add significands and make sure explicit bit ends up in posn 126 */
3839         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3840         if ((int64_t)zSig0 < 0) {
3841             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3842         } else {
3843             zExp--;
3844         }
3845         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3846         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3847     } else {
3848         /* Subtraction */
3849         if (expDiff > 0) {
3850             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3851             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3852             zExp = pExp;
3853         } else if (expDiff < 0) {
3854             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3855             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3856             zExp = cExp;
3857             zSign ^= 1;
3858         } else {
3859             zExp = pExp;
3860             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3861                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3862             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3863                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3864                 zSign ^= 1;
3865             } else {
3866                 /* Exact zero */
3867                 zSign = signflip;
3868                 if (STATUS(float_rounding_mode) == float_round_down) {
3869                     zSign ^= 1;
3870                 }
3871                 return packFloat64(zSign, 0, 0);
3872             }
3873         }
3874         --zExp;
3875         /* Do the equivalent of normalizeRoundAndPackFloat64() but
3876          * starting with the significand in a pair of uint64_t.
3877          */
3878         if (zSig0) {
3879             shiftcount = countLeadingZeros64(zSig0) - 1;
3880             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3881             if (zSig1) {
3882                 zSig0 |= 1;
3883             }
3884             zExp -= shiftcount;
3885         } else {
3886             shiftcount = countLeadingZeros64(zSig1) - 1;
3887             zSig0 = zSig1 << shiftcount;
3888             zExp -= (shiftcount + 64);
3889         }
3890         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3891     }
3892 }
3893 
3894 /*----------------------------------------------------------------------------
3895 | Returns the square root of the double-precision floating-point value `a'.
3896 | The operation is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic.
3898 *----------------------------------------------------------------------------*/
3899 
3900 float64 float64_sqrt( float64 a STATUS_PARAM )
3901 {
3902     flag aSign;
3903     int16 aExp, zExp;
3904     uint64_t aSig, zSig, doubleZSig;
3905     uint64_t rem0, rem1, term0, term1;
3906     a = float64_squash_input_denormal(a STATUS_VAR);
3907 
3908     aSig = extractFloat64Frac( a );
3909     aExp = extractFloat64Exp( a );
3910     aSign = extractFloat64Sign( a );
3911     if ( aExp == 0x7FF ) {
3912         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3913         if ( ! aSign ) return a;
3914         float_raise( float_flag_invalid STATUS_VAR);
3915         return float64_default_nan;
3916     }
3917     if ( aSign ) {
3918         if ( ( aExp | aSig ) == 0 ) return a;
3919         float_raise( float_flag_invalid STATUS_VAR);
3920         return float64_default_nan;
3921     }
3922     if ( aExp == 0 ) {
3923         if ( aSig == 0 ) return float64_zero;
3924         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3925     }
3926     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3927     aSig |= LIT64( 0x0010000000000000 );
3928     zSig = estimateSqrt32( aExp, aSig>>21 );
3929     aSig <<= 9 - ( aExp & 1 );
3930     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3931     if ( ( zSig & 0x1FF ) <= 5 ) {
3932         doubleZSig = zSig<<1;
3933         mul64To128( zSig, zSig, &term0, &term1 );
3934         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3935         while ( (int64_t) rem0 < 0 ) {
3936             --zSig;
3937             doubleZSig -= 2;
3938             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3939         }
3940         zSig |= ( ( rem0 | rem1 ) != 0 );
3941     }
3942     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3943 
3944 }
3945 
3946 /*----------------------------------------------------------------------------
3947 | Returns the binary log of the double-precision floating-point value `a'.
3948 | The operation is performed according to the IEC/IEEE Standard for Binary
3949 | Floating-Point Arithmetic.
3950 *----------------------------------------------------------------------------*/
3951 float64 float64_log2( float64 a STATUS_PARAM )
3952 {
3953     flag aSign, zSign;
3954     int16 aExp;
3955     uint64_t aSig, aSig0, aSig1, zSig, i;
3956     a = float64_squash_input_denormal(a STATUS_VAR);
3957 
3958     aSig = extractFloat64Frac( a );
3959     aExp = extractFloat64Exp( a );
3960     aSign = extractFloat64Sign( a );
3961 
3962     if ( aExp == 0 ) {
3963         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3964         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3965     }
3966     if ( aSign ) {
3967         float_raise( float_flag_invalid STATUS_VAR);
3968         return float64_default_nan;
3969     }
3970     if ( aExp == 0x7FF ) {
3971         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3972         return a;
3973     }
3974 
3975     aExp -= 0x3FF;
3976     aSig |= LIT64( 0x0010000000000000 );
3977     zSign = aExp < 0;
3978     zSig = (uint64_t)aExp << 52;
3979     for (i = 1LL << 51; i > 0; i >>= 1) {
3980         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3981         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3982         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3983             aSig >>= 1;
3984             zSig |= i;
3985         }
3986     }
3987 
3988     if ( zSign )
3989         zSig = -zSig;
3990     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3991 }
3992 
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the double-precision floating-point value `a' is equal to the
3995 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3996 | if either operand is a NaN.  Otherwise, the comparison is performed
3997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999 
4000 int float64_eq( float64 a, float64 b STATUS_PARAM )
4001 {
4002     uint64_t av, bv;
4003     a = float64_squash_input_denormal(a STATUS_VAR);
4004     b = float64_squash_input_denormal(b STATUS_VAR);
4005 
4006     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4007          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4008        ) {
4009         float_raise( float_flag_invalid STATUS_VAR);
4010         return 0;
4011     }
4012     av = float64_val(a);
4013     bv = float64_val(b);
4014     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4015 
4016 }
4017 
4018 /*----------------------------------------------------------------------------
4019 | Returns 1 if the double-precision floating-point value `a' is less than or
4020 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4021 | exception is raised if either operand is a NaN.  The comparison is performed
4022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4023 *----------------------------------------------------------------------------*/
4024 
4025 int float64_le( float64 a, float64 b STATUS_PARAM )
4026 {
4027     flag aSign, bSign;
4028     uint64_t av, bv;
4029     a = float64_squash_input_denormal(a STATUS_VAR);
4030     b = float64_squash_input_denormal(b STATUS_VAR);
4031 
4032     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4033          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4034        ) {
4035         float_raise( float_flag_invalid STATUS_VAR);
4036         return 0;
4037     }
4038     aSign = extractFloat64Sign( a );
4039     bSign = extractFloat64Sign( b );
4040     av = float64_val(a);
4041     bv = float64_val(b);
4042     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4043     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4044 
4045 }
4046 
4047 /*----------------------------------------------------------------------------
4048 | Returns 1 if the double-precision floating-point value `a' is less than
4049 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4050 | raised if either operand is a NaN.  The comparison is performed according
4051 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052 *----------------------------------------------------------------------------*/
4053 
4054 int float64_lt( float64 a, float64 b STATUS_PARAM )
4055 {
4056     flag aSign, bSign;
4057     uint64_t av, bv;
4058 
4059     a = float64_squash_input_denormal(a STATUS_VAR);
4060     b = float64_squash_input_denormal(b STATUS_VAR);
4061     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4062          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4063        ) {
4064         float_raise( float_flag_invalid STATUS_VAR);
4065         return 0;
4066     }
4067     aSign = extractFloat64Sign( a );
4068     bSign = extractFloat64Sign( b );
4069     av = float64_val(a);
4070     bv = float64_val(b);
4071     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4072     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4073 
4074 }
4075 
4076 /*----------------------------------------------------------------------------
4077 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4078 | be compared, and 0 otherwise.  The invalid exception is raised if either
4079 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4080 | Standard for Binary Floating-Point Arithmetic.
4081 *----------------------------------------------------------------------------*/
4082 
4083 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4084 {
4085     a = float64_squash_input_denormal(a STATUS_VAR);
4086     b = float64_squash_input_denormal(b STATUS_VAR);
4087 
4088     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4089          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4090        ) {
4091         float_raise( float_flag_invalid STATUS_VAR);
4092         return 1;
4093     }
4094     return 0;
4095 }
4096 
4097 /*----------------------------------------------------------------------------
4098 | Returns 1 if the double-precision floating-point value `a' is equal to the
4099 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4100 | exception.The comparison is performed according to the IEC/IEEE Standard
4101 | for Binary Floating-Point Arithmetic.
4102 *----------------------------------------------------------------------------*/
4103 
4104 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4105 {
4106     uint64_t av, bv;
4107     a = float64_squash_input_denormal(a STATUS_VAR);
4108     b = float64_squash_input_denormal(b STATUS_VAR);
4109 
4110     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4111          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4112        ) {
4113         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4114             float_raise( float_flag_invalid STATUS_VAR);
4115         }
4116         return 0;
4117     }
4118     av = float64_val(a);
4119     bv = float64_val(b);
4120     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4121 
4122 }
4123 
4124 /*----------------------------------------------------------------------------
4125 | Returns 1 if the double-precision floating-point value `a' is less than or
4126 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4127 | cause an exception.  Otherwise, the comparison is performed according to the
4128 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4129 *----------------------------------------------------------------------------*/
4130 
4131 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4132 {
4133     flag aSign, bSign;
4134     uint64_t av, bv;
4135     a = float64_squash_input_denormal(a STATUS_VAR);
4136     b = float64_squash_input_denormal(b STATUS_VAR);
4137 
4138     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4139          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4140        ) {
4141         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4142             float_raise( float_flag_invalid STATUS_VAR);
4143         }
4144         return 0;
4145     }
4146     aSign = extractFloat64Sign( a );
4147     bSign = extractFloat64Sign( b );
4148     av = float64_val(a);
4149     bv = float64_val(b);
4150     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4151     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4152 
4153 }
4154 
4155 /*----------------------------------------------------------------------------
4156 | Returns 1 if the double-precision floating-point value `a' is less than
4157 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4158 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4159 | Standard for Binary Floating-Point Arithmetic.
4160 *----------------------------------------------------------------------------*/
4161 
4162 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4163 {
4164     flag aSign, bSign;
4165     uint64_t av, bv;
4166     a = float64_squash_input_denormal(a STATUS_VAR);
4167     b = float64_squash_input_denormal(b STATUS_VAR);
4168 
4169     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4170          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4171        ) {
4172         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4173             float_raise( float_flag_invalid STATUS_VAR);
4174         }
4175         return 0;
4176     }
4177     aSign = extractFloat64Sign( a );
4178     bSign = extractFloat64Sign( b );
4179     av = float64_val(a);
4180     bv = float64_val(b);
4181     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4182     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4183 
4184 }
4185 
4186 /*----------------------------------------------------------------------------
4187 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4188 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4189 | comparison is performed according to the IEC/IEEE Standard for Binary
4190 | Floating-Point Arithmetic.
4191 *----------------------------------------------------------------------------*/
4192 
4193 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4194 {
4195     a = float64_squash_input_denormal(a STATUS_VAR);
4196     b = float64_squash_input_denormal(b STATUS_VAR);
4197 
4198     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4199          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4200        ) {
4201         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4202             float_raise( float_flag_invalid STATUS_VAR);
4203         }
4204         return 1;
4205     }
4206     return 0;
4207 }
4208 
4209 /*----------------------------------------------------------------------------
4210 | Returns the result of converting the extended double-precision floating-
4211 | point value `a' to the 32-bit two's complement integer format.  The
4212 | conversion is performed according to the IEC/IEEE Standard for Binary
4213 | Floating-Point Arithmetic---which means in particular that the conversion
4214 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4215 | largest positive integer is returned.  Otherwise, if the conversion
4216 | overflows, the largest integer with the same sign as `a' is returned.
4217 *----------------------------------------------------------------------------*/
4218 
4219 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4220 {
4221     flag aSign;
4222     int32 aExp, shiftCount;
4223     uint64_t aSig;
4224 
4225     aSig = extractFloatx80Frac( a );
4226     aExp = extractFloatx80Exp( a );
4227     aSign = extractFloatx80Sign( a );
4228     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4229     shiftCount = 0x4037 - aExp;
4230     if ( shiftCount <= 0 ) shiftCount = 1;
4231     shift64RightJamming( aSig, shiftCount, &aSig );
4232     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4233 
4234 }
4235 
4236 /*----------------------------------------------------------------------------
4237 | Returns the result of converting the extended double-precision floating-
4238 | point value `a' to the 32-bit two's complement integer format.  The
4239 | conversion is performed according to the IEC/IEEE Standard for Binary
4240 | Floating-Point Arithmetic, except that the conversion is always rounded
4241 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4242 | Otherwise, if the conversion overflows, the largest integer with the same
4243 | sign as `a' is returned.
4244 *----------------------------------------------------------------------------*/
4245 
4246 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4247 {
4248     flag aSign;
4249     int32 aExp, shiftCount;
4250     uint64_t aSig, savedASig;
4251     int32 z;
4252 
4253     aSig = extractFloatx80Frac( a );
4254     aExp = extractFloatx80Exp( a );
4255     aSign = extractFloatx80Sign( a );
4256     if ( 0x401E < aExp ) {
4257         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4258         goto invalid;
4259     }
4260     else if ( aExp < 0x3FFF ) {
4261         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4262         return 0;
4263     }
4264     shiftCount = 0x403E - aExp;
4265     savedASig = aSig;
4266     aSig >>= shiftCount;
4267     z = aSig;
4268     if ( aSign ) z = - z;
4269     if ( ( z < 0 ) ^ aSign ) {
4270  invalid:
4271         float_raise( float_flag_invalid STATUS_VAR);
4272         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4273     }
4274     if ( ( aSig<<shiftCount ) != savedASig ) {
4275         STATUS(float_exception_flags) |= float_flag_inexact;
4276     }
4277     return z;
4278 
4279 }
4280 
4281 /*----------------------------------------------------------------------------
4282 | Returns the result of converting the extended double-precision floating-
4283 | point value `a' to the 64-bit two's complement integer format.  The
4284 | conversion is performed according to the IEC/IEEE Standard for Binary
4285 | Floating-Point Arithmetic---which means in particular that the conversion
4286 | is rounded according to the current rounding mode.  If `a' is a NaN,
4287 | the largest positive integer is returned.  Otherwise, if the conversion
4288 | overflows, the largest integer with the same sign as `a' is returned.
4289 *----------------------------------------------------------------------------*/
4290 
4291 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4292 {
4293     flag aSign;
4294     int32 aExp, shiftCount;
4295     uint64_t aSig, aSigExtra;
4296 
4297     aSig = extractFloatx80Frac( a );
4298     aExp = extractFloatx80Exp( a );
4299     aSign = extractFloatx80Sign( a );
4300     shiftCount = 0x403E - aExp;
4301     if ( shiftCount <= 0 ) {
4302         if ( shiftCount ) {
4303             float_raise( float_flag_invalid STATUS_VAR);
4304             if (    ! aSign
4305                  || (    ( aExp == 0x7FFF )
4306                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4307                ) {
4308                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4309             }
4310             return (int64_t) LIT64( 0x8000000000000000 );
4311         }
4312         aSigExtra = 0;
4313     }
4314     else {
4315         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4316     }
4317     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4318 
4319 }
4320 
4321 /*----------------------------------------------------------------------------
4322 | Returns the result of converting the extended double-precision floating-
4323 | point value `a' to the 64-bit two's complement integer format.  The
4324 | conversion is performed according to the IEC/IEEE Standard for Binary
4325 | Floating-Point Arithmetic, except that the conversion is always rounded
4326 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4327 | Otherwise, if the conversion overflows, the largest integer with the same
4328 | sign as `a' is returned.
4329 *----------------------------------------------------------------------------*/
4330 
4331 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4332 {
4333     flag aSign;
4334     int32 aExp, shiftCount;
4335     uint64_t aSig;
4336     int64 z;
4337 
4338     aSig = extractFloatx80Frac( a );
4339     aExp = extractFloatx80Exp( a );
4340     aSign = extractFloatx80Sign( a );
4341     shiftCount = aExp - 0x403E;
4342     if ( 0 <= shiftCount ) {
4343         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4344         if ( ( a.high != 0xC03E ) || aSig ) {
4345             float_raise( float_flag_invalid STATUS_VAR);
4346             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4347                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4348             }
4349         }
4350         return (int64_t) LIT64( 0x8000000000000000 );
4351     }
4352     else if ( aExp < 0x3FFF ) {
4353         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4354         return 0;
4355     }
4356     z = aSig>>( - shiftCount );
4357     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4358         STATUS(float_exception_flags) |= float_flag_inexact;
4359     }
4360     if ( aSign ) z = - z;
4361     return z;
4362 
4363 }
4364 
4365 /*----------------------------------------------------------------------------
4366 | Returns the result of converting the extended double-precision floating-
4367 | point value `a' to the single-precision floating-point format.  The
4368 | conversion is performed according to the IEC/IEEE Standard for Binary
4369 | Floating-Point Arithmetic.
4370 *----------------------------------------------------------------------------*/
4371 
4372 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4373 {
4374     flag aSign;
4375     int32 aExp;
4376     uint64_t aSig;
4377 
4378     aSig = extractFloatx80Frac( a );
4379     aExp = extractFloatx80Exp( a );
4380     aSign = extractFloatx80Sign( a );
4381     if ( aExp == 0x7FFF ) {
4382         if ( (uint64_t) ( aSig<<1 ) ) {
4383             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4384         }
4385         return packFloat32( aSign, 0xFF, 0 );
4386     }
4387     shift64RightJamming( aSig, 33, &aSig );
4388     if ( aExp || aSig ) aExp -= 0x3F81;
4389     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4390 
4391 }
4392 
4393 /*----------------------------------------------------------------------------
4394 | Returns the result of converting the extended double-precision floating-
4395 | point value `a' to the double-precision floating-point format.  The
4396 | conversion is performed according to the IEC/IEEE Standard for Binary
4397 | Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4399 
4400 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4401 {
4402     flag aSign;
4403     int32 aExp;
4404     uint64_t aSig, zSig;
4405 
4406     aSig = extractFloatx80Frac( a );
4407     aExp = extractFloatx80Exp( a );
4408     aSign = extractFloatx80Sign( a );
4409     if ( aExp == 0x7FFF ) {
4410         if ( (uint64_t) ( aSig<<1 ) ) {
4411             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4412         }
4413         return packFloat64( aSign, 0x7FF, 0 );
4414     }
4415     shift64RightJamming( aSig, 1, &zSig );
4416     if ( aExp || aSig ) aExp -= 0x3C01;
4417     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4418 
4419 }
4420 
4421 /*----------------------------------------------------------------------------
4422 | Returns the result of converting the extended double-precision floating-
4423 | point value `a' to the quadruple-precision floating-point format.  The
4424 | conversion is performed according to the IEC/IEEE Standard for Binary
4425 | Floating-Point Arithmetic.
4426 *----------------------------------------------------------------------------*/
4427 
4428 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4429 {
4430     flag aSign;
4431     int16 aExp;
4432     uint64_t aSig, zSig0, zSig1;
4433 
4434     aSig = extractFloatx80Frac( a );
4435     aExp = extractFloatx80Exp( a );
4436     aSign = extractFloatx80Sign( a );
4437     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4438         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4439     }
4440     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4441     return packFloat128( aSign, aExp, zSig0, zSig1 );
4442 
4443 }
4444 
4445 /*----------------------------------------------------------------------------
4446 | Rounds the extended double-precision floating-point value `a' to an integer,
4447 | and returns the result as an extended quadruple-precision floating-point
4448 | value.  The operation is performed according to the IEC/IEEE Standard for
4449 | Binary Floating-Point Arithmetic.
4450 *----------------------------------------------------------------------------*/
4451 
4452 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4453 {
4454     flag aSign;
4455     int32 aExp;
4456     uint64_t lastBitMask, roundBitsMask;
4457     int8 roundingMode;
4458     floatx80 z;
4459 
4460     aExp = extractFloatx80Exp( a );
4461     if ( 0x403E <= aExp ) {
4462         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4463             return propagateFloatx80NaN( a, a STATUS_VAR );
4464         }
4465         return a;
4466     }
4467     if ( aExp < 0x3FFF ) {
4468         if (    ( aExp == 0 )
4469              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4470             return a;
4471         }
4472         STATUS(float_exception_flags) |= float_flag_inexact;
4473         aSign = extractFloatx80Sign( a );
4474         switch ( STATUS(float_rounding_mode) ) {
4475          case float_round_nearest_even:
4476             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4477                ) {
4478                 return
4479                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4480             }
4481             break;
4482          case float_round_down:
4483             return
4484                   aSign ?
4485                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4486                 : packFloatx80( 0, 0, 0 );
4487          case float_round_up:
4488             return
4489                   aSign ? packFloatx80( 1, 0, 0 )
4490                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4491         }
4492         return packFloatx80( aSign, 0, 0 );
4493     }
4494     lastBitMask = 1;
4495     lastBitMask <<= 0x403E - aExp;
4496     roundBitsMask = lastBitMask - 1;
4497     z = a;
4498     roundingMode = STATUS(float_rounding_mode);
4499     if ( roundingMode == float_round_nearest_even ) {
4500         z.low += lastBitMask>>1;
4501         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4502     }
4503     else if ( roundingMode != float_round_to_zero ) {
4504         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4505             z.low += roundBitsMask;
4506         }
4507     }
4508     z.low &= ~ roundBitsMask;
4509     if ( z.low == 0 ) {
4510         ++z.high;
4511         z.low = LIT64( 0x8000000000000000 );
4512     }
4513     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4514     return z;
4515 
4516 }
4517 
4518 /*----------------------------------------------------------------------------
4519 | Returns the result of adding the absolute values of the extended double-
4520 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4521 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4522 | The addition is performed according to the IEC/IEEE Standard for Binary
4523 | Floating-Point Arithmetic.
4524 *----------------------------------------------------------------------------*/
4525 
4526 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4527 {
4528     int32 aExp, bExp, zExp;
4529     uint64_t aSig, bSig, zSig0, zSig1;
4530     int32 expDiff;
4531 
4532     aSig = extractFloatx80Frac( a );
4533     aExp = extractFloatx80Exp( a );
4534     bSig = extractFloatx80Frac( b );
4535     bExp = extractFloatx80Exp( b );
4536     expDiff = aExp - bExp;
4537     if ( 0 < expDiff ) {
4538         if ( aExp == 0x7FFF ) {
4539             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4540             return a;
4541         }
4542         if ( bExp == 0 ) --expDiff;
4543         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4544         zExp = aExp;
4545     }
4546     else if ( expDiff < 0 ) {
4547         if ( bExp == 0x7FFF ) {
4548             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4549             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4550         }
4551         if ( aExp == 0 ) ++expDiff;
4552         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4553         zExp = bExp;
4554     }
4555     else {
4556         if ( aExp == 0x7FFF ) {
4557             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4558                 return propagateFloatx80NaN( a, b STATUS_VAR );
4559             }
4560             return a;
4561         }
4562         zSig1 = 0;
4563         zSig0 = aSig + bSig;
4564         if ( aExp == 0 ) {
4565             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4566             goto roundAndPack;
4567         }
4568         zExp = aExp;
4569         goto shiftRight1;
4570     }
4571     zSig0 = aSig + bSig;
4572     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4573  shiftRight1:
4574     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4575     zSig0 |= LIT64( 0x8000000000000000 );
4576     ++zExp;
4577  roundAndPack:
4578     return
4579         roundAndPackFloatx80(
4580             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4581 
4582 }
4583 
4584 /*----------------------------------------------------------------------------
4585 | Returns the result of subtracting the absolute values of the extended
4586 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4587 | difference is negated before being returned.  `zSign' is ignored if the
4588 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4589 | Standard for Binary Floating-Point Arithmetic.
4590 *----------------------------------------------------------------------------*/
4591 
4592 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4593 {
4594     int32 aExp, bExp, zExp;
4595     uint64_t aSig, bSig, zSig0, zSig1;
4596     int32 expDiff;
4597     floatx80 z;
4598 
4599     aSig = extractFloatx80Frac( a );
4600     aExp = extractFloatx80Exp( a );
4601     bSig = extractFloatx80Frac( b );
4602     bExp = extractFloatx80Exp( b );
4603     expDiff = aExp - bExp;
4604     if ( 0 < expDiff ) goto aExpBigger;
4605     if ( expDiff < 0 ) goto bExpBigger;
4606     if ( aExp == 0x7FFF ) {
4607         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4608             return propagateFloatx80NaN( a, b STATUS_VAR );
4609         }
4610         float_raise( float_flag_invalid STATUS_VAR);
4611         z.low = floatx80_default_nan_low;
4612         z.high = floatx80_default_nan_high;
4613         return z;
4614     }
4615     if ( aExp == 0 ) {
4616         aExp = 1;
4617         bExp = 1;
4618     }
4619     zSig1 = 0;
4620     if ( bSig < aSig ) goto aBigger;
4621     if ( aSig < bSig ) goto bBigger;
4622     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4623  bExpBigger:
4624     if ( bExp == 0x7FFF ) {
4625         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4626         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4627     }
4628     if ( aExp == 0 ) ++expDiff;
4629     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4630  bBigger:
4631     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4632     zExp = bExp;
4633     zSign ^= 1;
4634     goto normalizeRoundAndPack;
4635  aExpBigger:
4636     if ( aExp == 0x7FFF ) {
4637         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4638         return a;
4639     }
4640     if ( bExp == 0 ) --expDiff;
4641     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4642  aBigger:
4643     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4644     zExp = aExp;
4645  normalizeRoundAndPack:
4646     return
4647         normalizeRoundAndPackFloatx80(
4648             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4649 
4650 }
4651 
4652 /*----------------------------------------------------------------------------
4653 | Returns the result of adding the extended double-precision floating-point
4654 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4655 | Standard for Binary Floating-Point Arithmetic.
4656 *----------------------------------------------------------------------------*/
4657 
4658 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4659 {
4660     flag aSign, bSign;
4661 
4662     aSign = extractFloatx80Sign( a );
4663     bSign = extractFloatx80Sign( b );
4664     if ( aSign == bSign ) {
4665         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4666     }
4667     else {
4668         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4669     }
4670 
4671 }
4672 
4673 /*----------------------------------------------------------------------------
4674 | Returns the result of subtracting the extended double-precision floating-
4675 | point values `a' and `b'.  The operation is performed according to the
4676 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4677 *----------------------------------------------------------------------------*/
4678 
4679 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4680 {
4681     flag aSign, bSign;
4682 
4683     aSign = extractFloatx80Sign( a );
4684     bSign = extractFloatx80Sign( b );
4685     if ( aSign == bSign ) {
4686         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4687     }
4688     else {
4689         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4690     }
4691 
4692 }
4693 
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of multiplying the extended double-precision floating-
4696 | point values `a' and `b'.  The operation is performed according to the
4697 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4698 *----------------------------------------------------------------------------*/
4699 
4700 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4701 {
4702     flag aSign, bSign, zSign;
4703     int32 aExp, bExp, zExp;
4704     uint64_t aSig, bSig, zSig0, zSig1;
4705     floatx80 z;
4706 
4707     aSig = extractFloatx80Frac( a );
4708     aExp = extractFloatx80Exp( a );
4709     aSign = extractFloatx80Sign( a );
4710     bSig = extractFloatx80Frac( b );
4711     bExp = extractFloatx80Exp( b );
4712     bSign = extractFloatx80Sign( b );
4713     zSign = aSign ^ bSign;
4714     if ( aExp == 0x7FFF ) {
4715         if (    (uint64_t) ( aSig<<1 )
4716              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4717             return propagateFloatx80NaN( a, b STATUS_VAR );
4718         }
4719         if ( ( bExp | bSig ) == 0 ) goto invalid;
4720         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4721     }
4722     if ( bExp == 0x7FFF ) {
4723         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4724         if ( ( aExp | aSig ) == 0 ) {
4725  invalid:
4726             float_raise( float_flag_invalid STATUS_VAR);
4727             z.low = floatx80_default_nan_low;
4728             z.high = floatx80_default_nan_high;
4729             return z;
4730         }
4731         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4732     }
4733     if ( aExp == 0 ) {
4734         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4735         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4736     }
4737     if ( bExp == 0 ) {
4738         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4739         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4740     }
4741     zExp = aExp + bExp - 0x3FFE;
4742     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4743     if ( 0 < (int64_t) zSig0 ) {
4744         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4745         --zExp;
4746     }
4747     return
4748         roundAndPackFloatx80(
4749             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4750 
4751 }
4752 
4753 /*----------------------------------------------------------------------------
4754 | Returns the result of dividing the extended double-precision floating-point
4755 | value `a' by the corresponding value `b'.  The operation is performed
4756 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4757 *----------------------------------------------------------------------------*/
4758 
4759 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4760 {
4761     flag aSign, bSign, zSign;
4762     int32 aExp, bExp, zExp;
4763     uint64_t aSig, bSig, zSig0, zSig1;
4764     uint64_t rem0, rem1, rem2, term0, term1, term2;
4765     floatx80 z;
4766 
4767     aSig = extractFloatx80Frac( a );
4768     aExp = extractFloatx80Exp( a );
4769     aSign = extractFloatx80Sign( a );
4770     bSig = extractFloatx80Frac( b );
4771     bExp = extractFloatx80Exp( b );
4772     bSign = extractFloatx80Sign( b );
4773     zSign = aSign ^ bSign;
4774     if ( aExp == 0x7FFF ) {
4775         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4776         if ( bExp == 0x7FFF ) {
4777             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4778             goto invalid;
4779         }
4780         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4781     }
4782     if ( bExp == 0x7FFF ) {
4783         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4784         return packFloatx80( zSign, 0, 0 );
4785     }
4786     if ( bExp == 0 ) {
4787         if ( bSig == 0 ) {
4788             if ( ( aExp | aSig ) == 0 ) {
4789  invalid:
4790                 float_raise( float_flag_invalid STATUS_VAR);
4791                 z.low = floatx80_default_nan_low;
4792                 z.high = floatx80_default_nan_high;
4793                 return z;
4794             }
4795             float_raise( float_flag_divbyzero STATUS_VAR);
4796             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4797         }
4798         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4799     }
4800     if ( aExp == 0 ) {
4801         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4802         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4803     }
4804     zExp = aExp - bExp + 0x3FFE;
4805     rem1 = 0;
4806     if ( bSig <= aSig ) {
4807         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4808         ++zExp;
4809     }
4810     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4811     mul64To128( bSig, zSig0, &term0, &term1 );
4812     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4813     while ( (int64_t) rem0 < 0 ) {
4814         --zSig0;
4815         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4816     }
4817     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4818     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4819         mul64To128( bSig, zSig1, &term1, &term2 );
4820         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4821         while ( (int64_t) rem1 < 0 ) {
4822             --zSig1;
4823             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4824         }
4825         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4826     }
4827     return
4828         roundAndPackFloatx80(
4829             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4830 
4831 }
4832 
4833 /*----------------------------------------------------------------------------
4834 | Returns the remainder of the extended double-precision floating-point value
4835 | `a' with respect to the corresponding value `b'.  The operation is performed
4836 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4837 *----------------------------------------------------------------------------*/
4838 
4839 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4840 {
4841     flag aSign, zSign;
4842     int32 aExp, bExp, expDiff;
4843     uint64_t aSig0, aSig1, bSig;
4844     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4845     floatx80 z;
4846 
4847     aSig0 = extractFloatx80Frac( a );
4848     aExp = extractFloatx80Exp( a );
4849     aSign = extractFloatx80Sign( a );
4850     bSig = extractFloatx80Frac( b );
4851     bExp = extractFloatx80Exp( b );
4852     if ( aExp == 0x7FFF ) {
4853         if (    (uint64_t) ( aSig0<<1 )
4854              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4855             return propagateFloatx80NaN( a, b STATUS_VAR );
4856         }
4857         goto invalid;
4858     }
4859     if ( bExp == 0x7FFF ) {
4860         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4861         return a;
4862     }
4863     if ( bExp == 0 ) {
4864         if ( bSig == 0 ) {
4865  invalid:
4866             float_raise( float_flag_invalid STATUS_VAR);
4867             z.low = floatx80_default_nan_low;
4868             z.high = floatx80_default_nan_high;
4869             return z;
4870         }
4871         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4872     }
4873     if ( aExp == 0 ) {
4874         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4875         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4876     }
4877     bSig |= LIT64( 0x8000000000000000 );
4878     zSign = aSign;
4879     expDiff = aExp - bExp;
4880     aSig1 = 0;
4881     if ( expDiff < 0 ) {
4882         if ( expDiff < -1 ) return a;
4883         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4884         expDiff = 0;
4885     }
4886     q = ( bSig <= aSig0 );
4887     if ( q ) aSig0 -= bSig;
4888     expDiff -= 64;
4889     while ( 0 < expDiff ) {
4890         q = estimateDiv128To64( aSig0, aSig1, bSig );
4891         q = ( 2 < q ) ? q - 2 : 0;
4892         mul64To128( bSig, q, &term0, &term1 );
4893         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4894         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4895         expDiff -= 62;
4896     }
4897     expDiff += 64;
4898     if ( 0 < expDiff ) {
4899         q = estimateDiv128To64( aSig0, aSig1, bSig );
4900         q = ( 2 < q ) ? q - 2 : 0;
4901         q >>= 64 - expDiff;
4902         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4903         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4904         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4905         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4906             ++q;
4907             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4908         }
4909     }
4910     else {
4911         term1 = 0;
4912         term0 = bSig;
4913     }
4914     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4915     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4916          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4917               && ( q & 1 ) )
4918        ) {
4919         aSig0 = alternateASig0;
4920         aSig1 = alternateASig1;
4921         zSign = ! zSign;
4922     }
4923     return
4924         normalizeRoundAndPackFloatx80(
4925             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4926 
4927 }
4928 
4929 /*----------------------------------------------------------------------------
4930 | Returns the square root of the extended double-precision floating-point
4931 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4932 | for Binary Floating-Point Arithmetic.
4933 *----------------------------------------------------------------------------*/
4934 
4935 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4936 {
4937     flag aSign;
4938     int32 aExp, zExp;
4939     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4940     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4941     floatx80 z;
4942 
4943     aSig0 = extractFloatx80Frac( a );
4944     aExp = extractFloatx80Exp( a );
4945     aSign = extractFloatx80Sign( a );
4946     if ( aExp == 0x7FFF ) {
4947         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4948         if ( ! aSign ) return a;
4949         goto invalid;
4950     }
4951     if ( aSign ) {
4952         if ( ( aExp | aSig0 ) == 0 ) return a;
4953  invalid:
4954         float_raise( float_flag_invalid STATUS_VAR);
4955         z.low = floatx80_default_nan_low;
4956         z.high = floatx80_default_nan_high;
4957         return z;
4958     }
4959     if ( aExp == 0 ) {
4960         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4961         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4962     }
4963     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4964     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4965     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4966     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967     doubleZSig0 = zSig0<<1;
4968     mul64To128( zSig0, zSig0, &term0, &term1 );
4969     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970     while ( (int64_t) rem0 < 0 ) {
4971         --zSig0;
4972         doubleZSig0 -= 2;
4973         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4974     }
4975     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4977         if ( zSig1 == 0 ) zSig1 = 1;
4978         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980         mul64To128( zSig1, zSig1, &term2, &term3 );
4981         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982         while ( (int64_t) rem1 < 0 ) {
4983             --zSig1;
4984             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985             term3 |= 1;
4986             term2 |= doubleZSig0;
4987             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988         }
4989         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4990     }
4991     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4992     zSig0 |= doubleZSig0;
4993     return
4994         roundAndPackFloatx80(
4995             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4996 
4997 }
4998 
4999 /*----------------------------------------------------------------------------
5000 | Returns 1 if the extended double-precision floating-point value `a' is equal
5001 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5002 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5003 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5004 *----------------------------------------------------------------------------*/
5005 
5006 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5007 {
5008 
5009     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5010               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5011          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5012               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5013        ) {
5014         float_raise( float_flag_invalid STATUS_VAR);
5015         return 0;
5016     }
5017     return
5018            ( a.low == b.low )
5019         && (    ( a.high == b.high )
5020              || (    ( a.low == 0 )
5021                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5022            );
5023 
5024 }
5025 
5026 /*----------------------------------------------------------------------------
5027 | Returns 1 if the extended double-precision floating-point value `a' is
5028 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5029 | invalid exception is raised if either operand is a NaN.  The comparison is
5030 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5031 | Arithmetic.
5032 *----------------------------------------------------------------------------*/
5033 
5034 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5035 {
5036     flag aSign, bSign;
5037 
5038     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5039               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5040          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5041               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5042        ) {
5043         float_raise( float_flag_invalid STATUS_VAR);
5044         return 0;
5045     }
5046     aSign = extractFloatx80Sign( a );
5047     bSign = extractFloatx80Sign( b );
5048     if ( aSign != bSign ) {
5049         return
5050                aSign
5051             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5052                  == 0 );
5053     }
5054     return
5055           aSign ? le128( b.high, b.low, a.high, a.low )
5056         : le128( a.high, a.low, b.high, b.low );
5057 
5058 }
5059 
5060 /*----------------------------------------------------------------------------
5061 | Returns 1 if the extended double-precision floating-point value `a' is
5062 | less than the corresponding value `b', and 0 otherwise.  The invalid
5063 | exception is raised if either operand is a NaN.  The comparison is performed
5064 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5065 *----------------------------------------------------------------------------*/
5066 
5067 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5068 {
5069     flag aSign, bSign;
5070 
5071     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5072               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5073          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5074               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5075        ) {
5076         float_raise( float_flag_invalid STATUS_VAR);
5077         return 0;
5078     }
5079     aSign = extractFloatx80Sign( a );
5080     bSign = extractFloatx80Sign( b );
5081     if ( aSign != bSign ) {
5082         return
5083                aSign
5084             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5085                  != 0 );
5086     }
5087     return
5088           aSign ? lt128( b.high, b.low, a.high, a.low )
5089         : lt128( a.high, a.low, b.high, b.low );
5090 
5091 }
5092 
5093 /*----------------------------------------------------------------------------
5094 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5095 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5096 | either operand is a NaN.   The comparison is performed according to the
5097 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5100 {
5101     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5102               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5103          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5104               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5105        ) {
5106         float_raise( float_flag_invalid STATUS_VAR);
5107         return 1;
5108     }
5109     return 0;
5110 }
5111 
5112 /*----------------------------------------------------------------------------
5113 | Returns 1 if the extended double-precision floating-point value `a' is
5114 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5115 | cause an exception.  The comparison is performed according to the IEC/IEEE
5116 | Standard for Binary Floating-Point Arithmetic.
5117 *----------------------------------------------------------------------------*/
5118 
5119 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5120 {
5121 
5122     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5123               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5124          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5125               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5126        ) {
5127         if (    floatx80_is_signaling_nan( a )
5128              || floatx80_is_signaling_nan( b ) ) {
5129             float_raise( float_flag_invalid STATUS_VAR);
5130         }
5131         return 0;
5132     }
5133     return
5134            ( a.low == b.low )
5135         && (    ( a.high == b.high )
5136              || (    ( a.low == 0 )
5137                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5138            );
5139 
5140 }
5141 
5142 /*----------------------------------------------------------------------------
5143 | Returns 1 if the extended double-precision floating-point value `a' is less
5144 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5145 | do not cause an exception.  Otherwise, the comparison is performed according
5146 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5147 *----------------------------------------------------------------------------*/
5148 
5149 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5150 {
5151     flag aSign, bSign;
5152 
5153     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5154               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5155          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5156               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5157        ) {
5158         if (    floatx80_is_signaling_nan( a )
5159              || floatx80_is_signaling_nan( b ) ) {
5160             float_raise( float_flag_invalid STATUS_VAR);
5161         }
5162         return 0;
5163     }
5164     aSign = extractFloatx80Sign( a );
5165     bSign = extractFloatx80Sign( b );
5166     if ( aSign != bSign ) {
5167         return
5168                aSign
5169             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170                  == 0 );
5171     }
5172     return
5173           aSign ? le128( b.high, b.low, a.high, a.low )
5174         : le128( a.high, a.low, b.high, b.low );
5175 
5176 }
5177 
5178 /*----------------------------------------------------------------------------
5179 | Returns 1 if the extended double-precision floating-point value `a' is less
5180 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5181 | an exception.  Otherwise, the comparison is performed according to the
5182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5183 *----------------------------------------------------------------------------*/
5184 
5185 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5186 {
5187     flag aSign, bSign;
5188 
5189     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5190               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5191          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5192               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5193        ) {
5194         if (    floatx80_is_signaling_nan( a )
5195              || floatx80_is_signaling_nan( b ) ) {
5196             float_raise( float_flag_invalid STATUS_VAR);
5197         }
5198         return 0;
5199     }
5200     aSign = extractFloatx80Sign( a );
5201     bSign = extractFloatx80Sign( b );
5202     if ( aSign != bSign ) {
5203         return
5204                aSign
5205             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5206                  != 0 );
5207     }
5208     return
5209           aSign ? lt128( b.high, b.low, a.high, a.low )
5210         : lt128( a.high, a.low, b.high, b.low );
5211 
5212 }
5213 
5214 /*----------------------------------------------------------------------------
5215 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5216 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5217 | The comparison is performed according to the IEC/IEEE Standard for Binary
5218 | Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5221 {
5222     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5223               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5224          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5225               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5226        ) {
5227         if (    floatx80_is_signaling_nan( a )
5228              || floatx80_is_signaling_nan( b ) ) {
5229             float_raise( float_flag_invalid STATUS_VAR);
5230         }
5231         return 1;
5232     }
5233     return 0;
5234 }
5235 
5236 /*----------------------------------------------------------------------------
5237 | Returns the result of converting the quadruple-precision floating-point
5238 | value `a' to the 32-bit two's complement integer format.  The conversion
5239 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5240 | Arithmetic---which means in particular that the conversion is rounded
5241 | according to the current rounding mode.  If `a' is a NaN, the largest
5242 | positive integer is returned.  Otherwise, if the conversion overflows, the
5243 | largest integer with the same sign as `a' is returned.
5244 *----------------------------------------------------------------------------*/
5245 
5246 int32 float128_to_int32( float128 a STATUS_PARAM )
5247 {
5248     flag aSign;
5249     int32 aExp, shiftCount;
5250     uint64_t aSig0, aSig1;
5251 
5252     aSig1 = extractFloat128Frac1( a );
5253     aSig0 = extractFloat128Frac0( a );
5254     aExp = extractFloat128Exp( a );
5255     aSign = extractFloat128Sign( a );
5256     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5257     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5258     aSig0 |= ( aSig1 != 0 );
5259     shiftCount = 0x4028 - aExp;
5260     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5261     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5262 
5263 }
5264 
5265 /*----------------------------------------------------------------------------
5266 | Returns the result of converting the quadruple-precision floating-point
5267 | value `a' to the 32-bit two's complement integer format.  The conversion
5268 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5269 | Arithmetic, except that the conversion is always rounded toward zero.  If
5270 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5271 | conversion overflows, the largest integer with the same sign as `a' is
5272 | returned.
5273 *----------------------------------------------------------------------------*/
5274 
5275 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5276 {
5277     flag aSign;
5278     int32 aExp, shiftCount;
5279     uint64_t aSig0, aSig1, savedASig;
5280     int32 z;
5281 
5282     aSig1 = extractFloat128Frac1( a );
5283     aSig0 = extractFloat128Frac0( a );
5284     aExp = extractFloat128Exp( a );
5285     aSign = extractFloat128Sign( a );
5286     aSig0 |= ( aSig1 != 0 );
5287     if ( 0x401E < aExp ) {
5288         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5289         goto invalid;
5290     }
5291     else if ( aExp < 0x3FFF ) {
5292         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5293         return 0;
5294     }
5295     aSig0 |= LIT64( 0x0001000000000000 );
5296     shiftCount = 0x402F - aExp;
5297     savedASig = aSig0;
5298     aSig0 >>= shiftCount;
5299     z = aSig0;
5300     if ( aSign ) z = - z;
5301     if ( ( z < 0 ) ^ aSign ) {
5302  invalid:
5303         float_raise( float_flag_invalid STATUS_VAR);
5304         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5305     }
5306     if ( ( aSig0<<shiftCount ) != savedASig ) {
5307         STATUS(float_exception_flags) |= float_flag_inexact;
5308     }
5309     return z;
5310 
5311 }
5312 
5313 /*----------------------------------------------------------------------------
5314 | Returns the result of converting the quadruple-precision floating-point
5315 | value `a' to the 64-bit two's complement integer format.  The conversion
5316 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5317 | Arithmetic---which means in particular that the conversion is rounded
5318 | according to the current rounding mode.  If `a' is a NaN, the largest
5319 | positive integer is returned.  Otherwise, if the conversion overflows, the
5320 | largest integer with the same sign as `a' is returned.
5321 *----------------------------------------------------------------------------*/
5322 
5323 int64 float128_to_int64( float128 a STATUS_PARAM )
5324 {
5325     flag aSign;
5326     int32 aExp, shiftCount;
5327     uint64_t aSig0, aSig1;
5328 
5329     aSig1 = extractFloat128Frac1( a );
5330     aSig0 = extractFloat128Frac0( a );
5331     aExp = extractFloat128Exp( a );
5332     aSign = extractFloat128Sign( a );
5333     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5334     shiftCount = 0x402F - aExp;
5335     if ( shiftCount <= 0 ) {
5336         if ( 0x403E < aExp ) {
5337             float_raise( float_flag_invalid STATUS_VAR);
5338             if (    ! aSign
5339                  || (    ( aExp == 0x7FFF )
5340                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5341                     )
5342                ) {
5343                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5344             }
5345             return (int64_t) LIT64( 0x8000000000000000 );
5346         }
5347         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5348     }
5349     else {
5350         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5351     }
5352     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5353 
5354 }
5355 
5356 /*----------------------------------------------------------------------------
5357 | Returns the result of converting the quadruple-precision floating-point
5358 | value `a' to the 64-bit two's complement integer format.  The conversion
5359 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5360 | Arithmetic, except that the conversion is always rounded toward zero.
5361 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5362 | the conversion overflows, the largest integer with the same sign as `a' is
5363 | returned.
5364 *----------------------------------------------------------------------------*/
5365 
5366 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5367 {
5368     flag aSign;
5369     int32 aExp, shiftCount;
5370     uint64_t aSig0, aSig1;
5371     int64 z;
5372 
5373     aSig1 = extractFloat128Frac1( a );
5374     aSig0 = extractFloat128Frac0( a );
5375     aExp = extractFloat128Exp( a );
5376     aSign = extractFloat128Sign( a );
5377     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5378     shiftCount = aExp - 0x402F;
5379     if ( 0 < shiftCount ) {
5380         if ( 0x403E <= aExp ) {
5381             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5382             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5383                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5384                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5385             }
5386             else {
5387                 float_raise( float_flag_invalid STATUS_VAR);
5388                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5389                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5390                 }
5391             }
5392             return (int64_t) LIT64( 0x8000000000000000 );
5393         }
5394         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5395         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5396             STATUS(float_exception_flags) |= float_flag_inexact;
5397         }
5398     }
5399     else {
5400         if ( aExp < 0x3FFF ) {
5401             if ( aExp | aSig0 | aSig1 ) {
5402                 STATUS(float_exception_flags) |= float_flag_inexact;
5403             }
5404             return 0;
5405         }
5406         z = aSig0>>( - shiftCount );
5407         if (    aSig1
5408              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5409             STATUS(float_exception_flags) |= float_flag_inexact;
5410         }
5411     }
5412     if ( aSign ) z = - z;
5413     return z;
5414 
5415 }
5416 
5417 /*----------------------------------------------------------------------------
5418 | Returns the result of converting the quadruple-precision floating-point
5419 | value `a' to the single-precision floating-point format.  The conversion
5420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5421 | Arithmetic.
5422 *----------------------------------------------------------------------------*/
5423 
5424 float32 float128_to_float32( float128 a STATUS_PARAM )
5425 {
5426     flag aSign;
5427     int32 aExp;
5428     uint64_t aSig0, aSig1;
5429     uint32_t zSig;
5430 
5431     aSig1 = extractFloat128Frac1( a );
5432     aSig0 = extractFloat128Frac0( a );
5433     aExp = extractFloat128Exp( a );
5434     aSign = extractFloat128Sign( a );
5435     if ( aExp == 0x7FFF ) {
5436         if ( aSig0 | aSig1 ) {
5437             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5438         }
5439         return packFloat32( aSign, 0xFF, 0 );
5440     }
5441     aSig0 |= ( aSig1 != 0 );
5442     shift64RightJamming( aSig0, 18, &aSig0 );
5443     zSig = aSig0;
5444     if ( aExp || zSig ) {
5445         zSig |= 0x40000000;
5446         aExp -= 0x3F81;
5447     }
5448     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5449 
5450 }
5451 
5452 /*----------------------------------------------------------------------------
5453 | Returns the result of converting the quadruple-precision floating-point
5454 | value `a' to the double-precision floating-point format.  The conversion
5455 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5456 | Arithmetic.
5457 *----------------------------------------------------------------------------*/
5458 
5459 float64 float128_to_float64( float128 a STATUS_PARAM )
5460 {
5461     flag aSign;
5462     int32 aExp;
5463     uint64_t aSig0, aSig1;
5464 
5465     aSig1 = extractFloat128Frac1( a );
5466     aSig0 = extractFloat128Frac0( a );
5467     aExp = extractFloat128Exp( a );
5468     aSign = extractFloat128Sign( a );
5469     if ( aExp == 0x7FFF ) {
5470         if ( aSig0 | aSig1 ) {
5471             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5472         }
5473         return packFloat64( aSign, 0x7FF, 0 );
5474     }
5475     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5476     aSig0 |= ( aSig1 != 0 );
5477     if ( aExp || aSig0 ) {
5478         aSig0 |= LIT64( 0x4000000000000000 );
5479         aExp -= 0x3C01;
5480     }
5481     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5482 
5483 }
5484 
5485 /*----------------------------------------------------------------------------
5486 | Returns the result of converting the quadruple-precision floating-point
5487 | value `a' to the extended double-precision floating-point format.  The
5488 | conversion is performed according to the IEC/IEEE Standard for Binary
5489 | Floating-Point Arithmetic.
5490 *----------------------------------------------------------------------------*/
5491 
5492 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5493 {
5494     flag aSign;
5495     int32 aExp;
5496     uint64_t aSig0, aSig1;
5497 
5498     aSig1 = extractFloat128Frac1( a );
5499     aSig0 = extractFloat128Frac0( a );
5500     aExp = extractFloat128Exp( a );
5501     aSign = extractFloat128Sign( a );
5502     if ( aExp == 0x7FFF ) {
5503         if ( aSig0 | aSig1 ) {
5504             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5505         }
5506         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5507     }
5508     if ( aExp == 0 ) {
5509         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5510         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5511     }
5512     else {
5513         aSig0 |= LIT64( 0x0001000000000000 );
5514     }
5515     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5516     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5517 
5518 }
5519 
5520 /*----------------------------------------------------------------------------
5521 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5522 | returns the result as a quadruple-precision floating-point value.  The
5523 | operation is performed according to the IEC/IEEE Standard for Binary
5524 | Floating-Point Arithmetic.
5525 *----------------------------------------------------------------------------*/
5526 
5527 float128 float128_round_to_int( float128 a STATUS_PARAM )
5528 {
5529     flag aSign;
5530     int32 aExp;
5531     uint64_t lastBitMask, roundBitsMask;
5532     int8 roundingMode;
5533     float128 z;
5534 
5535     aExp = extractFloat128Exp( a );
5536     if ( 0x402F <= aExp ) {
5537         if ( 0x406F <= aExp ) {
5538             if (    ( aExp == 0x7FFF )
5539                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5540                ) {
5541                 return propagateFloat128NaN( a, a STATUS_VAR );
5542             }
5543             return a;
5544         }
5545         lastBitMask = 1;
5546         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5547         roundBitsMask = lastBitMask - 1;
5548         z = a;
5549         roundingMode = STATUS(float_rounding_mode);
5550         if ( roundingMode == float_round_nearest_even ) {
5551             if ( lastBitMask ) {
5552                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5553                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5554             }
5555             else {
5556                 if ( (int64_t) z.low < 0 ) {
5557                     ++z.high;
5558                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5559                 }
5560             }
5561         }
5562         else if ( roundingMode != float_round_to_zero ) {
5563             if (   extractFloat128Sign( z )
5564                  ^ ( roundingMode == float_round_up ) ) {
5565                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5566             }
5567         }
5568         z.low &= ~ roundBitsMask;
5569     }
5570     else {
5571         if ( aExp < 0x3FFF ) {
5572             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5573             STATUS(float_exception_flags) |= float_flag_inexact;
5574             aSign = extractFloat128Sign( a );
5575             switch ( STATUS(float_rounding_mode) ) {
5576              case float_round_nearest_even:
5577                 if (    ( aExp == 0x3FFE )
5578                      && (   extractFloat128Frac0( a )
5579                           | extractFloat128Frac1( a ) )
5580                    ) {
5581                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5582                 }
5583                 break;
5584              case float_round_down:
5585                 return
5586                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5587                     : packFloat128( 0, 0, 0, 0 );
5588              case float_round_up:
5589                 return
5590                       aSign ? packFloat128( 1, 0, 0, 0 )
5591                     : packFloat128( 0, 0x3FFF, 0, 0 );
5592             }
5593             return packFloat128( aSign, 0, 0, 0 );
5594         }
5595         lastBitMask = 1;
5596         lastBitMask <<= 0x402F - aExp;
5597         roundBitsMask = lastBitMask - 1;
5598         z.low = 0;
5599         z.high = a.high;
5600         roundingMode = STATUS(float_rounding_mode);
5601         if ( roundingMode == float_round_nearest_even ) {
5602             z.high += lastBitMask>>1;
5603             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5604                 z.high &= ~ lastBitMask;
5605             }
5606         }
5607         else if ( roundingMode != float_round_to_zero ) {
5608             if (   extractFloat128Sign( z )
5609                  ^ ( roundingMode == float_round_up ) ) {
5610                 z.high |= ( a.low != 0 );
5611                 z.high += roundBitsMask;
5612             }
5613         }
5614         z.high &= ~ roundBitsMask;
5615     }
5616     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5617         STATUS(float_exception_flags) |= float_flag_inexact;
5618     }
5619     return z;
5620 
5621 }
5622 
5623 /*----------------------------------------------------------------------------
5624 | Returns the result of adding the absolute values of the quadruple-precision
5625 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5626 | before being returned.  `zSign' is ignored if the result is a NaN.
5627 | The addition is performed according to the IEC/IEEE Standard for Binary
5628 | Floating-Point Arithmetic.
5629 *----------------------------------------------------------------------------*/
5630 
5631 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5632 {
5633     int32 aExp, bExp, zExp;
5634     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5635     int32 expDiff;
5636 
5637     aSig1 = extractFloat128Frac1( a );
5638     aSig0 = extractFloat128Frac0( a );
5639     aExp = extractFloat128Exp( a );
5640     bSig1 = extractFloat128Frac1( b );
5641     bSig0 = extractFloat128Frac0( b );
5642     bExp = extractFloat128Exp( b );
5643     expDiff = aExp - bExp;
5644     if ( 0 < expDiff ) {
5645         if ( aExp == 0x7FFF ) {
5646             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5647             return a;
5648         }
5649         if ( bExp == 0 ) {
5650             --expDiff;
5651         }
5652         else {
5653             bSig0 |= LIT64( 0x0001000000000000 );
5654         }
5655         shift128ExtraRightJamming(
5656             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5657         zExp = aExp;
5658     }
5659     else if ( expDiff < 0 ) {
5660         if ( bExp == 0x7FFF ) {
5661             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5662             return packFloat128( zSign, 0x7FFF, 0, 0 );
5663         }
5664         if ( aExp == 0 ) {
5665             ++expDiff;
5666         }
5667         else {
5668             aSig0 |= LIT64( 0x0001000000000000 );
5669         }
5670         shift128ExtraRightJamming(
5671             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5672         zExp = bExp;
5673     }
5674     else {
5675         if ( aExp == 0x7FFF ) {
5676             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5677                 return propagateFloat128NaN( a, b STATUS_VAR );
5678             }
5679             return a;
5680         }
5681         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5682         if ( aExp == 0 ) {
5683             if (STATUS(flush_to_zero)) {
5684                 if (zSig0 | zSig1) {
5685                     float_raise(float_flag_output_denormal STATUS_VAR);
5686                 }
5687                 return packFloat128(zSign, 0, 0, 0);
5688             }
5689             return packFloat128( zSign, 0, zSig0, zSig1 );
5690         }
5691         zSig2 = 0;
5692         zSig0 |= LIT64( 0x0002000000000000 );
5693         zExp = aExp;
5694         goto shiftRight1;
5695     }
5696     aSig0 |= LIT64( 0x0001000000000000 );
5697     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5698     --zExp;
5699     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5700     ++zExp;
5701  shiftRight1:
5702     shift128ExtraRightJamming(
5703         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5704  roundAndPack:
5705     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5706 
5707 }
5708 
5709 /*----------------------------------------------------------------------------
5710 | Returns the result of subtracting the absolute values of the quadruple-
5711 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5712 | difference is negated before being returned.  `zSign' is ignored if the
5713 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5714 | Standard for Binary Floating-Point Arithmetic.
5715 *----------------------------------------------------------------------------*/
5716 
5717 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5718 {
5719     int32 aExp, bExp, zExp;
5720     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5721     int32 expDiff;
5722     float128 z;
5723 
5724     aSig1 = extractFloat128Frac1( a );
5725     aSig0 = extractFloat128Frac0( a );
5726     aExp = extractFloat128Exp( a );
5727     bSig1 = extractFloat128Frac1( b );
5728     bSig0 = extractFloat128Frac0( b );
5729     bExp = extractFloat128Exp( b );
5730     expDiff = aExp - bExp;
5731     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5732     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5733     if ( 0 < expDiff ) goto aExpBigger;
5734     if ( expDiff < 0 ) goto bExpBigger;
5735     if ( aExp == 0x7FFF ) {
5736         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5737             return propagateFloat128NaN( a, b STATUS_VAR );
5738         }
5739         float_raise( float_flag_invalid STATUS_VAR);
5740         z.low = float128_default_nan_low;
5741         z.high = float128_default_nan_high;
5742         return z;
5743     }
5744     if ( aExp == 0 ) {
5745         aExp = 1;
5746         bExp = 1;
5747     }
5748     if ( bSig0 < aSig0 ) goto aBigger;
5749     if ( aSig0 < bSig0 ) goto bBigger;
5750     if ( bSig1 < aSig1 ) goto aBigger;
5751     if ( aSig1 < bSig1 ) goto bBigger;
5752     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5753  bExpBigger:
5754     if ( bExp == 0x7FFF ) {
5755         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5756         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5757     }
5758     if ( aExp == 0 ) {
5759         ++expDiff;
5760     }
5761     else {
5762         aSig0 |= LIT64( 0x4000000000000000 );
5763     }
5764     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5765     bSig0 |= LIT64( 0x4000000000000000 );
5766  bBigger:
5767     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5768     zExp = bExp;
5769     zSign ^= 1;
5770     goto normalizeRoundAndPack;
5771  aExpBigger:
5772     if ( aExp == 0x7FFF ) {
5773         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5774         return a;
5775     }
5776     if ( bExp == 0 ) {
5777         --expDiff;
5778     }
5779     else {
5780         bSig0 |= LIT64( 0x4000000000000000 );
5781     }
5782     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5783     aSig0 |= LIT64( 0x4000000000000000 );
5784  aBigger:
5785     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5786     zExp = aExp;
5787  normalizeRoundAndPack:
5788     --zExp;
5789     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5790 
5791 }
5792 
5793 /*----------------------------------------------------------------------------
5794 | Returns the result of adding the quadruple-precision floating-point values
5795 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5796 | for Binary Floating-Point Arithmetic.
5797 *----------------------------------------------------------------------------*/
5798 
5799 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5800 {
5801     flag aSign, bSign;
5802 
5803     aSign = extractFloat128Sign( a );
5804     bSign = extractFloat128Sign( b );
5805     if ( aSign == bSign ) {
5806         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5807     }
5808     else {
5809         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5810     }
5811 
5812 }
5813 
5814 /*----------------------------------------------------------------------------
5815 | Returns the result of subtracting the quadruple-precision floating-point
5816 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5817 | Standard for Binary Floating-Point Arithmetic.
5818 *----------------------------------------------------------------------------*/
5819 
5820 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5821 {
5822     flag aSign, bSign;
5823 
5824     aSign = extractFloat128Sign( a );
5825     bSign = extractFloat128Sign( b );
5826     if ( aSign == bSign ) {
5827         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5828     }
5829     else {
5830         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5831     }
5832 
5833 }
5834 
5835 /*----------------------------------------------------------------------------
5836 | Returns the result of multiplying the quadruple-precision floating-point
5837 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5838 | Standard for Binary Floating-Point Arithmetic.
5839 *----------------------------------------------------------------------------*/
5840 
5841 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5842 {
5843     flag aSign, bSign, zSign;
5844     int32 aExp, bExp, zExp;
5845     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5846     float128 z;
5847 
5848     aSig1 = extractFloat128Frac1( a );
5849     aSig0 = extractFloat128Frac0( a );
5850     aExp = extractFloat128Exp( a );
5851     aSign = extractFloat128Sign( a );
5852     bSig1 = extractFloat128Frac1( b );
5853     bSig0 = extractFloat128Frac0( b );
5854     bExp = extractFloat128Exp( b );
5855     bSign = extractFloat128Sign( b );
5856     zSign = aSign ^ bSign;
5857     if ( aExp == 0x7FFF ) {
5858         if (    ( aSig0 | aSig1 )
5859              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5860             return propagateFloat128NaN( a, b STATUS_VAR );
5861         }
5862         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5863         return packFloat128( zSign, 0x7FFF, 0, 0 );
5864     }
5865     if ( bExp == 0x7FFF ) {
5866         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5867         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5868  invalid:
5869             float_raise( float_flag_invalid STATUS_VAR);
5870             z.low = float128_default_nan_low;
5871             z.high = float128_default_nan_high;
5872             return z;
5873         }
5874         return packFloat128( zSign, 0x7FFF, 0, 0 );
5875     }
5876     if ( aExp == 0 ) {
5877         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5878         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5879     }
5880     if ( bExp == 0 ) {
5881         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5882         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5883     }
5884     zExp = aExp + bExp - 0x4000;
5885     aSig0 |= LIT64( 0x0001000000000000 );
5886     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5887     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5888     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5889     zSig2 |= ( zSig3 != 0 );
5890     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5891         shift128ExtraRightJamming(
5892             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5893         ++zExp;
5894     }
5895     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5896 
5897 }
5898 
5899 /*----------------------------------------------------------------------------
5900 | Returns the result of dividing the quadruple-precision floating-point value
5901 | `a' by the corresponding value `b'.  The operation is performed according to
5902 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5903 *----------------------------------------------------------------------------*/
5904 
5905 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5906 {
5907     flag aSign, bSign, zSign;
5908     int32 aExp, bExp, zExp;
5909     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5910     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5911     float128 z;
5912 
5913     aSig1 = extractFloat128Frac1( a );
5914     aSig0 = extractFloat128Frac0( a );
5915     aExp = extractFloat128Exp( a );
5916     aSign = extractFloat128Sign( a );
5917     bSig1 = extractFloat128Frac1( b );
5918     bSig0 = extractFloat128Frac0( b );
5919     bExp = extractFloat128Exp( b );
5920     bSign = extractFloat128Sign( b );
5921     zSign = aSign ^ bSign;
5922     if ( aExp == 0x7FFF ) {
5923         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5924         if ( bExp == 0x7FFF ) {
5925             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5926             goto invalid;
5927         }
5928         return packFloat128( zSign, 0x7FFF, 0, 0 );
5929     }
5930     if ( bExp == 0x7FFF ) {
5931         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5932         return packFloat128( zSign, 0, 0, 0 );
5933     }
5934     if ( bExp == 0 ) {
5935         if ( ( bSig0 | bSig1 ) == 0 ) {
5936             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5937  invalid:
5938                 float_raise( float_flag_invalid STATUS_VAR);
5939                 z.low = float128_default_nan_low;
5940                 z.high = float128_default_nan_high;
5941                 return z;
5942             }
5943             float_raise( float_flag_divbyzero STATUS_VAR);
5944             return packFloat128( zSign, 0x7FFF, 0, 0 );
5945         }
5946         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5947     }
5948     if ( aExp == 0 ) {
5949         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5950         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5951     }
5952     zExp = aExp - bExp + 0x3FFD;
5953     shortShift128Left(
5954         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5955     shortShift128Left(
5956         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5957     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5958         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5959         ++zExp;
5960     }
5961     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5962     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5963     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5964     while ( (int64_t) rem0 < 0 ) {
5965         --zSig0;
5966         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5967     }
5968     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5969     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5970         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5971         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5972         while ( (int64_t) rem1 < 0 ) {
5973             --zSig1;
5974             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5975         }
5976         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5977     }
5978     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5979     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5980 
5981 }
5982 
5983 /*----------------------------------------------------------------------------
5984 | Returns the remainder of the quadruple-precision floating-point value `a'
5985 | with respect to the corresponding value `b'.  The operation is performed
5986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5987 *----------------------------------------------------------------------------*/
5988 
5989 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5990 {
5991     flag aSign, zSign;
5992     int32 aExp, bExp, expDiff;
5993     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5994     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5995     int64_t sigMean0;
5996     float128 z;
5997 
5998     aSig1 = extractFloat128Frac1( a );
5999     aSig0 = extractFloat128Frac0( a );
6000     aExp = extractFloat128Exp( a );
6001     aSign = extractFloat128Sign( a );
6002     bSig1 = extractFloat128Frac1( b );
6003     bSig0 = extractFloat128Frac0( b );
6004     bExp = extractFloat128Exp( b );
6005     if ( aExp == 0x7FFF ) {
6006         if (    ( aSig0 | aSig1 )
6007              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6008             return propagateFloat128NaN( a, b STATUS_VAR );
6009         }
6010         goto invalid;
6011     }
6012     if ( bExp == 0x7FFF ) {
6013         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6014         return a;
6015     }
6016     if ( bExp == 0 ) {
6017         if ( ( bSig0 | bSig1 ) == 0 ) {
6018  invalid:
6019             float_raise( float_flag_invalid STATUS_VAR);
6020             z.low = float128_default_nan_low;
6021             z.high = float128_default_nan_high;
6022             return z;
6023         }
6024         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6025     }
6026     if ( aExp == 0 ) {
6027         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6028         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6029     }
6030     expDiff = aExp - bExp;
6031     if ( expDiff < -1 ) return a;
6032     shortShift128Left(
6033         aSig0 | LIT64( 0x0001000000000000 ),
6034         aSig1,
6035         15 - ( expDiff < 0 ),
6036         &aSig0,
6037         &aSig1
6038     );
6039     shortShift128Left(
6040         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6041     q = le128( bSig0, bSig1, aSig0, aSig1 );
6042     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6043     expDiff -= 64;
6044     while ( 0 < expDiff ) {
6045         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6046         q = ( 4 < q ) ? q - 4 : 0;
6047         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6048         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6049         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6050         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6051         expDiff -= 61;
6052     }
6053     if ( -64 < expDiff ) {
6054         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6055         q = ( 4 < q ) ? q - 4 : 0;
6056         q >>= - expDiff;
6057         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6058         expDiff += 52;
6059         if ( expDiff < 0 ) {
6060             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6061         }
6062         else {
6063             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6064         }
6065         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6066         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6067     }
6068     else {
6069         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6070         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6071     }
6072     do {
6073         alternateASig0 = aSig0;
6074         alternateASig1 = aSig1;
6075         ++q;
6076         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6077     } while ( 0 <= (int64_t) aSig0 );
6078     add128(
6079         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6080     if (    ( sigMean0 < 0 )
6081          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6082         aSig0 = alternateASig0;
6083         aSig1 = alternateASig1;
6084     }
6085     zSign = ( (int64_t) aSig0 < 0 );
6086     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6087     return
6088         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6089 
6090 }
6091 
6092 /*----------------------------------------------------------------------------
6093 | Returns the square root of the quadruple-precision floating-point value `a'.
6094 | The operation is performed according to the IEC/IEEE Standard for Binary
6095 | Floating-Point Arithmetic.
6096 *----------------------------------------------------------------------------*/
6097 
6098 float128 float128_sqrt( float128 a STATUS_PARAM )
6099 {
6100     flag aSign;
6101     int32 aExp, zExp;
6102     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6103     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6104     float128 z;
6105 
6106     aSig1 = extractFloat128Frac1( a );
6107     aSig0 = extractFloat128Frac0( a );
6108     aExp = extractFloat128Exp( a );
6109     aSign = extractFloat128Sign( a );
6110     if ( aExp == 0x7FFF ) {
6111         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6112         if ( ! aSign ) return a;
6113         goto invalid;
6114     }
6115     if ( aSign ) {
6116         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6117  invalid:
6118         float_raise( float_flag_invalid STATUS_VAR);
6119         z.low = float128_default_nan_low;
6120         z.high = float128_default_nan_high;
6121         return z;
6122     }
6123     if ( aExp == 0 ) {
6124         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6125         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126     }
6127     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6128     aSig0 |= LIT64( 0x0001000000000000 );
6129     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6130     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6131     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6132     doubleZSig0 = zSig0<<1;
6133     mul64To128( zSig0, zSig0, &term0, &term1 );
6134     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6135     while ( (int64_t) rem0 < 0 ) {
6136         --zSig0;
6137         doubleZSig0 -= 2;
6138         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6139     }
6140     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6141     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6142         if ( zSig1 == 0 ) zSig1 = 1;
6143         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6144         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6145         mul64To128( zSig1, zSig1, &term2, &term3 );
6146         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6147         while ( (int64_t) rem1 < 0 ) {
6148             --zSig1;
6149             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6150             term3 |= 1;
6151             term2 |= doubleZSig0;
6152             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6153         }
6154         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6155     }
6156     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6157     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6158 
6159 }
6160 
6161 /*----------------------------------------------------------------------------
6162 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6163 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6164 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6166 *----------------------------------------------------------------------------*/
6167 
6168 int float128_eq( float128 a, float128 b STATUS_PARAM )
6169 {
6170 
6171     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6172               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6173          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6174               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6175        ) {
6176         float_raise( float_flag_invalid STATUS_VAR);
6177         return 0;
6178     }
6179     return
6180            ( a.low == b.low )
6181         && (    ( a.high == b.high )
6182              || (    ( a.low == 0 )
6183                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6184            );
6185 
6186 }
6187 
6188 /*----------------------------------------------------------------------------
6189 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6190 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6191 | exception is raised if either operand is a NaN.  The comparison is performed
6192 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6193 *----------------------------------------------------------------------------*/
6194 
6195 int float128_le( float128 a, float128 b STATUS_PARAM )
6196 {
6197     flag aSign, bSign;
6198 
6199     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6200               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6201          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6202               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6203        ) {
6204         float_raise( float_flag_invalid STATUS_VAR);
6205         return 0;
6206     }
6207     aSign = extractFloat128Sign( a );
6208     bSign = extractFloat128Sign( b );
6209     if ( aSign != bSign ) {
6210         return
6211                aSign
6212             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6213                  == 0 );
6214     }
6215     return
6216           aSign ? le128( b.high, b.low, a.high, a.low )
6217         : le128( a.high, a.low, b.high, b.low );
6218 
6219 }
6220 
6221 /*----------------------------------------------------------------------------
6222 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6223 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6224 | raised if either operand is a NaN.  The comparison is performed according
6225 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6226 *----------------------------------------------------------------------------*/
6227 
6228 int float128_lt( float128 a, float128 b STATUS_PARAM )
6229 {
6230     flag aSign, bSign;
6231 
6232     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6233               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6234          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6235               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6236        ) {
6237         float_raise( float_flag_invalid STATUS_VAR);
6238         return 0;
6239     }
6240     aSign = extractFloat128Sign( a );
6241     bSign = extractFloat128Sign( b );
6242     if ( aSign != bSign ) {
6243         return
6244                aSign
6245             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6246                  != 0 );
6247     }
6248     return
6249           aSign ? lt128( b.high, b.low, a.high, a.low )
6250         : lt128( a.high, a.low, b.high, b.low );
6251 
6252 }
6253 
6254 /*----------------------------------------------------------------------------
6255 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6256 | be compared, and 0 otherwise.  The invalid exception is raised if either
6257 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6258 | Standard for Binary Floating-Point Arithmetic.
6259 *----------------------------------------------------------------------------*/
6260 
6261 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6262 {
6263     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6264               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6265          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6266               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6267        ) {
6268         float_raise( float_flag_invalid STATUS_VAR);
6269         return 1;
6270     }
6271     return 0;
6272 }
6273 
6274 /*----------------------------------------------------------------------------
6275 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6276 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6277 | exception.  The comparison is performed according to the IEC/IEEE Standard
6278 | for Binary Floating-Point Arithmetic.
6279 *----------------------------------------------------------------------------*/
6280 
6281 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6282 {
6283 
6284     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6285               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6286          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6287               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6288        ) {
6289         if (    float128_is_signaling_nan( a )
6290              || float128_is_signaling_nan( b ) ) {
6291             float_raise( float_flag_invalid STATUS_VAR);
6292         }
6293         return 0;
6294     }
6295     return
6296            ( a.low == b.low )
6297         && (    ( a.high == b.high )
6298              || (    ( a.low == 0 )
6299                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6300            );
6301 
6302 }
6303 
6304 /*----------------------------------------------------------------------------
6305 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6306 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6307 | cause an exception.  Otherwise, the comparison is performed according to the
6308 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6309 *----------------------------------------------------------------------------*/
6310 
6311 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6312 {
6313     flag aSign, bSign;
6314 
6315     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6316               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6317          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6318               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6319        ) {
6320         if (    float128_is_signaling_nan( a )
6321              || float128_is_signaling_nan( b ) ) {
6322             float_raise( float_flag_invalid STATUS_VAR);
6323         }
6324         return 0;
6325     }
6326     aSign = extractFloat128Sign( a );
6327     bSign = extractFloat128Sign( b );
6328     if ( aSign != bSign ) {
6329         return
6330                aSign
6331             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6332                  == 0 );
6333     }
6334     return
6335           aSign ? le128( b.high, b.low, a.high, a.low )
6336         : le128( a.high, a.low, b.high, b.low );
6337 
6338 }
6339 
6340 /*----------------------------------------------------------------------------
6341 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6342 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6343 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6344 | Standard for Binary Floating-Point Arithmetic.
6345 *----------------------------------------------------------------------------*/
6346 
6347 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6348 {
6349     flag aSign, bSign;
6350 
6351     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6352               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6353          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6354               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6355        ) {
6356         if (    float128_is_signaling_nan( a )
6357              || float128_is_signaling_nan( b ) ) {
6358             float_raise( float_flag_invalid STATUS_VAR);
6359         }
6360         return 0;
6361     }
6362     aSign = extractFloat128Sign( a );
6363     bSign = extractFloat128Sign( b );
6364     if ( aSign != bSign ) {
6365         return
6366                aSign
6367             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6368                  != 0 );
6369     }
6370     return
6371           aSign ? lt128( b.high, b.low, a.high, a.low )
6372         : lt128( a.high, a.low, b.high, b.low );
6373 
6374 }
6375 
6376 /*----------------------------------------------------------------------------
6377 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6378 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6379 | comparison is performed according to the IEC/IEEE Standard for Binary
6380 | Floating-Point Arithmetic.
6381 *----------------------------------------------------------------------------*/
6382 
6383 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6384 {
6385     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6386               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6387          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6388               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6389        ) {
6390         if (    float128_is_signaling_nan( a )
6391              || float128_is_signaling_nan( b ) ) {
6392             float_raise( float_flag_invalid STATUS_VAR);
6393         }
6394         return 1;
6395     }
6396     return 0;
6397 }
6398 
6399 /* misc functions */
6400 float32 uint32_to_float32( uint32 a STATUS_PARAM )
6401 {
6402     return int64_to_float32(a STATUS_VAR);
6403 }
6404 
6405 float64 uint32_to_float64( uint32 a STATUS_PARAM )
6406 {
6407     return int64_to_float64(a STATUS_VAR);
6408 }
6409 
6410 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6411 {
6412     int64_t v;
6413     uint32 res;
6414 
6415     v = float32_to_int64(a STATUS_VAR);
6416     if (v < 0) {
6417         res = 0;
6418         float_raise( float_flag_invalid STATUS_VAR);
6419     } else if (v > 0xffffffff) {
6420         res = 0xffffffff;
6421         float_raise( float_flag_invalid STATUS_VAR);
6422     } else {
6423         res = v;
6424     }
6425     return res;
6426 }
6427 
6428 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6429 {
6430     int64_t v;
6431     uint32 res;
6432 
6433     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6434     if (v < 0) {
6435         res = 0;
6436         float_raise( float_flag_invalid STATUS_VAR);
6437     } else if (v > 0xffffffff) {
6438         res = 0xffffffff;
6439         float_raise( float_flag_invalid STATUS_VAR);
6440     } else {
6441         res = v;
6442     }
6443     return res;
6444 }
6445 
6446 uint16 float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6447 {
6448     int64_t v;
6449     uint16 res;
6450 
6451     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6452     if (v < 0) {
6453         res = 0;
6454         float_raise( float_flag_invalid STATUS_VAR);
6455     } else if (v > 0xffff) {
6456         res = 0xffff;
6457         float_raise( float_flag_invalid STATUS_VAR);
6458     } else {
6459         res = v;
6460     }
6461     return res;
6462 }
6463 
6464 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6465 {
6466     int64_t v;
6467     uint32 res;
6468 
6469     v = float64_to_int64(a STATUS_VAR);
6470     if (v < 0) {
6471         res = 0;
6472         float_raise( float_flag_invalid STATUS_VAR);
6473     } else if (v > 0xffffffff) {
6474         res = 0xffffffff;
6475         float_raise( float_flag_invalid STATUS_VAR);
6476     } else {
6477         res = v;
6478     }
6479     return res;
6480 }
6481 
6482 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6483 {
6484     int64_t v;
6485     uint32 res;
6486 
6487     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6488     if (v < 0) {
6489         res = 0;
6490         float_raise( float_flag_invalid STATUS_VAR);
6491     } else if (v > 0xffffffff) {
6492         res = 0xffffffff;
6493         float_raise( float_flag_invalid STATUS_VAR);
6494     } else {
6495         res = v;
6496     }
6497     return res;
6498 }
6499 
6500 uint16 float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6501 {
6502     int64_t v;
6503     uint16 res;
6504 
6505     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6506     if (v < 0) {
6507         res = 0;
6508         float_raise( float_flag_invalid STATUS_VAR);
6509     } else if (v > 0xffff) {
6510         res = 0xffff;
6511         float_raise( float_flag_invalid STATUS_VAR);
6512     } else {
6513         res = v;
6514     }
6515     return res;
6516 }
6517 
6518 /* FIXME: This looks broken.  */
6519 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6520 {
6521     int64_t v;
6522 
6523     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6524     v += float64_val(a);
6525     v = float64_to_int64(make_float64(v) STATUS_VAR);
6526 
6527     return v - INT64_MIN;
6528 }
6529 
6530 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6531 {
6532     int64_t v;
6533 
6534     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6535     v += float64_val(a);
6536     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6537 
6538     return v - INT64_MIN;
6539 }
6540 
6541 #define COMPARE(s, nan_exp)                                                  \
6542 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6543                                       int is_quiet STATUS_PARAM )            \
6544 {                                                                            \
6545     flag aSign, bSign;                                                       \
6546     uint ## s ## _t av, bv;                                                  \
6547     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6548     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6549                                                                              \
6550     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6551          extractFloat ## s ## Frac( a ) ) ||                                 \
6552         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6553           extractFloat ## s ## Frac( b ) )) {                                \
6554         if (!is_quiet ||                                                     \
6555             float ## s ## _is_signaling_nan( a ) ||                          \
6556             float ## s ## _is_signaling_nan( b ) ) {                         \
6557             float_raise( float_flag_invalid STATUS_VAR);                     \
6558         }                                                                    \
6559         return float_relation_unordered;                                     \
6560     }                                                                        \
6561     aSign = extractFloat ## s ## Sign( a );                                  \
6562     bSign = extractFloat ## s ## Sign( b );                                  \
6563     av = float ## s ## _val(a);                                              \
6564     bv = float ## s ## _val(b);                                              \
6565     if ( aSign != bSign ) {                                                  \
6566         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6567             /* zero case */                                                  \
6568             return float_relation_equal;                                     \
6569         } else {                                                             \
6570             return 1 - (2 * aSign);                                          \
6571         }                                                                    \
6572     } else {                                                                 \
6573         if (av == bv) {                                                      \
6574             return float_relation_equal;                                     \
6575         } else {                                                             \
6576             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6577         }                                                                    \
6578     }                                                                        \
6579 }                                                                            \
6580                                                                              \
6581 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6582 {                                                                            \
6583     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6584 }                                                                            \
6585                                                                              \
6586 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6587 {                                                                            \
6588     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6589 }
6590 
6591 COMPARE(32, 0xff)
6592 COMPARE(64, 0x7ff)
6593 
6594 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6595                                       int is_quiet STATUS_PARAM )
6596 {
6597     flag aSign, bSign;
6598 
6599     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6600           ( extractFloatx80Frac( a )<<1 ) ) ||
6601         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6602           ( extractFloatx80Frac( b )<<1 ) )) {
6603         if (!is_quiet ||
6604             floatx80_is_signaling_nan( a ) ||
6605             floatx80_is_signaling_nan( b ) ) {
6606             float_raise( float_flag_invalid STATUS_VAR);
6607         }
6608         return float_relation_unordered;
6609     }
6610     aSign = extractFloatx80Sign( a );
6611     bSign = extractFloatx80Sign( b );
6612     if ( aSign != bSign ) {
6613 
6614         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6615              ( ( a.low | b.low ) == 0 ) ) {
6616             /* zero case */
6617             return float_relation_equal;
6618         } else {
6619             return 1 - (2 * aSign);
6620         }
6621     } else {
6622         if (a.low == b.low && a.high == b.high) {
6623             return float_relation_equal;
6624         } else {
6625             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6626         }
6627     }
6628 }
6629 
6630 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6631 {
6632     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6633 }
6634 
6635 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6636 {
6637     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6638 }
6639 
6640 INLINE int float128_compare_internal( float128 a, float128 b,
6641                                       int is_quiet STATUS_PARAM )
6642 {
6643     flag aSign, bSign;
6644 
6645     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6646           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6647         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6648           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6649         if (!is_quiet ||
6650             float128_is_signaling_nan( a ) ||
6651             float128_is_signaling_nan( b ) ) {
6652             float_raise( float_flag_invalid STATUS_VAR);
6653         }
6654         return float_relation_unordered;
6655     }
6656     aSign = extractFloat128Sign( a );
6657     bSign = extractFloat128Sign( b );
6658     if ( aSign != bSign ) {
6659         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6660             /* zero case */
6661             return float_relation_equal;
6662         } else {
6663             return 1 - (2 * aSign);
6664         }
6665     } else {
6666         if (a.low == b.low && a.high == b.high) {
6667             return float_relation_equal;
6668         } else {
6669             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6670         }
6671     }
6672 }
6673 
6674 int float128_compare( float128 a, float128 b STATUS_PARAM )
6675 {
6676     return float128_compare_internal(a, b, 0 STATUS_VAR);
6677 }
6678 
6679 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6680 {
6681     return float128_compare_internal(a, b, 1 STATUS_VAR);
6682 }
6683 
6684 /* min() and max() functions. These can't be implemented as
6685  * 'compare and pick one input' because that would mishandle
6686  * NaNs and +0 vs -0.
6687  */
6688 #define MINMAX(s, nan_exp)                                              \
6689 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6690                                         int ismin STATUS_PARAM )        \
6691 {                                                                       \
6692     flag aSign, bSign;                                                  \
6693     uint ## s ## _t av, bv;                                             \
6694     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6695     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6696     if (float ## s ## _is_any_nan(a) ||                                 \
6697         float ## s ## _is_any_nan(b)) {                                 \
6698         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6699     }                                                                   \
6700     aSign = extractFloat ## s ## Sign(a);                               \
6701     bSign = extractFloat ## s ## Sign(b);                               \
6702     av = float ## s ## _val(a);                                         \
6703     bv = float ## s ## _val(b);                                         \
6704     if (aSign != bSign) {                                               \
6705         if (ismin) {                                                    \
6706             return aSign ? a : b;                                       \
6707         } else {                                                        \
6708             return aSign ? b : a;                                       \
6709         }                                                               \
6710     } else {                                                            \
6711         if (ismin) {                                                    \
6712             return (aSign ^ (av < bv)) ? a : b;                         \
6713         } else {                                                        \
6714             return (aSign ^ (av < bv)) ? b : a;                         \
6715         }                                                               \
6716     }                                                                   \
6717 }                                                                       \
6718                                                                         \
6719 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6720 {                                                                       \
6721     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6722 }                                                                       \
6723                                                                         \
6724 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6725 {                                                                       \
6726     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6727 }
6728 
6729 MINMAX(32, 0xff)
6730 MINMAX(64, 0x7ff)
6731 
6732 
6733 /* Multiply A by 2 raised to the power N.  */
6734 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6735 {
6736     flag aSign;
6737     int16_t aExp;
6738     uint32_t aSig;
6739 
6740     a = float32_squash_input_denormal(a STATUS_VAR);
6741     aSig = extractFloat32Frac( a );
6742     aExp = extractFloat32Exp( a );
6743     aSign = extractFloat32Sign( a );
6744 
6745     if ( aExp == 0xFF ) {
6746         if ( aSig ) {
6747             return propagateFloat32NaN( a, a STATUS_VAR );
6748         }
6749         return a;
6750     }
6751     if ( aExp != 0 )
6752         aSig |= 0x00800000;
6753     else if ( aSig == 0 )
6754         return a;
6755 
6756     if (n > 0x200) {
6757         n = 0x200;
6758     } else if (n < -0x200) {
6759         n = -0x200;
6760     }
6761 
6762     aExp += n - 1;
6763     aSig <<= 7;
6764     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6765 }
6766 
6767 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6768 {
6769     flag aSign;
6770     int16_t aExp;
6771     uint64_t aSig;
6772 
6773     a = float64_squash_input_denormal(a STATUS_VAR);
6774     aSig = extractFloat64Frac( a );
6775     aExp = extractFloat64Exp( a );
6776     aSign = extractFloat64Sign( a );
6777 
6778     if ( aExp == 0x7FF ) {
6779         if ( aSig ) {
6780             return propagateFloat64NaN( a, a STATUS_VAR );
6781         }
6782         return a;
6783     }
6784     if ( aExp != 0 )
6785         aSig |= LIT64( 0x0010000000000000 );
6786     else if ( aSig == 0 )
6787         return a;
6788 
6789     if (n > 0x1000) {
6790         n = 0x1000;
6791     } else if (n < -0x1000) {
6792         n = -0x1000;
6793     }
6794 
6795     aExp += n - 1;
6796     aSig <<= 10;
6797     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6798 }
6799 
6800 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6801 {
6802     flag aSign;
6803     int32_t aExp;
6804     uint64_t aSig;
6805 
6806     aSig = extractFloatx80Frac( a );
6807     aExp = extractFloatx80Exp( a );
6808     aSign = extractFloatx80Sign( a );
6809 
6810     if ( aExp == 0x7FFF ) {
6811         if ( aSig<<1 ) {
6812             return propagateFloatx80NaN( a, a STATUS_VAR );
6813         }
6814         return a;
6815     }
6816 
6817     if (aExp == 0 && aSig == 0)
6818         return a;
6819 
6820     if (n > 0x10000) {
6821         n = 0x10000;
6822     } else if (n < -0x10000) {
6823         n = -0x10000;
6824     }
6825 
6826     aExp += n;
6827     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6828                                           aSign, aExp, aSig, 0 STATUS_VAR );
6829 }
6830 
6831 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6832 {
6833     flag aSign;
6834     int32_t aExp;
6835     uint64_t aSig0, aSig1;
6836 
6837     aSig1 = extractFloat128Frac1( a );
6838     aSig0 = extractFloat128Frac0( a );
6839     aExp = extractFloat128Exp( a );
6840     aSign = extractFloat128Sign( a );
6841     if ( aExp == 0x7FFF ) {
6842         if ( aSig0 | aSig1 ) {
6843             return propagateFloat128NaN( a, a STATUS_VAR );
6844         }
6845         return a;
6846     }
6847     if ( aExp != 0 )
6848         aSig0 |= LIT64( 0x0001000000000000 );
6849     else if ( aSig0 == 0 && aSig1 == 0 )
6850         return a;
6851 
6852     if (n > 0x10000) {
6853         n = 0x10000;
6854     } else if (n < -0x10000) {
6855         n = -0x10000;
6856     }
6857 
6858     aExp += n - 1;
6859     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6860                                           STATUS_VAR );
6861 
6862 }
6863