xref: /openbmc/qemu/fpu/softfloat.c (revision d54f10bba7635b5ad8b750afd2bb2f0f8eb68b45)
1 /*
2  * QEMU float support
3  *
4  * Derived from SoftFloat.
5  */
6 
7 /*============================================================================
8 
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11 
12 Written by John R. Hauser.  This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704.  Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980.  The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21 
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30 
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35 
36 =============================================================================*/
37 
38 #include "softfloat.h"
39 
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations.  (Can be specialized to target if
43 | desired.)
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
46 
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine:  (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output.  These details are target-
53 | specific.
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
56 
57 void set_float_rounding_mode(int val STATUS_PARAM)
58 {
59     STATUS(float_rounding_mode) = val;
60 }
61 
62 void set_float_exception_flags(int val STATUS_PARAM)
63 {
64     STATUS(float_exception_flags) = val;
65 }
66 
67 #ifdef FLOATX80
68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
69 {
70     STATUS(floatx80_rounding_precision) = val;
71 }
72 #endif
73 
74 /*----------------------------------------------------------------------------
75 | Returns the fraction bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77 
78 INLINE uint32_t extractFloat16Frac(float16 a)
79 {
80     return float16_val(a) & 0x3ff;
81 }
82 
83 /*----------------------------------------------------------------------------
84 | Returns the exponent bits of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86 
87 INLINE int16 extractFloat16Exp(float16 a)
88 {
89     return (float16_val(a) >> 10) & 0x1f;
90 }
91 
92 /*----------------------------------------------------------------------------
93 | Returns the sign bit of the single-precision floating-point value `a'.
94 *----------------------------------------------------------------------------*/
95 
96 INLINE flag extractFloat16Sign(float16 a)
97 {
98     return float16_val(a)>>15;
99 }
100 
101 /*----------------------------------------------------------------------------
102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
104 | input.  If `zSign' is 1, the input is negated before being converted to an
105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
106 | is simply rounded to an integer, with the inexact exception raised if the
107 | input cannot be represented exactly as an integer.  However, if the fixed-
108 | point input is too large, the invalid exception is raised and the largest
109 | positive or negative integer is returned.
110 *----------------------------------------------------------------------------*/
111 
112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
113 {
114     int8 roundingMode;
115     flag roundNearestEven;
116     int8 roundIncrement, roundBits;
117     int32 z;
118 
119     roundingMode = STATUS(float_rounding_mode);
120     roundNearestEven = ( roundingMode == float_round_nearest_even );
121     roundIncrement = 0x40;
122     if ( ! roundNearestEven ) {
123         if ( roundingMode == float_round_to_zero ) {
124             roundIncrement = 0;
125         }
126         else {
127             roundIncrement = 0x7F;
128             if ( zSign ) {
129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
130             }
131             else {
132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
133             }
134         }
135     }
136     roundBits = absZ & 0x7F;
137     absZ = ( absZ + roundIncrement )>>7;
138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139     z = absZ;
140     if ( zSign ) z = - z;
141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142         float_raise( float_flag_invalid STATUS_VAR);
143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
144     }
145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146     return z;
147 
148 }
149 
150 /*----------------------------------------------------------------------------
151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
153 | and returns the properly rounded 64-bit integer corresponding to the input.
154 | If `zSign' is 1, the input is negated before being converted to an integer.
155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
156 | the inexact exception raised if the input cannot be represented exactly as
157 | an integer.  However, if the fixed-point input is too large, the invalid
158 | exception is raised and the largest positive or negative integer is
159 | returned.
160 *----------------------------------------------------------------------------*/
161 
162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
163 {
164     int8 roundingMode;
165     flag roundNearestEven, increment;
166     int64 z;
167 
168     roundingMode = STATUS(float_rounding_mode);
169     roundNearestEven = ( roundingMode == float_round_nearest_even );
170     increment = ( (int64_t) absZ1 < 0 );
171     if ( ! roundNearestEven ) {
172         if ( roundingMode == float_round_to_zero ) {
173             increment = 0;
174         }
175         else {
176             if ( zSign ) {
177                 increment = ( roundingMode == float_round_down ) && absZ1;
178             }
179             else {
180                 increment = ( roundingMode == float_round_up ) && absZ1;
181             }
182         }
183     }
184     if ( increment ) {
185         ++absZ0;
186         if ( absZ0 == 0 ) goto overflow;
187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188     }
189     z = absZ0;
190     if ( zSign ) z = - z;
191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
192  overflow:
193         float_raise( float_flag_invalid STATUS_VAR);
194         return
195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
196             : LIT64( 0x7FFFFFFFFFFFFFFF );
197     }
198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199     return z;
200 
201 }
202 
203 /*----------------------------------------------------------------------------
204 | Returns the fraction bits of the single-precision floating-point value `a'.
205 *----------------------------------------------------------------------------*/
206 
207 INLINE uint32_t extractFloat32Frac( float32 a )
208 {
209 
210     return float32_val(a) & 0x007FFFFF;
211 
212 }
213 
214 /*----------------------------------------------------------------------------
215 | Returns the exponent bits of the single-precision floating-point value `a'.
216 *----------------------------------------------------------------------------*/
217 
218 INLINE int16 extractFloat32Exp( float32 a )
219 {
220 
221     return ( float32_val(a)>>23 ) & 0xFF;
222 
223 }
224 
225 /*----------------------------------------------------------------------------
226 | Returns the sign bit of the single-precision floating-point value `a'.
227 *----------------------------------------------------------------------------*/
228 
229 INLINE flag extractFloat32Sign( float32 a )
230 {
231 
232     return float32_val(a)>>31;
233 
234 }
235 
236 /*----------------------------------------------------------------------------
237 | If `a' is denormal and we are in flush-to-zero mode then set the
238 | input-denormal exception and return zero. Otherwise just return the value.
239 *----------------------------------------------------------------------------*/
240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241 {
242     if (STATUS(flush_inputs_to_zero)) {
243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244             float_raise(float_flag_input_denormal STATUS_VAR);
245             return make_float32(float32_val(a) & 0x80000000);
246         }
247     }
248     return a;
249 }
250 
251 /*----------------------------------------------------------------------------
252 | Normalizes the subnormal single-precision floating-point value represented
253 | by the denormalized significand `aSig'.  The normalized exponent and
254 | significand are stored at the locations pointed to by `zExpPtr' and
255 | `zSigPtr', respectively.
256 *----------------------------------------------------------------------------*/
257 
258 static void
259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
260 {
261     int8 shiftCount;
262 
263     shiftCount = countLeadingZeros32( aSig ) - 8;
264     *zSigPtr = aSig<<shiftCount;
265     *zExpPtr = 1 - shiftCount;
266 
267 }
268 
269 /*----------------------------------------------------------------------------
270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 | single-precision floating-point value, returning the result.  After being
272 | shifted into the proper positions, the three fields are simply added
273 | together to form the result.  This means that any integer portion of `zSig'
274 | will be added into the exponent.  Since a properly normalized significand
275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
276 | than the desired result exponent whenever `zSig' is a complete, normalized
277 | significand.
278 *----------------------------------------------------------------------------*/
279 
280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281 {
282 
283     return make_float32(
284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
285 
286 }
287 
288 /*----------------------------------------------------------------------------
289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 | and significand `zSig', and returns the proper single-precision floating-
291 | point value corresponding to the abstract input.  Ordinarily, the abstract
292 | value is simply rounded and packed into the single-precision format, with
293 | the inexact exception raised if the abstract input cannot be represented
294 | exactly.  However, if the abstract value is too large, the overflow and
295 | inexact exceptions are raised and an infinity or maximal finite value is
296 | returned.  If the abstract value is too small, the input value is rounded to
297 | a subnormal number, and the underflow and inexact exceptions are raised if
298 | the abstract input cannot be represented exactly as a subnormal single-
299 | precision floating-point number.
300 |     The input significand `zSig' has its binary point between bits 30
301 | and 29, which is 7 bits to the left of the usual location.  This shifted
302 | significand must be normalized or smaller.  If `zSig' is not normalized,
303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
304 | and it must not require rounding.  In the usual case that `zSig' is
305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
307 | Binary Floating-Point Arithmetic.
308 *----------------------------------------------------------------------------*/
309 
310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
311 {
312     int8 roundingMode;
313     flag roundNearestEven;
314     int8 roundIncrement, roundBits;
315     flag isTiny;
316 
317     roundingMode = STATUS(float_rounding_mode);
318     roundNearestEven = ( roundingMode == float_round_nearest_even );
319     roundIncrement = 0x40;
320     if ( ! roundNearestEven ) {
321         if ( roundingMode == float_round_to_zero ) {
322             roundIncrement = 0;
323         }
324         else {
325             roundIncrement = 0x7F;
326             if ( zSign ) {
327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
328             }
329             else {
330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
331             }
332         }
333     }
334     roundBits = zSig & 0x7F;
335     if ( 0xFD <= (uint16_t) zExp ) {
336         if (    ( 0xFD < zExp )
337              || (    ( zExp == 0xFD )
338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
339            ) {
340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
342         }
343         if ( zExp < 0 ) {
344             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
345             isTiny =
346                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347                 || ( zExp < -1 )
348                 || ( zSig + roundIncrement < 0x80000000 );
349             shift32RightJamming( zSig, - zExp, &zSig );
350             zExp = 0;
351             roundBits = zSig & 0x7F;
352             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
353         }
354     }
355     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356     zSig = ( zSig + roundIncrement )>>7;
357     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358     if ( zSig == 0 ) zExp = 0;
359     return packFloat32( zSign, zExp, zSig );
360 
361 }
362 
363 /*----------------------------------------------------------------------------
364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365 | and significand `zSig', and returns the proper single-precision floating-
366 | point value corresponding to the abstract input.  This routine is just like
367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369 | floating-point exponent.
370 *----------------------------------------------------------------------------*/
371 
372 static float32
373  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
374 {
375     int8 shiftCount;
376 
377     shiftCount = countLeadingZeros32( zSig ) - 1;
378     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
379 
380 }
381 
382 /*----------------------------------------------------------------------------
383 | Returns the fraction bits of the double-precision floating-point value `a'.
384 *----------------------------------------------------------------------------*/
385 
386 INLINE uint64_t extractFloat64Frac( float64 a )
387 {
388 
389     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
390 
391 }
392 
393 /*----------------------------------------------------------------------------
394 | Returns the exponent bits of the double-precision floating-point value `a'.
395 *----------------------------------------------------------------------------*/
396 
397 INLINE int16 extractFloat64Exp( float64 a )
398 {
399 
400     return ( float64_val(a)>>52 ) & 0x7FF;
401 
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the sign bit of the double-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 INLINE flag extractFloat64Sign( float64 a )
409 {
410 
411     return float64_val(a)>>63;
412 
413 }
414 
415 /*----------------------------------------------------------------------------
416 | If `a' is denormal and we are in flush-to-zero mode then set the
417 | input-denormal exception and return zero. Otherwise just return the value.
418 *----------------------------------------------------------------------------*/
419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
420 {
421     if (STATUS(flush_inputs_to_zero)) {
422         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423             float_raise(float_flag_input_denormal STATUS_VAR);
424             return make_float64(float64_val(a) & (1ULL << 63));
425         }
426     }
427     return a;
428 }
429 
430 /*----------------------------------------------------------------------------
431 | Normalizes the subnormal double-precision floating-point value represented
432 | by the denormalized significand `aSig'.  The normalized exponent and
433 | significand are stored at the locations pointed to by `zExpPtr' and
434 | `zSigPtr', respectively.
435 *----------------------------------------------------------------------------*/
436 
437 static void
438  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
439 {
440     int8 shiftCount;
441 
442     shiftCount = countLeadingZeros64( aSig ) - 11;
443     *zSigPtr = aSig<<shiftCount;
444     *zExpPtr = 1 - shiftCount;
445 
446 }
447 
448 /*----------------------------------------------------------------------------
449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450 | double-precision floating-point value, returning the result.  After being
451 | shifted into the proper positions, the three fields are simply added
452 | together to form the result.  This means that any integer portion of `zSig'
453 | will be added into the exponent.  Since a properly normalized significand
454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
455 | than the desired result exponent whenever `zSig' is a complete, normalized
456 | significand.
457 *----------------------------------------------------------------------------*/
458 
459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
460 {
461 
462     return make_float64(
463         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
464 
465 }
466 
467 /*----------------------------------------------------------------------------
468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469 | and significand `zSig', and returns the proper double-precision floating-
470 | point value corresponding to the abstract input.  Ordinarily, the abstract
471 | value is simply rounded and packed into the double-precision format, with
472 | the inexact exception raised if the abstract input cannot be represented
473 | exactly.  However, if the abstract value is too large, the overflow and
474 | inexact exceptions are raised and an infinity or maximal finite value is
475 | returned.  If the abstract value is too small, the input value is rounded
476 | to a subnormal number, and the underflow and inexact exceptions are raised
477 | if the abstract input cannot be represented exactly as a subnormal double-
478 | precision floating-point number.
479 |     The input significand `zSig' has its binary point between bits 62
480 | and 61, which is 10 bits to the left of the usual location.  This shifted
481 | significand must be normalized or smaller.  If `zSig' is not normalized,
482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
483 | and it must not require rounding.  In the usual case that `zSig' is
484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
486 | Binary Floating-Point Arithmetic.
487 *----------------------------------------------------------------------------*/
488 
489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
490 {
491     int8 roundingMode;
492     flag roundNearestEven;
493     int16 roundIncrement, roundBits;
494     flag isTiny;
495 
496     roundingMode = STATUS(float_rounding_mode);
497     roundNearestEven = ( roundingMode == float_round_nearest_even );
498     roundIncrement = 0x200;
499     if ( ! roundNearestEven ) {
500         if ( roundingMode == float_round_to_zero ) {
501             roundIncrement = 0;
502         }
503         else {
504             roundIncrement = 0x3FF;
505             if ( zSign ) {
506                 if ( roundingMode == float_round_up ) roundIncrement = 0;
507             }
508             else {
509                 if ( roundingMode == float_round_down ) roundIncrement = 0;
510             }
511         }
512     }
513     roundBits = zSig & 0x3FF;
514     if ( 0x7FD <= (uint16_t) zExp ) {
515         if (    ( 0x7FD < zExp )
516              || (    ( zExp == 0x7FD )
517                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
518            ) {
519             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
520             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
521         }
522         if ( zExp < 0 ) {
523             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
524             isTiny =
525                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526                 || ( zExp < -1 )
527                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528             shift64RightJamming( zSig, - zExp, &zSig );
529             zExp = 0;
530             roundBits = zSig & 0x3FF;
531             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
532         }
533     }
534     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535     zSig = ( zSig + roundIncrement )>>10;
536     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537     if ( zSig == 0 ) zExp = 0;
538     return packFloat64( zSign, zExp, zSig );
539 
540 }
541 
542 /*----------------------------------------------------------------------------
543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544 | and significand `zSig', and returns the proper double-precision floating-
545 | point value corresponding to the abstract input.  This routine is just like
546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548 | floating-point exponent.
549 *----------------------------------------------------------------------------*/
550 
551 static float64
552  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
553 {
554     int8 shiftCount;
555 
556     shiftCount = countLeadingZeros64( zSig ) - 1;
557     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
558 
559 }
560 
561 #ifdef FLOATX80
562 
563 /*----------------------------------------------------------------------------
564 | Returns the fraction bits of the extended double-precision floating-point
565 | value `a'.
566 *----------------------------------------------------------------------------*/
567 
568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
569 {
570 
571     return a.low;
572 
573 }
574 
575 /*----------------------------------------------------------------------------
576 | Returns the exponent bits of the extended double-precision floating-point
577 | value `a'.
578 *----------------------------------------------------------------------------*/
579 
580 INLINE int32 extractFloatx80Exp( floatx80 a )
581 {
582 
583     return a.high & 0x7FFF;
584 
585 }
586 
587 /*----------------------------------------------------------------------------
588 | Returns the sign bit of the extended double-precision floating-point value
589 | `a'.
590 *----------------------------------------------------------------------------*/
591 
592 INLINE flag extractFloatx80Sign( floatx80 a )
593 {
594 
595     return a.high>>15;
596 
597 }
598 
599 /*----------------------------------------------------------------------------
600 | Normalizes the subnormal extended double-precision floating-point value
601 | represented by the denormalized significand `aSig'.  The normalized exponent
602 | and significand are stored at the locations pointed to by `zExpPtr' and
603 | `zSigPtr', respectively.
604 *----------------------------------------------------------------------------*/
605 
606 static void
607  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
608 {
609     int8 shiftCount;
610 
611     shiftCount = countLeadingZeros64( aSig );
612     *zSigPtr = aSig<<shiftCount;
613     *zExpPtr = 1 - shiftCount;
614 
615 }
616 
617 /*----------------------------------------------------------------------------
618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 | extended double-precision floating-point value, returning the result.
620 *----------------------------------------------------------------------------*/
621 
622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
623 {
624     floatx80 z;
625 
626     z.low = zSig;
627     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
628     return z;
629 
630 }
631 
632 /*----------------------------------------------------------------------------
633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
635 | and returns the proper extended double-precision floating-point value
636 | corresponding to the abstract input.  Ordinarily, the abstract value is
637 | rounded and packed into the extended double-precision format, with the
638 | inexact exception raised if the abstract input cannot be represented
639 | exactly.  However, if the abstract value is too large, the overflow and
640 | inexact exceptions are raised and an infinity or maximal finite value is
641 | returned.  If the abstract value is too small, the input value is rounded to
642 | a subnormal number, and the underflow and inexact exceptions are raised if
643 | the abstract input cannot be represented exactly as a subnormal extended
644 | double-precision floating-point number.
645 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
646 | number of bits as single or double precision, respectively.  Otherwise, the
647 | result is rounded to the full precision of the extended double-precision
648 | format.
649 |     The input significand must be normalized or smaller.  If the input
650 | significand is not normalized, `zExp' must be 0; in that case, the result
651 | returned is a subnormal number, and it must not require rounding.  The
652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653 | Floating-Point Arithmetic.
654 *----------------------------------------------------------------------------*/
655 
656 static floatx80
657  roundAndPackFloatx80(
658      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
659  STATUS_PARAM)
660 {
661     int8 roundingMode;
662     flag roundNearestEven, increment, isTiny;
663     int64 roundIncrement, roundMask, roundBits;
664 
665     roundingMode = STATUS(float_rounding_mode);
666     roundNearestEven = ( roundingMode == float_round_nearest_even );
667     if ( roundingPrecision == 80 ) goto precision80;
668     if ( roundingPrecision == 64 ) {
669         roundIncrement = LIT64( 0x0000000000000400 );
670         roundMask = LIT64( 0x00000000000007FF );
671     }
672     else if ( roundingPrecision == 32 ) {
673         roundIncrement = LIT64( 0x0000008000000000 );
674         roundMask = LIT64( 0x000000FFFFFFFFFF );
675     }
676     else {
677         goto precision80;
678     }
679     zSig0 |= ( zSig1 != 0 );
680     if ( ! roundNearestEven ) {
681         if ( roundingMode == float_round_to_zero ) {
682             roundIncrement = 0;
683         }
684         else {
685             roundIncrement = roundMask;
686             if ( zSign ) {
687                 if ( roundingMode == float_round_up ) roundIncrement = 0;
688             }
689             else {
690                 if ( roundingMode == float_round_down ) roundIncrement = 0;
691             }
692         }
693     }
694     roundBits = zSig0 & roundMask;
695     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
696         if (    ( 0x7FFE < zExp )
697              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
698            ) {
699             goto overflow;
700         }
701         if ( zExp <= 0 ) {
702             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
703             isTiny =
704                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705                 || ( zExp < 0 )
706                 || ( zSig0 <= zSig0 + roundIncrement );
707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708             zExp = 0;
709             roundBits = zSig0 & roundMask;
710             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712             zSig0 += roundIncrement;
713             if ( (int64_t) zSig0 < 0 ) zExp = 1;
714             roundIncrement = roundMask + 1;
715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716                 roundMask |= roundIncrement;
717             }
718             zSig0 &= ~ roundMask;
719             return packFloatx80( zSign, zExp, zSig0 );
720         }
721     }
722     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723     zSig0 += roundIncrement;
724     if ( zSig0 < roundIncrement ) {
725         ++zExp;
726         zSig0 = LIT64( 0x8000000000000000 );
727     }
728     roundIncrement = roundMask + 1;
729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730         roundMask |= roundIncrement;
731     }
732     zSig0 &= ~ roundMask;
733     if ( zSig0 == 0 ) zExp = 0;
734     return packFloatx80( zSign, zExp, zSig0 );
735  precision80:
736     increment = ( (int64_t) zSig1 < 0 );
737     if ( ! roundNearestEven ) {
738         if ( roundingMode == float_round_to_zero ) {
739             increment = 0;
740         }
741         else {
742             if ( zSign ) {
743                 increment = ( roundingMode == float_round_down ) && zSig1;
744             }
745             else {
746                 increment = ( roundingMode == float_round_up ) && zSig1;
747             }
748         }
749     }
750     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
751         if (    ( 0x7FFE < zExp )
752              || (    ( zExp == 0x7FFE )
753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754                   && increment
755                 )
756            ) {
757             roundMask = 0;
758  overflow:
759             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760             if (    ( roundingMode == float_round_to_zero )
761                  || ( zSign && ( roundingMode == float_round_up ) )
762                  || ( ! zSign && ( roundingMode == float_round_down ) )
763                ) {
764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765             }
766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767         }
768         if ( zExp <= 0 ) {
769             isTiny =
770                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771                 || ( zExp < 0 )
772                 || ! increment
773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775             zExp = 0;
776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778             if ( roundNearestEven ) {
779                 increment = ( (int64_t) zSig1 < 0 );
780             }
781             else {
782                 if ( zSign ) {
783                     increment = ( roundingMode == float_round_down ) && zSig1;
784                 }
785                 else {
786                     increment = ( roundingMode == float_round_up ) && zSig1;
787                 }
788             }
789             if ( increment ) {
790                 ++zSig0;
791                 zSig0 &=
792                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
794             }
795             return packFloatx80( zSign, zExp, zSig0 );
796         }
797     }
798     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799     if ( increment ) {
800         ++zSig0;
801         if ( zSig0 == 0 ) {
802             ++zExp;
803             zSig0 = LIT64( 0x8000000000000000 );
804         }
805         else {
806             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
807         }
808     }
809     else {
810         if ( zSig0 == 0 ) zExp = 0;
811     }
812     return packFloatx80( zSign, zExp, zSig0 );
813 
814 }
815 
816 /*----------------------------------------------------------------------------
817 | Takes an abstract floating-point value having sign `zSign', exponent
818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819 | and returns the proper extended double-precision floating-point value
820 | corresponding to the abstract input.  This routine is just like
821 | `roundAndPackFloatx80' except that the input significand does not have to be
822 | normalized.
823 *----------------------------------------------------------------------------*/
824 
825 static floatx80
826  normalizeRoundAndPackFloatx80(
827      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
828  STATUS_PARAM)
829 {
830     int8 shiftCount;
831 
832     if ( zSig0 == 0 ) {
833         zSig0 = zSig1;
834         zSig1 = 0;
835         zExp -= 64;
836     }
837     shiftCount = countLeadingZeros64( zSig0 );
838     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839     zExp -= shiftCount;
840     return
841         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
842 
843 }
844 
845 #endif
846 
847 #ifdef FLOAT128
848 
849 /*----------------------------------------------------------------------------
850 | Returns the least-significant 64 fraction bits of the quadruple-precision
851 | floating-point value `a'.
852 *----------------------------------------------------------------------------*/
853 
854 INLINE uint64_t extractFloat128Frac1( float128 a )
855 {
856 
857     return a.low;
858 
859 }
860 
861 /*----------------------------------------------------------------------------
862 | Returns the most-significant 48 fraction bits of the quadruple-precision
863 | floating-point value `a'.
864 *----------------------------------------------------------------------------*/
865 
866 INLINE uint64_t extractFloat128Frac0( float128 a )
867 {
868 
869     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
870 
871 }
872 
873 /*----------------------------------------------------------------------------
874 | Returns the exponent bits of the quadruple-precision floating-point value
875 | `a'.
876 *----------------------------------------------------------------------------*/
877 
878 INLINE int32 extractFloat128Exp( float128 a )
879 {
880 
881     return ( a.high>>48 ) & 0x7FFF;
882 
883 }
884 
885 /*----------------------------------------------------------------------------
886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
887 *----------------------------------------------------------------------------*/
888 
889 INLINE flag extractFloat128Sign( float128 a )
890 {
891 
892     return a.high>>63;
893 
894 }
895 
896 /*----------------------------------------------------------------------------
897 | Normalizes the subnormal quadruple-precision floating-point value
898 | represented by the denormalized significand formed by the concatenation of
899 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
900 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
901 | significand are stored at the location pointed to by `zSig0Ptr', and the
902 | least significant 64 bits of the normalized significand are stored at the
903 | location pointed to by `zSig1Ptr'.
904 *----------------------------------------------------------------------------*/
905 
906 static void
907  normalizeFloat128Subnormal(
908      uint64_t aSig0,
909      uint64_t aSig1,
910      int32 *zExpPtr,
911      uint64_t *zSig0Ptr,
912      uint64_t *zSig1Ptr
913  )
914 {
915     int8 shiftCount;
916 
917     if ( aSig0 == 0 ) {
918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
919         if ( shiftCount < 0 ) {
920             *zSig0Ptr = aSig1>>( - shiftCount );
921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
922         }
923         else {
924             *zSig0Ptr = aSig1<<shiftCount;
925             *zSig1Ptr = 0;
926         }
927         *zExpPtr = - shiftCount - 63;
928     }
929     else {
930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932         *zExpPtr = 1 - shiftCount;
933     }
934 
935 }
936 
937 /*----------------------------------------------------------------------------
938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940 | floating-point value, returning the result.  After being shifted into the
941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942 | added together to form the most significant 32 bits of the result.  This
943 | means that any integer portion of `zSig0' will be added into the exponent.
944 | Since a properly normalized significand will have an integer portion equal
945 | to 1, the `zExp' input should be 1 less than the desired result exponent
946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947 | significand.
948 *----------------------------------------------------------------------------*/
949 
950 INLINE float128
951  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
952 {
953     float128 z;
954 
955     z.low = zSig1;
956     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
957     return z;
958 
959 }
960 
961 /*----------------------------------------------------------------------------
962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
964 | and `zSig2', and returns the proper quadruple-precision floating-point value
965 | corresponding to the abstract input.  Ordinarily, the abstract value is
966 | simply rounded and packed into the quadruple-precision format, with the
967 | inexact exception raised if the abstract input cannot be represented
968 | exactly.  However, if the abstract value is too large, the overflow and
969 | inexact exceptions are raised and an infinity or maximal finite value is
970 | returned.  If the abstract value is too small, the input value is rounded to
971 | a subnormal number, and the underflow and inexact exceptions are raised if
972 | the abstract input cannot be represented exactly as a subnormal quadruple-
973 | precision floating-point number.
974 |     The input significand must be normalized or smaller.  If the input
975 | significand is not normalized, `zExp' must be 0; in that case, the result
976 | returned is a subnormal number, and it must not require rounding.  In the
977 | usual case that the input significand is normalized, `zExp' must be 1 less
978 | than the ``true'' floating-point exponent.  The handling of underflow and
979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 *----------------------------------------------------------------------------*/
981 
982 static float128
983  roundAndPackFloat128(
984      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
985 {
986     int8 roundingMode;
987     flag roundNearestEven, increment, isTiny;
988 
989     roundingMode = STATUS(float_rounding_mode);
990     roundNearestEven = ( roundingMode == float_round_nearest_even );
991     increment = ( (int64_t) zSig2 < 0 );
992     if ( ! roundNearestEven ) {
993         if ( roundingMode == float_round_to_zero ) {
994             increment = 0;
995         }
996         else {
997             if ( zSign ) {
998                 increment = ( roundingMode == float_round_down ) && zSig2;
999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (uint32_t) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034             isTiny =
1035                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036                 || ( zExp < -1 )
1037                 || ! increment
1038                 || lt128(
1039                        zSig0,
1040                        zSig1,
1041                        LIT64( 0x0001FFFFFFFFFFFF ),
1042                        LIT64( 0xFFFFFFFFFFFFFFFF )
1043                    );
1044             shift128ExtraRightJamming(
1045                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046             zExp = 0;
1047             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048             if ( roundNearestEven ) {
1049                 increment = ( (int64_t) zSig2 < 0 );
1050             }
1051             else {
1052                 if ( zSign ) {
1053                     increment = ( roundingMode == float_round_down ) && zSig2;
1054                 }
1055                 else {
1056                     increment = ( roundingMode == float_round_up ) && zSig2;
1057                 }
1058             }
1059         }
1060     }
1061     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062     if ( increment ) {
1063         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065     }
1066     else {
1067         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068     }
1069     return packFloat128( zSign, zExp, zSig0, zSig1 );
1070 
1071 }
1072 
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1082 
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086 {
1087     int8 shiftCount;
1088     uint64_t zSig2;
1089 
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106 
1107 }
1108 
1109 #endif
1110 
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116 
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120 
1121     if ( a == 0 ) return float32_zero;
1122     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123     zSign = ( a < 0 );
1124     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125 
1126 }
1127 
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format.  The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1133 
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1135 {
1136     flag zSign;
1137     uint32 absA;
1138     int8 shiftCount;
1139     uint64_t zSig;
1140 
1141     if ( a == 0 ) return float64_zero;
1142     zSign = ( a < 0 );
1143     absA = zSign ? - a : a;
1144     shiftCount = countLeadingZeros32( absA ) + 21;
1145     zSig = absA;
1146     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147 
1148 }
1149 
1150 #ifdef FLOATX80
1151 
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format.  The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158 
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     uint64_t zSig;
1165 
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172 
1173 }
1174 
1175 #endif
1176 
1177 #ifdef FLOAT128
1178 
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format.  The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184 
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1186 {
1187     flag zSign;
1188     uint32 absA;
1189     int8 shiftCount;
1190     uint64_t zSig0;
1191 
1192     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193     zSign = ( a < 0 );
1194     absA = zSign ? - a : a;
1195     shiftCount = countLeadingZeros32( absA ) + 17;
1196     zSig0 = absA;
1197     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198 
1199 }
1200 
1201 #endif
1202 
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format.  The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1208 
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 {
1211     flag zSign;
1212     uint64 absA;
1213     int8 shiftCount;
1214 
1215     if ( a == 0 ) return float32_zero;
1216     zSign = ( a < 0 );
1217     absA = zSign ? - a : a;
1218     shiftCount = countLeadingZeros64( absA ) - 40;
1219     if ( 0 <= shiftCount ) {
1220         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221     }
1222     else {
1223         shiftCount += 7;
1224         if ( shiftCount < 0 ) {
1225             shift64RightJamming( absA, - shiftCount, &absA );
1226         }
1227         else {
1228             absA <<= shiftCount;
1229         }
1230         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231     }
1232 
1233 }
1234 
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 {
1237     int8 shiftCount;
1238 
1239     if ( a == 0 ) return float32_zero;
1240     shiftCount = countLeadingZeros64( a ) - 40;
1241     if ( 0 <= shiftCount ) {
1242         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243     }
1244     else {
1245         shiftCount += 7;
1246         if ( shiftCount < 0 ) {
1247             shift64RightJamming( a, - shiftCount, &a );
1248         }
1249         else {
1250             a <<= shiftCount;
1251         }
1252         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253     }
1254 }
1255 
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format.  The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261 
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 {
1264     flag zSign;
1265 
1266     if ( a == 0 ) return float64_zero;
1267     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268         return packFloat64( 1, 0x43E, 0 );
1269     }
1270     zSign = ( a < 0 );
1271     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272 
1273 }
1274 
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 {
1277     if ( a == 0 ) return float64_zero;
1278     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279 
1280 }
1281 
1282 #ifdef FLOATX80
1283 
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format.  The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1290 
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292 {
1293     flag zSign;
1294     uint64 absA;
1295     int8 shiftCount;
1296 
1297     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298     zSign = ( a < 0 );
1299     absA = zSign ? - a : a;
1300     shiftCount = countLeadingZeros64( absA );
1301     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302 
1303 }
1304 
1305 #endif
1306 
1307 #ifdef FLOAT128
1308 
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314 
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322 
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339 
1340 }
1341 
1342 #endif
1343 
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format.  The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode.  If `a' is a NaN, the largest
1350 | positive integer is returned.  Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1353 
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1355 {
1356     flag aSign;
1357     int16 aExp, shiftCount;
1358     uint32_t aSig;
1359     uint64_t aSig64;
1360 
1361     a = float32_squash_input_denormal(a STATUS_VAR);
1362     aSig = extractFloat32Frac( a );
1363     aExp = extractFloat32Exp( a );
1364     aSign = extractFloat32Sign( a );
1365     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366     if ( aExp ) aSig |= 0x00800000;
1367     shiftCount = 0xAF - aExp;
1368     aSig64 = aSig;
1369     aSig64 <<= 32;
1370     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372 
1373 }
1374 
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format.  The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1384 
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386 {
1387     flag aSign;
1388     int16 aExp, shiftCount;
1389     uint32_t aSig;
1390     int32 z;
1391     a = float32_squash_input_denormal(a STATUS_VAR);
1392 
1393     aSig = extractFloat32Frac( a );
1394     aExp = extractFloat32Exp( a );
1395     aSign = extractFloat32Sign( a );
1396     shiftCount = aExp - 0x9E;
1397     if ( 0 <= shiftCount ) {
1398         if ( float32_val(a) != 0xCF000000 ) {
1399             float_raise( float_flag_invalid STATUS_VAR);
1400             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401         }
1402         return (int32_t) 0x80000000;
1403     }
1404     else if ( aExp <= 0x7E ) {
1405         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406         return 0;
1407     }
1408     aSig = ( aSig | 0x00800000 )<<8;
1409     z = aSig>>( - shiftCount );
1410     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411         STATUS(float_exception_flags) |= float_flag_inexact;
1412     }
1413     if ( aSign ) z = - z;
1414     return z;
1415 
1416 }
1417 
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format.  The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427 
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     uint32_t aSig;
1433     int32 z;
1434 
1435     aSig = extractFloat32Frac( a );
1436     aExp = extractFloat32Exp( a );
1437     aSign = extractFloat32Sign( a );
1438     shiftCount = aExp - 0x8E;
1439     if ( 0 <= shiftCount ) {
1440         if ( float32_val(a) != 0xC7000000 ) {
1441             float_raise( float_flag_invalid STATUS_VAR);
1442             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443                 return 0x7FFF;
1444             }
1445         }
1446         return (int32_t) 0xffff8000;
1447     }
1448     else if ( aExp <= 0x7E ) {
1449         if ( aExp | aSig ) {
1450             STATUS(float_exception_flags) |= float_flag_inexact;
1451         }
1452         return 0;
1453     }
1454     shiftCount -= 0x10;
1455     aSig = ( aSig | 0x00800000 )<<8;
1456     z = aSig>>( - shiftCount );
1457     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458         STATUS(float_exception_flags) |= float_flag_inexact;
1459     }
1460     if ( aSign ) {
1461         z = - z;
1462     }
1463     return z;
1464 
1465 }
1466 
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format.  The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode.  If `a' is a NaN, the largest
1473 | positive integer is returned.  Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1476 
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1478 {
1479     flag aSign;
1480     int16 aExp, shiftCount;
1481     uint32_t aSig;
1482     uint64_t aSig64, aSigExtra;
1483     a = float32_squash_input_denormal(a STATUS_VAR);
1484 
1485     aSig = extractFloat32Frac( a );
1486     aExp = extractFloat32Exp( a );
1487     aSign = extractFloat32Sign( a );
1488     shiftCount = 0xBE - aExp;
1489     if ( shiftCount < 0 ) {
1490         float_raise( float_flag_invalid STATUS_VAR);
1491         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492             return LIT64( 0x7FFFFFFFFFFFFFFF );
1493         }
1494         return (int64_t) LIT64( 0x8000000000000000 );
1495     }
1496     if ( aExp ) aSig |= 0x00800000;
1497     aSig64 = aSig;
1498     aSig64 <<= 40;
1499     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501 
1502 }
1503 
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format.  The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero.  If
1509 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1513 
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515 {
1516     flag aSign;
1517     int16 aExp, shiftCount;
1518     uint32_t aSig;
1519     uint64_t aSig64;
1520     int64 z;
1521     a = float32_squash_input_denormal(a STATUS_VAR);
1522 
1523     aSig = extractFloat32Frac( a );
1524     aExp = extractFloat32Exp( a );
1525     aSign = extractFloat32Sign( a );
1526     shiftCount = aExp - 0xBE;
1527     if ( 0 <= shiftCount ) {
1528         if ( float32_val(a) != 0xDF000000 ) {
1529             float_raise( float_flag_invalid STATUS_VAR);
1530             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532             }
1533         }
1534         return (int64_t) LIT64( 0x8000000000000000 );
1535     }
1536     else if ( aExp <= 0x7E ) {
1537         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538         return 0;
1539     }
1540     aSig64 = aSig | 0x00800000;
1541     aSig64 <<= 40;
1542     z = aSig64>>( - shiftCount );
1543     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544         STATUS(float_exception_flags) |= float_flag_inexact;
1545     }
1546     if ( aSign ) z = - z;
1547     return z;
1548 
1549 }
1550 
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format.  The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557 
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     uint32_t aSig;
1563     a = float32_squash_input_denormal(a STATUS_VAR);
1564 
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     aSign = extractFloat32Sign( a );
1568     if ( aExp == 0xFF ) {
1569         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570         return packFloat64( aSign, 0x7FF, 0 );
1571     }
1572     if ( aExp == 0 ) {
1573         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575         --aExp;
1576     }
1577     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578 
1579 }
1580 
1581 #ifdef FLOATX80
1582 
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format.  The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1589 
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591 {
1592     flag aSign;
1593     int16 aExp;
1594     uint32_t aSig;
1595 
1596     a = float32_squash_input_denormal(a STATUS_VAR);
1597     aSig = extractFloat32Frac( a );
1598     aExp = extractFloat32Exp( a );
1599     aSign = extractFloat32Sign( a );
1600     if ( aExp == 0xFF ) {
1601         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603     }
1604     if ( aExp == 0 ) {
1605         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607     }
1608     aSig |= 0x00800000;
1609     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610 
1611 }
1612 
1613 #endif
1614 
1615 #ifdef FLOAT128
1616 
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623 
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int16 aExp;
1628     uint32_t aSig;
1629 
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644 
1645 }
1646 
1647 #endif
1648 
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value.  The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1655 
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1657 {
1658     flag aSign;
1659     int16 aExp;
1660     uint32_t lastBitMask, roundBitsMask;
1661     int8 roundingMode;
1662     uint32_t z;
1663     a = float32_squash_input_denormal(a STATUS_VAR);
1664 
1665     aExp = extractFloat32Exp( a );
1666     if ( 0x96 <= aExp ) {
1667         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668             return propagateFloat32NaN( a, a STATUS_VAR );
1669         }
1670         return a;
1671     }
1672     if ( aExp <= 0x7E ) {
1673         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674         STATUS(float_exception_flags) |= float_flag_inexact;
1675         aSign = extractFloat32Sign( a );
1676         switch ( STATUS(float_rounding_mode) ) {
1677          case float_round_nearest_even:
1678             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679                 return packFloat32( aSign, 0x7F, 0 );
1680             }
1681             break;
1682          case float_round_down:
1683             return make_float32(aSign ? 0xBF800000 : 0);
1684          case float_round_up:
1685             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686         }
1687         return packFloat32( aSign, 0, 0 );
1688     }
1689     lastBitMask = 1;
1690     lastBitMask <<= 0x96 - aExp;
1691     roundBitsMask = lastBitMask - 1;
1692     z = float32_val(a);
1693     roundingMode = STATUS(float_rounding_mode);
1694     if ( roundingMode == float_round_nearest_even ) {
1695         z += lastBitMask>>1;
1696         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697     }
1698     else if ( roundingMode != float_round_to_zero ) {
1699         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700             z += roundBitsMask;
1701         }
1702     }
1703     z &= ~ roundBitsMask;
1704     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705     return make_float32(z);
1706 
1707 }
1708 
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712 | before being returned.  `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716 
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718 {
1719     int16 aExp, bExp, zExp;
1720     uint32_t aSig, bSig, zSig;
1721     int16 expDiff;
1722 
1723     aSig = extractFloat32Frac( a );
1724     aExp = extractFloat32Exp( a );
1725     bSig = extractFloat32Frac( b );
1726     bExp = extractFloat32Exp( b );
1727     expDiff = aExp - bExp;
1728     aSig <<= 6;
1729     bSig <<= 6;
1730     if ( 0 < expDiff ) {
1731         if ( aExp == 0xFF ) {
1732             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733             return a;
1734         }
1735         if ( bExp == 0 ) {
1736             --expDiff;
1737         }
1738         else {
1739             bSig |= 0x20000000;
1740         }
1741         shift32RightJamming( bSig, expDiff, &bSig );
1742         zExp = aExp;
1743     }
1744     else if ( expDiff < 0 ) {
1745         if ( bExp == 0xFF ) {
1746             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747             return packFloat32( zSign, 0xFF, 0 );
1748         }
1749         if ( aExp == 0 ) {
1750             ++expDiff;
1751         }
1752         else {
1753             aSig |= 0x20000000;
1754         }
1755         shift32RightJamming( aSig, - expDiff, &aSig );
1756         zExp = bExp;
1757     }
1758     else {
1759         if ( aExp == 0xFF ) {
1760             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761             return a;
1762         }
1763         if ( aExp == 0 ) {
1764             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766         }
1767         zSig = 0x40000000 + aSig + bSig;
1768         zExp = aExp;
1769         goto roundAndPack;
1770     }
1771     aSig |= 0x20000000;
1772     zSig = ( aSig + bSig )<<1;
1773     --zExp;
1774     if ( (int32_t) zSig < 0 ) {
1775         zSig = aSig + bSig;
1776         ++zExp;
1777     }
1778  roundAndPack:
1779     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780 
1781 }
1782 
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786 | difference is negated before being returned.  `zSign' is ignored if the
1787 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1790 
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792 {
1793     int16 aExp, bExp, zExp;
1794     uint32_t aSig, bSig, zSig;
1795     int16 expDiff;
1796 
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     bSig = extractFloat32Frac( b );
1800     bExp = extractFloat32Exp( b );
1801     expDiff = aExp - bExp;
1802     aSig <<= 7;
1803     bSig <<= 7;
1804     if ( 0 < expDiff ) goto aExpBigger;
1805     if ( expDiff < 0 ) goto bExpBigger;
1806     if ( aExp == 0xFF ) {
1807         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808         float_raise( float_flag_invalid STATUS_VAR);
1809         return float32_default_nan;
1810     }
1811     if ( aExp == 0 ) {
1812         aExp = 1;
1813         bExp = 1;
1814     }
1815     if ( bSig < aSig ) goto aBigger;
1816     if ( aSig < bSig ) goto bBigger;
1817     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818  bExpBigger:
1819     if ( bExp == 0xFF ) {
1820         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821         return packFloat32( zSign ^ 1, 0xFF, 0 );
1822     }
1823     if ( aExp == 0 ) {
1824         ++expDiff;
1825     }
1826     else {
1827         aSig |= 0x40000000;
1828     }
1829     shift32RightJamming( aSig, - expDiff, &aSig );
1830     bSig |= 0x40000000;
1831  bBigger:
1832     zSig = bSig - aSig;
1833     zExp = bExp;
1834     zSign ^= 1;
1835     goto normalizeRoundAndPack;
1836  aExpBigger:
1837     if ( aExp == 0xFF ) {
1838         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839         return a;
1840     }
1841     if ( bExp == 0 ) {
1842         --expDiff;
1843     }
1844     else {
1845         bSig |= 0x40000000;
1846     }
1847     shift32RightJamming( bSig, expDiff, &bSig );
1848     aSig |= 0x40000000;
1849  aBigger:
1850     zSig = aSig - bSig;
1851     zExp = aExp;
1852  normalizeRoundAndPack:
1853     --zExp;
1854     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855 
1856 }
1857 
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1863 
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865 {
1866     flag aSign, bSign;
1867     a = float32_squash_input_denormal(a STATUS_VAR);
1868     b = float32_squash_input_denormal(b STATUS_VAR);
1869 
1870     aSign = extractFloat32Sign( a );
1871     bSign = extractFloat32Sign( b );
1872     if ( aSign == bSign ) {
1873         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874     }
1875     else {
1876         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877     }
1878 
1879 }
1880 
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886 
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888 {
1889     flag aSign, bSign;
1890     a = float32_squash_input_denormal(a STATUS_VAR);
1891     b = float32_squash_input_denormal(b STATUS_VAR);
1892 
1893     aSign = extractFloat32Sign( a );
1894     bSign = extractFloat32Sign( b );
1895     if ( aSign == bSign ) {
1896         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897     }
1898     else {
1899         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901 
1902 }
1903 
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1909 
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911 {
1912     flag aSign, bSign, zSign;
1913     int16 aExp, bExp, zExp;
1914     uint32_t aSig, bSig;
1915     uint64_t zSig64;
1916     uint32_t zSig;
1917 
1918     a = float32_squash_input_denormal(a STATUS_VAR);
1919     b = float32_squash_input_denormal(b STATUS_VAR);
1920 
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     zSign = aSign ^ bSign;
1928     if ( aExp == 0xFF ) {
1929         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930             return propagateFloat32NaN( a, b STATUS_VAR );
1931         }
1932         if ( ( bExp | bSig ) == 0 ) {
1933             float_raise( float_flag_invalid STATUS_VAR);
1934             return float32_default_nan;
1935         }
1936         return packFloat32( zSign, 0xFF, 0 );
1937     }
1938     if ( bExp == 0xFF ) {
1939         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940         if ( ( aExp | aSig ) == 0 ) {
1941             float_raise( float_flag_invalid STATUS_VAR);
1942             return float32_default_nan;
1943         }
1944         return packFloat32( zSign, 0xFF, 0 );
1945     }
1946     if ( aExp == 0 ) {
1947         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949     }
1950     if ( bExp == 0 ) {
1951         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953     }
1954     zExp = aExp + bExp - 0x7F;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958     zSig = zSig64;
1959     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960         zSig <<= 1;
1961         --zExp;
1962     }
1963     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964 
1965 }
1966 
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'.  The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1972 
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974 {
1975     flag aSign, bSign, zSign;
1976     int16 aExp, bExp, zExp;
1977     uint32_t aSig, bSig, zSig;
1978     a = float32_squash_input_denormal(a STATUS_VAR);
1979     b = float32_squash_input_denormal(b STATUS_VAR);
1980 
1981     aSig = extractFloat32Frac( a );
1982     aExp = extractFloat32Exp( a );
1983     aSign = extractFloat32Sign( a );
1984     bSig = extractFloat32Frac( b );
1985     bExp = extractFloat32Exp( b );
1986     bSign = extractFloat32Sign( b );
1987     zSign = aSign ^ bSign;
1988     if ( aExp == 0xFF ) {
1989         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990         if ( bExp == 0xFF ) {
1991             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992             float_raise( float_flag_invalid STATUS_VAR);
1993             return float32_default_nan;
1994         }
1995         return packFloat32( zSign, 0xFF, 0 );
1996     }
1997     if ( bExp == 0xFF ) {
1998         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999         return packFloat32( zSign, 0, 0 );
2000     }
2001     if ( bExp == 0 ) {
2002         if ( bSig == 0 ) {
2003             if ( ( aExp | aSig ) == 0 ) {
2004                 float_raise( float_flag_invalid STATUS_VAR);
2005                 return float32_default_nan;
2006             }
2007             float_raise( float_flag_divbyzero STATUS_VAR);
2008             return packFloat32( zSign, 0xFF, 0 );
2009         }
2010         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011     }
2012     if ( aExp == 0 ) {
2013         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015     }
2016     zExp = aExp - bExp + 0x7D;
2017     aSig = ( aSig | 0x00800000 )<<7;
2018     bSig = ( bSig | 0x00800000 )<<8;
2019     if ( bSig <= ( aSig + aSig ) ) {
2020         aSig >>= 1;
2021         ++zExp;
2022     }
2023     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024     if ( ( zSig & 0x3F ) == 0 ) {
2025         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026     }
2027     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028 
2029 }
2030 
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'.  The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2036 
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038 {
2039     flag aSign, zSign;
2040     int16 aExp, bExp, expDiff;
2041     uint32_t aSig, bSig;
2042     uint32_t q;
2043     uint64_t aSig64, bSig64, q64;
2044     uint32_t alternateASig;
2045     int32_t sigMean;
2046     a = float32_squash_input_denormal(a STATUS_VAR);
2047     b = float32_squash_input_denormal(b STATUS_VAR);
2048 
2049     aSig = extractFloat32Frac( a );
2050     aExp = extractFloat32Exp( a );
2051     aSign = extractFloat32Sign( a );
2052     bSig = extractFloat32Frac( b );
2053     bExp = extractFloat32Exp( b );
2054     if ( aExp == 0xFF ) {
2055         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056             return propagateFloat32NaN( a, b STATUS_VAR );
2057         }
2058         float_raise( float_flag_invalid STATUS_VAR);
2059         return float32_default_nan;
2060     }
2061     if ( bExp == 0xFF ) {
2062         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063         return a;
2064     }
2065     if ( bExp == 0 ) {
2066         if ( bSig == 0 ) {
2067             float_raise( float_flag_invalid STATUS_VAR);
2068             return float32_default_nan;
2069         }
2070         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071     }
2072     if ( aExp == 0 ) {
2073         if ( aSig == 0 ) return a;
2074         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075     }
2076     expDiff = aExp - bExp;
2077     aSig |= 0x00800000;
2078     bSig |= 0x00800000;
2079     if ( expDiff < 32 ) {
2080         aSig <<= 8;
2081         bSig <<= 8;
2082         if ( expDiff < 0 ) {
2083             if ( expDiff < -1 ) return a;
2084             aSig >>= 1;
2085         }
2086         q = ( bSig <= aSig );
2087         if ( q ) aSig -= bSig;
2088         if ( 0 < expDiff ) {
2089             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090             q >>= 32 - expDiff;
2091             bSig >>= 2;
2092             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093         }
2094         else {
2095             aSig >>= 2;
2096             bSig >>= 2;
2097         }
2098     }
2099     else {
2100         if ( bSig <= aSig ) aSig -= bSig;
2101         aSig64 = ( (uint64_t) aSig )<<40;
2102         bSig64 = ( (uint64_t) bSig )<<40;
2103         expDiff -= 64;
2104         while ( 0 < expDiff ) {
2105             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107             aSig64 = - ( ( bSig * q64 )<<38 );
2108             expDiff -= 62;
2109         }
2110         expDiff += 64;
2111         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113         q = q64>>( 64 - expDiff );
2114         bSig <<= 6;
2115         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116     }
2117     do {
2118         alternateASig = aSig;
2119         ++q;
2120         aSig -= bSig;
2121     } while ( 0 <= (int32_t) aSig );
2122     sigMean = aSig + alternateASig;
2123     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124         aSig = alternateASig;
2125     }
2126     zSign = ( (int32_t) aSig < 0 );
2127     if ( zSign ) aSig = - aSig;
2128     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129 
2130 }
2131 
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137 
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2139 {
2140     flag aSign;
2141     int16 aExp, zExp;
2142     uint32_t aSig, zSig;
2143     uint64_t rem, term;
2144     a = float32_squash_input_denormal(a STATUS_VAR);
2145 
2146     aSig = extractFloat32Frac( a );
2147     aExp = extractFloat32Exp( a );
2148     aSign = extractFloat32Sign( a );
2149     if ( aExp == 0xFF ) {
2150         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151         if ( ! aSign ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aSign ) {
2156         if ( ( aExp | aSig ) == 0 ) return a;
2157         float_raise( float_flag_invalid STATUS_VAR);
2158         return float32_default_nan;
2159     }
2160     if ( aExp == 0 ) {
2161         if ( aSig == 0 ) return float32_zero;
2162         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163     }
2164     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165     aSig = ( aSig | 0x00800000 )<<8;
2166     zSig = estimateSqrt32( aExp, aSig ) + 2;
2167     if ( ( zSig & 0x7F ) <= 5 ) {
2168         if ( zSig < 2 ) {
2169             zSig = 0x7FFFFFFF;
2170             goto roundAndPack;
2171         }
2172         aSig >>= aExp & 1;
2173         term = ( (uint64_t) zSig ) * zSig;
2174         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175         while ( (int64_t) rem < 0 ) {
2176             --zSig;
2177             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178         }
2179         zSig |= ( rem != 0 );
2180     }
2181     shift32RightJamming( zSig, 1, &zSig );
2182  roundAndPack:
2183     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184 
2185 }
2186 
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2191 |
2192 | Uses the following identities:
2193 |
2194 | 1. -------------------------------------------------------------------------
2195 |      x    x*ln(2)
2196 |     2  = e
2197 |
2198 | 2. -------------------------------------------------------------------------
2199 |                      2     3     4     5           n
2200 |      x        x     x     x     x     x           x
2201 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 |               1!    2!    3!    4!    5!          n!
2203 *----------------------------------------------------------------------------*/
2204 
2205 static const float64 float32_exp2_coefficients[15] =
2206 {
2207     const_float64( 0x3ff0000000000000ll ), /*  1 */
2208     const_float64( 0x3fe0000000000000ll ), /*  2 */
2209     const_float64( 0x3fc5555555555555ll ), /*  3 */
2210     const_float64( 0x3fa5555555555555ll ), /*  4 */
2211     const_float64( 0x3f81111111111111ll ), /*  5 */
2212     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222 };
2223 
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2225 {
2226     flag aSign;
2227     int16 aExp;
2228     uint32_t aSig;
2229     float64 r, x, xn;
2230     int i;
2231     a = float32_squash_input_denormal(a STATUS_VAR);
2232 
2233     aSig = extractFloat32Frac( a );
2234     aExp = extractFloat32Exp( a );
2235     aSign = extractFloat32Sign( a );
2236 
2237     if ( aExp == 0xFF) {
2238         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239         return (aSign) ? float32_zero : a;
2240     }
2241     if (aExp == 0) {
2242         if (aSig == 0) return float32_one;
2243     }
2244 
2245     float_raise( float_flag_inexact STATUS_VAR);
2246 
2247     /* ******************************* */
2248     /* using float64 for approximation */
2249     /* ******************************* */
2250     x = float32_to_float64(a STATUS_VAR);
2251     x = float64_mul(x, float64_ln2 STATUS_VAR);
2252 
2253     xn = x;
2254     r = float64_one;
2255     for (i = 0 ; i < 15 ; i++) {
2256         float64 f;
2257 
2258         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259         r = float64_add(r, f STATUS_VAR);
2260 
2261         xn = float64_mul(xn, x STATUS_VAR);
2262     }
2263 
2264     return float64_to_float32(r, status);
2265 }
2266 
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2273 {
2274     flag aSign, zSign;
2275     int16 aExp;
2276     uint32_t aSig, zSig, i;
2277 
2278     a = float32_squash_input_denormal(a STATUS_VAR);
2279     aSig = extractFloat32Frac( a );
2280     aExp = extractFloat32Exp( a );
2281     aSign = extractFloat32Sign( a );
2282 
2283     if ( aExp == 0 ) {
2284         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286     }
2287     if ( aSign ) {
2288         float_raise( float_flag_invalid STATUS_VAR);
2289         return float32_default_nan;
2290     }
2291     if ( aExp == 0xFF ) {
2292         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293         return a;
2294     }
2295 
2296     aExp -= 0x7F;
2297     aSig |= 0x00800000;
2298     zSign = aExp < 0;
2299     zSig = aExp << 23;
2300 
2301     for (i = 1 << 22; i > 0; i >>= 1) {
2302         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303         if ( aSig & 0x01000000 ) {
2304             aSig >>= 1;
2305             zSig |= i;
2306         }
2307     }
2308 
2309     if ( zSign )
2310         zSig = -zSig;
2311 
2312     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313 }
2314 
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319 *----------------------------------------------------------------------------*/
2320 
2321 int float32_eq( float32 a, float32 b STATUS_PARAM )
2322 {
2323     a = float32_squash_input_denormal(a STATUS_VAR);
2324     b = float32_squash_input_denormal(b STATUS_VAR);
2325 
2326     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2328        ) {
2329         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330             float_raise( float_flag_invalid STATUS_VAR);
2331         }
2332         return 0;
2333     }
2334     return ( float32_val(a) == float32_val(b) ) ||
2335             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2336 
2337 }
2338 
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343 | Arithmetic.
2344 *----------------------------------------------------------------------------*/
2345 
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2347 {
2348     flag aSign, bSign;
2349     uint32_t av, bv;
2350     a = float32_squash_input_denormal(a STATUS_VAR);
2351     b = float32_squash_input_denormal(b STATUS_VAR);
2352 
2353     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355        ) {
2356         float_raise( float_flag_invalid STATUS_VAR);
2357         return 0;
2358     }
2359     aSign = extractFloat32Sign( a );
2360     bSign = extractFloat32Sign( b );
2361     av = float32_val(a);
2362     bv = float32_val(b);
2363     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365 
2366 }
2367 
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372 *----------------------------------------------------------------------------*/
2373 
2374 int float32_lt( float32 a, float32 b STATUS_PARAM )
2375 {
2376     flag aSign, bSign;
2377     uint32_t av, bv;
2378     a = float32_squash_input_denormal(a STATUS_VAR);
2379     b = float32_squash_input_denormal(b STATUS_VAR);
2380 
2381     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383        ) {
2384         float_raise( float_flag_invalid STATUS_VAR);
2385         return 0;
2386     }
2387     aSign = extractFloat32Sign( a );
2388     bSign = extractFloat32Sign( b );
2389     av = float32_val(a);
2390     bv = float32_val(b);
2391     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393 
2394 }
2395 
2396 /*----------------------------------------------------------------------------
2397 | Returns 1 if the single-precision floating-point value `a' is equal to
2398 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2399 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2401 *----------------------------------------------------------------------------*/
2402 
2403 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2404 {
2405     uint32_t av, bv;
2406     a = float32_squash_input_denormal(a STATUS_VAR);
2407     b = float32_squash_input_denormal(b STATUS_VAR);
2408 
2409     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2410          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2411        ) {
2412         float_raise( float_flag_invalid STATUS_VAR);
2413         return 0;
2414     }
2415     av = float32_val(a);
2416     bv = float32_val(b);
2417     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2418 
2419 }
2420 
2421 /*----------------------------------------------------------------------------
2422 | Returns 1 if the single-precision floating-point value `a' is less than or
2423 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2424 | cause an exception.  Otherwise, the comparison is performed according to the
2425 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2426 *----------------------------------------------------------------------------*/
2427 
2428 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2429 {
2430     flag aSign, bSign;
2431     uint32_t av, bv;
2432     a = float32_squash_input_denormal(a STATUS_VAR);
2433     b = float32_squash_input_denormal(b STATUS_VAR);
2434 
2435     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2436          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2437        ) {
2438         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2439             float_raise( float_flag_invalid STATUS_VAR);
2440         }
2441         return 0;
2442     }
2443     aSign = extractFloat32Sign( a );
2444     bSign = extractFloat32Sign( b );
2445     av = float32_val(a);
2446     bv = float32_val(b);
2447     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2448     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2449 
2450 }
2451 
2452 /*----------------------------------------------------------------------------
2453 | Returns 1 if the single-precision floating-point value `a' is less than
2454 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2455 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2456 | Standard for Binary Floating-Point Arithmetic.
2457 *----------------------------------------------------------------------------*/
2458 
2459 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2460 {
2461     flag aSign, bSign;
2462     uint32_t av, bv;
2463     a = float32_squash_input_denormal(a STATUS_VAR);
2464     b = float32_squash_input_denormal(b STATUS_VAR);
2465 
2466     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2467          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2468        ) {
2469         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2470             float_raise( float_flag_invalid STATUS_VAR);
2471         }
2472         return 0;
2473     }
2474     aSign = extractFloat32Sign( a );
2475     bSign = extractFloat32Sign( b );
2476     av = float32_val(a);
2477     bv = float32_val(b);
2478     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2479     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2480 
2481 }
2482 
2483 /*----------------------------------------------------------------------------
2484 | Returns the result of converting the double-precision floating-point value
2485 | `a' to the 32-bit two's complement integer format.  The conversion is
2486 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2487 | Arithmetic---which means in particular that the conversion is rounded
2488 | according to the current rounding mode.  If `a' is a NaN, the largest
2489 | positive integer is returned.  Otherwise, if the conversion overflows, the
2490 | largest integer with the same sign as `a' is returned.
2491 *----------------------------------------------------------------------------*/
2492 
2493 int32 float64_to_int32( float64 a STATUS_PARAM )
2494 {
2495     flag aSign;
2496     int16 aExp, shiftCount;
2497     uint64_t aSig;
2498     a = float64_squash_input_denormal(a STATUS_VAR);
2499 
2500     aSig = extractFloat64Frac( a );
2501     aExp = extractFloat64Exp( a );
2502     aSign = extractFloat64Sign( a );
2503     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2504     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2505     shiftCount = 0x42C - aExp;
2506     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2507     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2508 
2509 }
2510 
2511 /*----------------------------------------------------------------------------
2512 | Returns the result of converting the double-precision floating-point value
2513 | `a' to the 32-bit two's complement integer format.  The conversion is
2514 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2515 | Arithmetic, except that the conversion is always rounded toward zero.
2516 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2517 | the conversion overflows, the largest integer with the same sign as `a' is
2518 | returned.
2519 *----------------------------------------------------------------------------*/
2520 
2521 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2522 {
2523     flag aSign;
2524     int16 aExp, shiftCount;
2525     uint64_t aSig, savedASig;
2526     int32 z;
2527     a = float64_squash_input_denormal(a STATUS_VAR);
2528 
2529     aSig = extractFloat64Frac( a );
2530     aExp = extractFloat64Exp( a );
2531     aSign = extractFloat64Sign( a );
2532     if ( 0x41E < aExp ) {
2533         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2534         goto invalid;
2535     }
2536     else if ( aExp < 0x3FF ) {
2537         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2538         return 0;
2539     }
2540     aSig |= LIT64( 0x0010000000000000 );
2541     shiftCount = 0x433 - aExp;
2542     savedASig = aSig;
2543     aSig >>= shiftCount;
2544     z = aSig;
2545     if ( aSign ) z = - z;
2546     if ( ( z < 0 ) ^ aSign ) {
2547  invalid:
2548         float_raise( float_flag_invalid STATUS_VAR);
2549         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2550     }
2551     if ( ( aSig<<shiftCount ) != savedASig ) {
2552         STATUS(float_exception_flags) |= float_flag_inexact;
2553     }
2554     return z;
2555 
2556 }
2557 
2558 /*----------------------------------------------------------------------------
2559 | Returns the result of converting the double-precision floating-point value
2560 | `a' to the 16-bit two's complement integer format.  The conversion is
2561 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2562 | Arithmetic, except that the conversion is always rounded toward zero.
2563 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2564 | the conversion overflows, the largest integer with the same sign as `a' is
2565 | returned.
2566 *----------------------------------------------------------------------------*/
2567 
2568 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2569 {
2570     flag aSign;
2571     int16 aExp, shiftCount;
2572     uint64_t aSig, savedASig;
2573     int32 z;
2574 
2575     aSig = extractFloat64Frac( a );
2576     aExp = extractFloat64Exp( a );
2577     aSign = extractFloat64Sign( a );
2578     if ( 0x40E < aExp ) {
2579         if ( ( aExp == 0x7FF ) && aSig ) {
2580             aSign = 0;
2581         }
2582         goto invalid;
2583     }
2584     else if ( aExp < 0x3FF ) {
2585         if ( aExp || aSig ) {
2586             STATUS(float_exception_flags) |= float_flag_inexact;
2587         }
2588         return 0;
2589     }
2590     aSig |= LIT64( 0x0010000000000000 );
2591     shiftCount = 0x433 - aExp;
2592     savedASig = aSig;
2593     aSig >>= shiftCount;
2594     z = aSig;
2595     if ( aSign ) {
2596         z = - z;
2597     }
2598     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2599  invalid:
2600         float_raise( float_flag_invalid STATUS_VAR);
2601         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2602     }
2603     if ( ( aSig<<shiftCount ) != savedASig ) {
2604         STATUS(float_exception_flags) |= float_flag_inexact;
2605     }
2606     return z;
2607 }
2608 
2609 /*----------------------------------------------------------------------------
2610 | Returns the result of converting the double-precision floating-point value
2611 | `a' to the 64-bit two's complement integer format.  The conversion is
2612 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2613 | Arithmetic---which means in particular that the conversion is rounded
2614 | according to the current rounding mode.  If `a' is a NaN, the largest
2615 | positive integer is returned.  Otherwise, if the conversion overflows, the
2616 | largest integer with the same sign as `a' is returned.
2617 *----------------------------------------------------------------------------*/
2618 
2619 int64 float64_to_int64( float64 a STATUS_PARAM )
2620 {
2621     flag aSign;
2622     int16 aExp, shiftCount;
2623     uint64_t aSig, aSigExtra;
2624     a = float64_squash_input_denormal(a STATUS_VAR);
2625 
2626     aSig = extractFloat64Frac( a );
2627     aExp = extractFloat64Exp( a );
2628     aSign = extractFloat64Sign( a );
2629     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2630     shiftCount = 0x433 - aExp;
2631     if ( shiftCount <= 0 ) {
2632         if ( 0x43E < aExp ) {
2633             float_raise( float_flag_invalid STATUS_VAR);
2634             if (    ! aSign
2635                  || (    ( aExp == 0x7FF )
2636                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2637                ) {
2638                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2639             }
2640             return (int64_t) LIT64( 0x8000000000000000 );
2641         }
2642         aSigExtra = 0;
2643         aSig <<= - shiftCount;
2644     }
2645     else {
2646         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2647     }
2648     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2649 
2650 }
2651 
2652 /*----------------------------------------------------------------------------
2653 | Returns the result of converting the double-precision floating-point value
2654 | `a' to the 64-bit two's complement integer format.  The conversion is
2655 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2656 | Arithmetic, except that the conversion is always rounded toward zero.
2657 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2658 | the conversion overflows, the largest integer with the same sign as `a' is
2659 | returned.
2660 *----------------------------------------------------------------------------*/
2661 
2662 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2663 {
2664     flag aSign;
2665     int16 aExp, shiftCount;
2666     uint64_t aSig;
2667     int64 z;
2668     a = float64_squash_input_denormal(a STATUS_VAR);
2669 
2670     aSig = extractFloat64Frac( a );
2671     aExp = extractFloat64Exp( a );
2672     aSign = extractFloat64Sign( a );
2673     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2674     shiftCount = aExp - 0x433;
2675     if ( 0 <= shiftCount ) {
2676         if ( 0x43E <= aExp ) {
2677             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2678                 float_raise( float_flag_invalid STATUS_VAR);
2679                 if (    ! aSign
2680                      || (    ( aExp == 0x7FF )
2681                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2682                    ) {
2683                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2684                 }
2685             }
2686             return (int64_t) LIT64( 0x8000000000000000 );
2687         }
2688         z = aSig<<shiftCount;
2689     }
2690     else {
2691         if ( aExp < 0x3FE ) {
2692             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2693             return 0;
2694         }
2695         z = aSig>>( - shiftCount );
2696         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2697             STATUS(float_exception_flags) |= float_flag_inexact;
2698         }
2699     }
2700     if ( aSign ) z = - z;
2701     return z;
2702 
2703 }
2704 
2705 /*----------------------------------------------------------------------------
2706 | Returns the result of converting the double-precision floating-point value
2707 | `a' to the single-precision floating-point format.  The conversion is
2708 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2709 | Arithmetic.
2710 *----------------------------------------------------------------------------*/
2711 
2712 float32 float64_to_float32( float64 a STATUS_PARAM )
2713 {
2714     flag aSign;
2715     int16 aExp;
2716     uint64_t aSig;
2717     uint32_t zSig;
2718     a = float64_squash_input_denormal(a STATUS_VAR);
2719 
2720     aSig = extractFloat64Frac( a );
2721     aExp = extractFloat64Exp( a );
2722     aSign = extractFloat64Sign( a );
2723     if ( aExp == 0x7FF ) {
2724         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2725         return packFloat32( aSign, 0xFF, 0 );
2726     }
2727     shift64RightJamming( aSig, 22, &aSig );
2728     zSig = aSig;
2729     if ( aExp || zSig ) {
2730         zSig |= 0x40000000;
2731         aExp -= 0x381;
2732     }
2733     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2734 
2735 }
2736 
2737 
2738 /*----------------------------------------------------------------------------
2739 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2740 | half-precision floating-point value, returning the result.  After being
2741 | shifted into the proper positions, the three fields are simply added
2742 | together to form the result.  This means that any integer portion of `zSig'
2743 | will be added into the exponent.  Since a properly normalized significand
2744 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2745 | than the desired result exponent whenever `zSig' is a complete, normalized
2746 | significand.
2747 *----------------------------------------------------------------------------*/
2748 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2749 {
2750     return make_float16(
2751         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2752 }
2753 
2754 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2755    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2756 
2757 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2758 {
2759     flag aSign;
2760     int16 aExp;
2761     uint32_t aSig;
2762 
2763     aSign = extractFloat16Sign(a);
2764     aExp = extractFloat16Exp(a);
2765     aSig = extractFloat16Frac(a);
2766 
2767     if (aExp == 0x1f && ieee) {
2768         if (aSig) {
2769             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2770         }
2771         return packFloat32(aSign, 0xff, aSig << 13);
2772     }
2773     if (aExp == 0) {
2774         int8 shiftCount;
2775 
2776         if (aSig == 0) {
2777             return packFloat32(aSign, 0, 0);
2778         }
2779 
2780         shiftCount = countLeadingZeros32( aSig ) - 21;
2781         aSig = aSig << shiftCount;
2782         aExp = -shiftCount;
2783     }
2784     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2785 }
2786 
2787 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2788 {
2789     flag aSign;
2790     int16 aExp;
2791     uint32_t aSig;
2792     uint32_t mask;
2793     uint32_t increment;
2794     int8 roundingMode;
2795     a = float32_squash_input_denormal(a STATUS_VAR);
2796 
2797     aSig = extractFloat32Frac( a );
2798     aExp = extractFloat32Exp( a );
2799     aSign = extractFloat32Sign( a );
2800     if ( aExp == 0xFF ) {
2801         if (aSig) {
2802             /* Input is a NaN */
2803             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2804             if (!ieee) {
2805                 return packFloat16(aSign, 0, 0);
2806             }
2807             return r;
2808         }
2809         /* Infinity */
2810         if (!ieee) {
2811             float_raise(float_flag_invalid STATUS_VAR);
2812             return packFloat16(aSign, 0x1f, 0x3ff);
2813         }
2814         return packFloat16(aSign, 0x1f, 0);
2815     }
2816     if (aExp == 0 && aSig == 0) {
2817         return packFloat16(aSign, 0, 0);
2818     }
2819     /* Decimal point between bits 22 and 23.  */
2820     aSig |= 0x00800000;
2821     aExp -= 0x7f;
2822     if (aExp < -14) {
2823         mask = 0x00ffffff;
2824         if (aExp >= -24) {
2825             mask >>= 25 + aExp;
2826         }
2827     } else {
2828         mask = 0x00001fff;
2829     }
2830     if (aSig & mask) {
2831         float_raise( float_flag_underflow STATUS_VAR );
2832         roundingMode = STATUS(float_rounding_mode);
2833         switch (roundingMode) {
2834         case float_round_nearest_even:
2835             increment = (mask + 1) >> 1;
2836             if ((aSig & mask) == increment) {
2837                 increment = aSig & (increment << 1);
2838             }
2839             break;
2840         case float_round_up:
2841             increment = aSign ? 0 : mask;
2842             break;
2843         case float_round_down:
2844             increment = aSign ? mask : 0;
2845             break;
2846         default: /* round_to_zero */
2847             increment = 0;
2848             break;
2849         }
2850         aSig += increment;
2851         if (aSig >= 0x01000000) {
2852             aSig >>= 1;
2853             aExp++;
2854         }
2855     } else if (aExp < -14
2856           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2857         float_raise( float_flag_underflow STATUS_VAR);
2858     }
2859 
2860     if (ieee) {
2861         if (aExp > 15) {
2862             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2863             return packFloat16(aSign, 0x1f, 0);
2864         }
2865     } else {
2866         if (aExp > 16) {
2867             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2868             return packFloat16(aSign, 0x1f, 0x3ff);
2869         }
2870     }
2871     if (aExp < -24) {
2872         return packFloat16(aSign, 0, 0);
2873     }
2874     if (aExp < -14) {
2875         aSig >>= -14 - aExp;
2876         aExp = -14;
2877     }
2878     return packFloat16(aSign, aExp + 14, aSig >> 13);
2879 }
2880 
2881 #ifdef FLOATX80
2882 
2883 /*----------------------------------------------------------------------------
2884 | Returns the result of converting the double-precision floating-point value
2885 | `a' to the extended double-precision floating-point format.  The conversion
2886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2887 | Arithmetic.
2888 *----------------------------------------------------------------------------*/
2889 
2890 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2891 {
2892     flag aSign;
2893     int16 aExp;
2894     uint64_t aSig;
2895 
2896     a = float64_squash_input_denormal(a STATUS_VAR);
2897     aSig = extractFloat64Frac( a );
2898     aExp = extractFloat64Exp( a );
2899     aSign = extractFloat64Sign( a );
2900     if ( aExp == 0x7FF ) {
2901         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2902         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2903     }
2904     if ( aExp == 0 ) {
2905         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2906         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2907     }
2908     return
2909         packFloatx80(
2910             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2911 
2912 }
2913 
2914 #endif
2915 
2916 #ifdef FLOAT128
2917 
2918 /*----------------------------------------------------------------------------
2919 | Returns the result of converting the double-precision floating-point value
2920 | `a' to the quadruple-precision floating-point format.  The conversion is
2921 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2922 | Arithmetic.
2923 *----------------------------------------------------------------------------*/
2924 
2925 float128 float64_to_float128( float64 a STATUS_PARAM )
2926 {
2927     flag aSign;
2928     int16 aExp;
2929     uint64_t aSig, zSig0, zSig1;
2930 
2931     a = float64_squash_input_denormal(a STATUS_VAR);
2932     aSig = extractFloat64Frac( a );
2933     aExp = extractFloat64Exp( a );
2934     aSign = extractFloat64Sign( a );
2935     if ( aExp == 0x7FF ) {
2936         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2937         return packFloat128( aSign, 0x7FFF, 0, 0 );
2938     }
2939     if ( aExp == 0 ) {
2940         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2941         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2942         --aExp;
2943     }
2944     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2945     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2946 
2947 }
2948 
2949 #endif
2950 
2951 /*----------------------------------------------------------------------------
2952 | Rounds the double-precision floating-point value `a' to an integer, and
2953 | returns the result as a double-precision floating-point value.  The
2954 | operation is performed according to the IEC/IEEE Standard for Binary
2955 | Floating-Point Arithmetic.
2956 *----------------------------------------------------------------------------*/
2957 
2958 float64 float64_round_to_int( float64 a STATUS_PARAM )
2959 {
2960     flag aSign;
2961     int16 aExp;
2962     uint64_t lastBitMask, roundBitsMask;
2963     int8 roundingMode;
2964     uint64_t z;
2965     a = float64_squash_input_denormal(a STATUS_VAR);
2966 
2967     aExp = extractFloat64Exp( a );
2968     if ( 0x433 <= aExp ) {
2969         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2970             return propagateFloat64NaN( a, a STATUS_VAR );
2971         }
2972         return a;
2973     }
2974     if ( aExp < 0x3FF ) {
2975         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2976         STATUS(float_exception_flags) |= float_flag_inexact;
2977         aSign = extractFloat64Sign( a );
2978         switch ( STATUS(float_rounding_mode) ) {
2979          case float_round_nearest_even:
2980             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2981                 return packFloat64( aSign, 0x3FF, 0 );
2982             }
2983             break;
2984          case float_round_down:
2985             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2986          case float_round_up:
2987             return make_float64(
2988             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2989         }
2990         return packFloat64( aSign, 0, 0 );
2991     }
2992     lastBitMask = 1;
2993     lastBitMask <<= 0x433 - aExp;
2994     roundBitsMask = lastBitMask - 1;
2995     z = float64_val(a);
2996     roundingMode = STATUS(float_rounding_mode);
2997     if ( roundingMode == float_round_nearest_even ) {
2998         z += lastBitMask>>1;
2999         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3000     }
3001     else if ( roundingMode != float_round_to_zero ) {
3002         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3003             z += roundBitsMask;
3004         }
3005     }
3006     z &= ~ roundBitsMask;
3007     if ( z != float64_val(a) )
3008         STATUS(float_exception_flags) |= float_flag_inexact;
3009     return make_float64(z);
3010 
3011 }
3012 
3013 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3014 {
3015     int oldmode;
3016     float64 res;
3017     oldmode = STATUS(float_rounding_mode);
3018     STATUS(float_rounding_mode) = float_round_to_zero;
3019     res = float64_round_to_int(a STATUS_VAR);
3020     STATUS(float_rounding_mode) = oldmode;
3021     return res;
3022 }
3023 
3024 /*----------------------------------------------------------------------------
3025 | Returns the result of adding the absolute values of the double-precision
3026 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3027 | before being returned.  `zSign' is ignored if the result is a NaN.
3028 | The addition is performed according to the IEC/IEEE Standard for Binary
3029 | Floating-Point Arithmetic.
3030 *----------------------------------------------------------------------------*/
3031 
3032 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3033 {
3034     int16 aExp, bExp, zExp;
3035     uint64_t aSig, bSig, zSig;
3036     int16 expDiff;
3037 
3038     aSig = extractFloat64Frac( a );
3039     aExp = extractFloat64Exp( a );
3040     bSig = extractFloat64Frac( b );
3041     bExp = extractFloat64Exp( b );
3042     expDiff = aExp - bExp;
3043     aSig <<= 9;
3044     bSig <<= 9;
3045     if ( 0 < expDiff ) {
3046         if ( aExp == 0x7FF ) {
3047             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3048             return a;
3049         }
3050         if ( bExp == 0 ) {
3051             --expDiff;
3052         }
3053         else {
3054             bSig |= LIT64( 0x2000000000000000 );
3055         }
3056         shift64RightJamming( bSig, expDiff, &bSig );
3057         zExp = aExp;
3058     }
3059     else if ( expDiff < 0 ) {
3060         if ( bExp == 0x7FF ) {
3061             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3062             return packFloat64( zSign, 0x7FF, 0 );
3063         }
3064         if ( aExp == 0 ) {
3065             ++expDiff;
3066         }
3067         else {
3068             aSig |= LIT64( 0x2000000000000000 );
3069         }
3070         shift64RightJamming( aSig, - expDiff, &aSig );
3071         zExp = bExp;
3072     }
3073     else {
3074         if ( aExp == 0x7FF ) {
3075             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3076             return a;
3077         }
3078         if ( aExp == 0 ) {
3079             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3080             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3081         }
3082         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3083         zExp = aExp;
3084         goto roundAndPack;
3085     }
3086     aSig |= LIT64( 0x2000000000000000 );
3087     zSig = ( aSig + bSig )<<1;
3088     --zExp;
3089     if ( (int64_t) zSig < 0 ) {
3090         zSig = aSig + bSig;
3091         ++zExp;
3092     }
3093  roundAndPack:
3094     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3095 
3096 }
3097 
3098 /*----------------------------------------------------------------------------
3099 | Returns the result of subtracting the absolute values of the double-
3100 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3101 | difference is negated before being returned.  `zSign' is ignored if the
3102 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3103 | Standard for Binary Floating-Point Arithmetic.
3104 *----------------------------------------------------------------------------*/
3105 
3106 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3107 {
3108     int16 aExp, bExp, zExp;
3109     uint64_t aSig, bSig, zSig;
3110     int16 expDiff;
3111 
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     bSig = extractFloat64Frac( b );
3115     bExp = extractFloat64Exp( b );
3116     expDiff = aExp - bExp;
3117     aSig <<= 10;
3118     bSig <<= 10;
3119     if ( 0 < expDiff ) goto aExpBigger;
3120     if ( expDiff < 0 ) goto bExpBigger;
3121     if ( aExp == 0x7FF ) {
3122         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123         float_raise( float_flag_invalid STATUS_VAR);
3124         return float64_default_nan;
3125     }
3126     if ( aExp == 0 ) {
3127         aExp = 1;
3128         bExp = 1;
3129     }
3130     if ( bSig < aSig ) goto aBigger;
3131     if ( aSig < bSig ) goto bBigger;
3132     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3133  bExpBigger:
3134     if ( bExp == 0x7FF ) {
3135         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3136         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3137     }
3138     if ( aExp == 0 ) {
3139         ++expDiff;
3140     }
3141     else {
3142         aSig |= LIT64( 0x4000000000000000 );
3143     }
3144     shift64RightJamming( aSig, - expDiff, &aSig );
3145     bSig |= LIT64( 0x4000000000000000 );
3146  bBigger:
3147     zSig = bSig - aSig;
3148     zExp = bExp;
3149     zSign ^= 1;
3150     goto normalizeRoundAndPack;
3151  aExpBigger:
3152     if ( aExp == 0x7FF ) {
3153         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3154         return a;
3155     }
3156     if ( bExp == 0 ) {
3157         --expDiff;
3158     }
3159     else {
3160         bSig |= LIT64( 0x4000000000000000 );
3161     }
3162     shift64RightJamming( bSig, expDiff, &bSig );
3163     aSig |= LIT64( 0x4000000000000000 );
3164  aBigger:
3165     zSig = aSig - bSig;
3166     zExp = aExp;
3167  normalizeRoundAndPack:
3168     --zExp;
3169     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3170 
3171 }
3172 
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of adding the double-precision floating-point values `a'
3175 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3176 | Binary Floating-Point Arithmetic.
3177 *----------------------------------------------------------------------------*/
3178 
3179 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3180 {
3181     flag aSign, bSign;
3182     a = float64_squash_input_denormal(a STATUS_VAR);
3183     b = float64_squash_input_denormal(b STATUS_VAR);
3184 
3185     aSign = extractFloat64Sign( a );
3186     bSign = extractFloat64Sign( b );
3187     if ( aSign == bSign ) {
3188         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3189     }
3190     else {
3191         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3192     }
3193 
3194 }
3195 
3196 /*----------------------------------------------------------------------------
3197 | Returns the result of subtracting the double-precision floating-point values
3198 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3199 | for Binary Floating-Point Arithmetic.
3200 *----------------------------------------------------------------------------*/
3201 
3202 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3203 {
3204     flag aSign, bSign;
3205     a = float64_squash_input_denormal(a STATUS_VAR);
3206     b = float64_squash_input_denormal(b STATUS_VAR);
3207 
3208     aSign = extractFloat64Sign( a );
3209     bSign = extractFloat64Sign( b );
3210     if ( aSign == bSign ) {
3211         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3212     }
3213     else {
3214         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3215     }
3216 
3217 }
3218 
3219 /*----------------------------------------------------------------------------
3220 | Returns the result of multiplying the double-precision floating-point values
3221 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3222 | for Binary Floating-Point Arithmetic.
3223 *----------------------------------------------------------------------------*/
3224 
3225 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3226 {
3227     flag aSign, bSign, zSign;
3228     int16 aExp, bExp, zExp;
3229     uint64_t aSig, bSig, zSig0, zSig1;
3230 
3231     a = float64_squash_input_denormal(a STATUS_VAR);
3232     b = float64_squash_input_denormal(b STATUS_VAR);
3233 
3234     aSig = extractFloat64Frac( a );
3235     aExp = extractFloat64Exp( a );
3236     aSign = extractFloat64Sign( a );
3237     bSig = extractFloat64Frac( b );
3238     bExp = extractFloat64Exp( b );
3239     bSign = extractFloat64Sign( b );
3240     zSign = aSign ^ bSign;
3241     if ( aExp == 0x7FF ) {
3242         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3243             return propagateFloat64NaN( a, b STATUS_VAR );
3244         }
3245         if ( ( bExp | bSig ) == 0 ) {
3246             float_raise( float_flag_invalid STATUS_VAR);
3247             return float64_default_nan;
3248         }
3249         return packFloat64( zSign, 0x7FF, 0 );
3250     }
3251     if ( bExp == 0x7FF ) {
3252         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3253         if ( ( aExp | aSig ) == 0 ) {
3254             float_raise( float_flag_invalid STATUS_VAR);
3255             return float64_default_nan;
3256         }
3257         return packFloat64( zSign, 0x7FF, 0 );
3258     }
3259     if ( aExp == 0 ) {
3260         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3261         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3262     }
3263     if ( bExp == 0 ) {
3264         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3265         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3266     }
3267     zExp = aExp + bExp - 0x3FF;
3268     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3269     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3270     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3271     zSig0 |= ( zSig1 != 0 );
3272     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3273         zSig0 <<= 1;
3274         --zExp;
3275     }
3276     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3277 
3278 }
3279 
3280 /*----------------------------------------------------------------------------
3281 | Returns the result of dividing the double-precision floating-point value `a'
3282 | by the corresponding value `b'.  The operation is performed according to
3283 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3284 *----------------------------------------------------------------------------*/
3285 
3286 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3287 {
3288     flag aSign, bSign, zSign;
3289     int16 aExp, bExp, zExp;
3290     uint64_t aSig, bSig, zSig;
3291     uint64_t rem0, rem1;
3292     uint64_t term0, term1;
3293     a = float64_squash_input_denormal(a STATUS_VAR);
3294     b = float64_squash_input_denormal(b STATUS_VAR);
3295 
3296     aSig = extractFloat64Frac( a );
3297     aExp = extractFloat64Exp( a );
3298     aSign = extractFloat64Sign( a );
3299     bSig = extractFloat64Frac( b );
3300     bExp = extractFloat64Exp( b );
3301     bSign = extractFloat64Sign( b );
3302     zSign = aSign ^ bSign;
3303     if ( aExp == 0x7FF ) {
3304         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3305         if ( bExp == 0x7FF ) {
3306             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307             float_raise( float_flag_invalid STATUS_VAR);
3308             return float64_default_nan;
3309         }
3310         return packFloat64( zSign, 0x7FF, 0 );
3311     }
3312     if ( bExp == 0x7FF ) {
3313         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3314         return packFloat64( zSign, 0, 0 );
3315     }
3316     if ( bExp == 0 ) {
3317         if ( bSig == 0 ) {
3318             if ( ( aExp | aSig ) == 0 ) {
3319                 float_raise( float_flag_invalid STATUS_VAR);
3320                 return float64_default_nan;
3321             }
3322             float_raise( float_flag_divbyzero STATUS_VAR);
3323             return packFloat64( zSign, 0x7FF, 0 );
3324         }
3325         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3326     }
3327     if ( aExp == 0 ) {
3328         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3329         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3330     }
3331     zExp = aExp - bExp + 0x3FD;
3332     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3333     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3334     if ( bSig <= ( aSig + aSig ) ) {
3335         aSig >>= 1;
3336         ++zExp;
3337     }
3338     zSig = estimateDiv128To64( aSig, 0, bSig );
3339     if ( ( zSig & 0x1FF ) <= 2 ) {
3340         mul64To128( bSig, zSig, &term0, &term1 );
3341         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3342         while ( (int64_t) rem0 < 0 ) {
3343             --zSig;
3344             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3345         }
3346         zSig |= ( rem1 != 0 );
3347     }
3348     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3349 
3350 }
3351 
3352 /*----------------------------------------------------------------------------
3353 | Returns the remainder of the double-precision floating-point value `a'
3354 | with respect to the corresponding value `b'.  The operation is performed
3355 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3356 *----------------------------------------------------------------------------*/
3357 
3358 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3359 {
3360     flag aSign, zSign;
3361     int16 aExp, bExp, expDiff;
3362     uint64_t aSig, bSig;
3363     uint64_t q, alternateASig;
3364     int64_t sigMean;
3365 
3366     a = float64_squash_input_denormal(a STATUS_VAR);
3367     b = float64_squash_input_denormal(b STATUS_VAR);
3368     aSig = extractFloat64Frac( a );
3369     aExp = extractFloat64Exp( a );
3370     aSign = extractFloat64Sign( a );
3371     bSig = extractFloat64Frac( b );
3372     bExp = extractFloat64Exp( b );
3373     if ( aExp == 0x7FF ) {
3374         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3375             return propagateFloat64NaN( a, b STATUS_VAR );
3376         }
3377         float_raise( float_flag_invalid STATUS_VAR);
3378         return float64_default_nan;
3379     }
3380     if ( bExp == 0x7FF ) {
3381         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3382         return a;
3383     }
3384     if ( bExp == 0 ) {
3385         if ( bSig == 0 ) {
3386             float_raise( float_flag_invalid STATUS_VAR);
3387             return float64_default_nan;
3388         }
3389         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3390     }
3391     if ( aExp == 0 ) {
3392         if ( aSig == 0 ) return a;
3393         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3394     }
3395     expDiff = aExp - bExp;
3396     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3397     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3398     if ( expDiff < 0 ) {
3399         if ( expDiff < -1 ) return a;
3400         aSig >>= 1;
3401     }
3402     q = ( bSig <= aSig );
3403     if ( q ) aSig -= bSig;
3404     expDiff -= 64;
3405     while ( 0 < expDiff ) {
3406         q = estimateDiv128To64( aSig, 0, bSig );
3407         q = ( 2 < q ) ? q - 2 : 0;
3408         aSig = - ( ( bSig>>2 ) * q );
3409         expDiff -= 62;
3410     }
3411     expDiff += 64;
3412     if ( 0 < expDiff ) {
3413         q = estimateDiv128To64( aSig, 0, bSig );
3414         q = ( 2 < q ) ? q - 2 : 0;
3415         q >>= 64 - expDiff;
3416         bSig >>= 2;
3417         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3418     }
3419     else {
3420         aSig >>= 2;
3421         bSig >>= 2;
3422     }
3423     do {
3424         alternateASig = aSig;
3425         ++q;
3426         aSig -= bSig;
3427     } while ( 0 <= (int64_t) aSig );
3428     sigMean = aSig + alternateASig;
3429     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3430         aSig = alternateASig;
3431     }
3432     zSign = ( (int64_t) aSig < 0 );
3433     if ( zSign ) aSig = - aSig;
3434     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3435 
3436 }
3437 
3438 /*----------------------------------------------------------------------------
3439 | Returns the square root of the double-precision floating-point value `a'.
3440 | The operation is performed according to the IEC/IEEE Standard for Binary
3441 | Floating-Point Arithmetic.
3442 *----------------------------------------------------------------------------*/
3443 
3444 float64 float64_sqrt( float64 a STATUS_PARAM )
3445 {
3446     flag aSign;
3447     int16 aExp, zExp;
3448     uint64_t aSig, zSig, doubleZSig;
3449     uint64_t rem0, rem1, term0, term1;
3450     a = float64_squash_input_denormal(a STATUS_VAR);
3451 
3452     aSig = extractFloat64Frac( a );
3453     aExp = extractFloat64Exp( a );
3454     aSign = extractFloat64Sign( a );
3455     if ( aExp == 0x7FF ) {
3456         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3457         if ( ! aSign ) return a;
3458         float_raise( float_flag_invalid STATUS_VAR);
3459         return float64_default_nan;
3460     }
3461     if ( aSign ) {
3462         if ( ( aExp | aSig ) == 0 ) return a;
3463         float_raise( float_flag_invalid STATUS_VAR);
3464         return float64_default_nan;
3465     }
3466     if ( aExp == 0 ) {
3467         if ( aSig == 0 ) return float64_zero;
3468         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3469     }
3470     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3471     aSig |= LIT64( 0x0010000000000000 );
3472     zSig = estimateSqrt32( aExp, aSig>>21 );
3473     aSig <<= 9 - ( aExp & 1 );
3474     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3475     if ( ( zSig & 0x1FF ) <= 5 ) {
3476         doubleZSig = zSig<<1;
3477         mul64To128( zSig, zSig, &term0, &term1 );
3478         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3479         while ( (int64_t) rem0 < 0 ) {
3480             --zSig;
3481             doubleZSig -= 2;
3482             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3483         }
3484         zSig |= ( ( rem0 | rem1 ) != 0 );
3485     }
3486     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3487 
3488 }
3489 
3490 /*----------------------------------------------------------------------------
3491 | Returns the binary log of the double-precision floating-point value `a'.
3492 | The operation is performed according to the IEC/IEEE Standard for Binary
3493 | Floating-Point Arithmetic.
3494 *----------------------------------------------------------------------------*/
3495 float64 float64_log2( float64 a STATUS_PARAM )
3496 {
3497     flag aSign, zSign;
3498     int16 aExp;
3499     uint64_t aSig, aSig0, aSig1, zSig, i;
3500     a = float64_squash_input_denormal(a STATUS_VAR);
3501 
3502     aSig = extractFloat64Frac( a );
3503     aExp = extractFloat64Exp( a );
3504     aSign = extractFloat64Sign( a );
3505 
3506     if ( aExp == 0 ) {
3507         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3508         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3509     }
3510     if ( aSign ) {
3511         float_raise( float_flag_invalid STATUS_VAR);
3512         return float64_default_nan;
3513     }
3514     if ( aExp == 0x7FF ) {
3515         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3516         return a;
3517     }
3518 
3519     aExp -= 0x3FF;
3520     aSig |= LIT64( 0x0010000000000000 );
3521     zSign = aExp < 0;
3522     zSig = (uint64_t)aExp << 52;
3523     for (i = 1LL << 51; i > 0; i >>= 1) {
3524         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3525         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3526         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3527             aSig >>= 1;
3528             zSig |= i;
3529         }
3530     }
3531 
3532     if ( zSign )
3533         zSig = -zSig;
3534     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3535 }
3536 
3537 /*----------------------------------------------------------------------------
3538 | Returns 1 if the double-precision floating-point value `a' is equal to the
3539 | corresponding value `b', and 0 otherwise.  The comparison is performed
3540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542 
3543 int float64_eq( float64 a, float64 b STATUS_PARAM )
3544 {
3545     uint64_t av, bv;
3546     a = float64_squash_input_denormal(a STATUS_VAR);
3547     b = float64_squash_input_denormal(b STATUS_VAR);
3548 
3549     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3550          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3551        ) {
3552         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3553             float_raise( float_flag_invalid STATUS_VAR);
3554         }
3555         return 0;
3556     }
3557     av = float64_val(a);
3558     bv = float64_val(b);
3559     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3560 
3561 }
3562 
3563 /*----------------------------------------------------------------------------
3564 | Returns 1 if the double-precision floating-point value `a' is less than or
3565 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3567 | Arithmetic.
3568 *----------------------------------------------------------------------------*/
3569 
3570 int float64_le( float64 a, float64 b STATUS_PARAM )
3571 {
3572     flag aSign, bSign;
3573     uint64_t av, bv;
3574     a = float64_squash_input_denormal(a STATUS_VAR);
3575     b = float64_squash_input_denormal(b STATUS_VAR);
3576 
3577     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3578          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3579        ) {
3580         float_raise( float_flag_invalid STATUS_VAR);
3581         return 0;
3582     }
3583     aSign = extractFloat64Sign( a );
3584     bSign = extractFloat64Sign( b );
3585     av = float64_val(a);
3586     bv = float64_val(b);
3587     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3588     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3589 
3590 }
3591 
3592 /*----------------------------------------------------------------------------
3593 | Returns 1 if the double-precision floating-point value `a' is less than
3594 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596 *----------------------------------------------------------------------------*/
3597 
3598 int float64_lt( float64 a, float64 b STATUS_PARAM )
3599 {
3600     flag aSign, bSign;
3601     uint64_t av, bv;
3602 
3603     a = float64_squash_input_denormal(a STATUS_VAR);
3604     b = float64_squash_input_denormal(b STATUS_VAR);
3605     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3607        ) {
3608         float_raise( float_flag_invalid STATUS_VAR);
3609         return 0;
3610     }
3611     aSign = extractFloat64Sign( a );
3612     bSign = extractFloat64Sign( b );
3613     av = float64_val(a);
3614     bv = float64_val(b);
3615     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3616     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3617 
3618 }
3619 
3620 /*----------------------------------------------------------------------------
3621 | Returns 1 if the double-precision floating-point value `a' is equal to the
3622 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3623 | if either operand is a NaN.  Otherwise, the comparison is performed
3624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625 *----------------------------------------------------------------------------*/
3626 
3627 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3628 {
3629     uint64_t av, bv;
3630     a = float64_squash_input_denormal(a STATUS_VAR);
3631     b = float64_squash_input_denormal(b STATUS_VAR);
3632 
3633     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3634          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3635        ) {
3636         float_raise( float_flag_invalid STATUS_VAR);
3637         return 0;
3638     }
3639     av = float64_val(a);
3640     bv = float64_val(b);
3641     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3642 
3643 }
3644 
3645 /*----------------------------------------------------------------------------
3646 | Returns 1 if the double-precision floating-point value `a' is less than or
3647 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3648 | cause an exception.  Otherwise, the comparison is performed according to the
3649 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3650 *----------------------------------------------------------------------------*/
3651 
3652 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3653 {
3654     flag aSign, bSign;
3655     uint64_t av, bv;
3656     a = float64_squash_input_denormal(a STATUS_VAR);
3657     b = float64_squash_input_denormal(b STATUS_VAR);
3658 
3659     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3660          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3661        ) {
3662         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3663             float_raise( float_flag_invalid STATUS_VAR);
3664         }
3665         return 0;
3666     }
3667     aSign = extractFloat64Sign( a );
3668     bSign = extractFloat64Sign( b );
3669     av = float64_val(a);
3670     bv = float64_val(b);
3671     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3672     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3673 
3674 }
3675 
3676 /*----------------------------------------------------------------------------
3677 | Returns 1 if the double-precision floating-point value `a' is less than
3678 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3679 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3680 | Standard for Binary Floating-Point Arithmetic.
3681 *----------------------------------------------------------------------------*/
3682 
3683 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3684 {
3685     flag aSign, bSign;
3686     uint64_t av, bv;
3687     a = float64_squash_input_denormal(a STATUS_VAR);
3688     b = float64_squash_input_denormal(b STATUS_VAR);
3689 
3690     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3691          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3692        ) {
3693         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3694             float_raise( float_flag_invalid STATUS_VAR);
3695         }
3696         return 0;
3697     }
3698     aSign = extractFloat64Sign( a );
3699     bSign = extractFloat64Sign( b );
3700     av = float64_val(a);
3701     bv = float64_val(b);
3702     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3703     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3704 
3705 }
3706 
3707 #ifdef FLOATX80
3708 
3709 /*----------------------------------------------------------------------------
3710 | Returns the result of converting the extended double-precision floating-
3711 | point value `a' to the 32-bit two's complement integer format.  The
3712 | conversion is performed according to the IEC/IEEE Standard for Binary
3713 | Floating-Point Arithmetic---which means in particular that the conversion
3714 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3715 | largest positive integer is returned.  Otherwise, if the conversion
3716 | overflows, the largest integer with the same sign as `a' is returned.
3717 *----------------------------------------------------------------------------*/
3718 
3719 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3720 {
3721     flag aSign;
3722     int32 aExp, shiftCount;
3723     uint64_t aSig;
3724 
3725     aSig = extractFloatx80Frac( a );
3726     aExp = extractFloatx80Exp( a );
3727     aSign = extractFloatx80Sign( a );
3728     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3729     shiftCount = 0x4037 - aExp;
3730     if ( shiftCount <= 0 ) shiftCount = 1;
3731     shift64RightJamming( aSig, shiftCount, &aSig );
3732     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3733 
3734 }
3735 
3736 /*----------------------------------------------------------------------------
3737 | Returns the result of converting the extended double-precision floating-
3738 | point value `a' to the 32-bit two's complement integer format.  The
3739 | conversion is performed according to the IEC/IEEE Standard for Binary
3740 | Floating-Point Arithmetic, except that the conversion is always rounded
3741 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3742 | Otherwise, if the conversion overflows, the largest integer with the same
3743 | sign as `a' is returned.
3744 *----------------------------------------------------------------------------*/
3745 
3746 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3747 {
3748     flag aSign;
3749     int32 aExp, shiftCount;
3750     uint64_t aSig, savedASig;
3751     int32 z;
3752 
3753     aSig = extractFloatx80Frac( a );
3754     aExp = extractFloatx80Exp( a );
3755     aSign = extractFloatx80Sign( a );
3756     if ( 0x401E < aExp ) {
3757         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3758         goto invalid;
3759     }
3760     else if ( aExp < 0x3FFF ) {
3761         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3762         return 0;
3763     }
3764     shiftCount = 0x403E - aExp;
3765     savedASig = aSig;
3766     aSig >>= shiftCount;
3767     z = aSig;
3768     if ( aSign ) z = - z;
3769     if ( ( z < 0 ) ^ aSign ) {
3770  invalid:
3771         float_raise( float_flag_invalid STATUS_VAR);
3772         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3773     }
3774     if ( ( aSig<<shiftCount ) != savedASig ) {
3775         STATUS(float_exception_flags) |= float_flag_inexact;
3776     }
3777     return z;
3778 
3779 }
3780 
3781 /*----------------------------------------------------------------------------
3782 | Returns the result of converting the extended double-precision floating-
3783 | point value `a' to the 64-bit two's complement integer format.  The
3784 | conversion is performed according to the IEC/IEEE Standard for Binary
3785 | Floating-Point Arithmetic---which means in particular that the conversion
3786 | is rounded according to the current rounding mode.  If `a' is a NaN,
3787 | the largest positive integer is returned.  Otherwise, if the conversion
3788 | overflows, the largest integer with the same sign as `a' is returned.
3789 *----------------------------------------------------------------------------*/
3790 
3791 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3792 {
3793     flag aSign;
3794     int32 aExp, shiftCount;
3795     uint64_t aSig, aSigExtra;
3796 
3797     aSig = extractFloatx80Frac( a );
3798     aExp = extractFloatx80Exp( a );
3799     aSign = extractFloatx80Sign( a );
3800     shiftCount = 0x403E - aExp;
3801     if ( shiftCount <= 0 ) {
3802         if ( shiftCount ) {
3803             float_raise( float_flag_invalid STATUS_VAR);
3804             if (    ! aSign
3805                  || (    ( aExp == 0x7FFF )
3806                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3807                ) {
3808                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3809             }
3810             return (int64_t) LIT64( 0x8000000000000000 );
3811         }
3812         aSigExtra = 0;
3813     }
3814     else {
3815         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3816     }
3817     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3818 
3819 }
3820 
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 64-bit two's complement integer format.  The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic, except that the conversion is always rounded
3826 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3827 | Otherwise, if the conversion overflows, the largest integer with the same
3828 | sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3830 
3831 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3832 {
3833     flag aSign;
3834     int32 aExp, shiftCount;
3835     uint64_t aSig;
3836     int64 z;
3837 
3838     aSig = extractFloatx80Frac( a );
3839     aExp = extractFloatx80Exp( a );
3840     aSign = extractFloatx80Sign( a );
3841     shiftCount = aExp - 0x403E;
3842     if ( 0 <= shiftCount ) {
3843         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3844         if ( ( a.high != 0xC03E ) || aSig ) {
3845             float_raise( float_flag_invalid STATUS_VAR);
3846             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3847                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3848             }
3849         }
3850         return (int64_t) LIT64( 0x8000000000000000 );
3851     }
3852     else if ( aExp < 0x3FFF ) {
3853         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3854         return 0;
3855     }
3856     z = aSig>>( - shiftCount );
3857     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3858         STATUS(float_exception_flags) |= float_flag_inexact;
3859     }
3860     if ( aSign ) z = - z;
3861     return z;
3862 
3863 }
3864 
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of converting the extended double-precision floating-
3867 | point value `a' to the single-precision floating-point format.  The
3868 | conversion is performed according to the IEC/IEEE Standard for Binary
3869 | Floating-Point Arithmetic.
3870 *----------------------------------------------------------------------------*/
3871 
3872 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3873 {
3874     flag aSign;
3875     int32 aExp;
3876     uint64_t aSig;
3877 
3878     aSig = extractFloatx80Frac( a );
3879     aExp = extractFloatx80Exp( a );
3880     aSign = extractFloatx80Sign( a );
3881     if ( aExp == 0x7FFF ) {
3882         if ( (uint64_t) ( aSig<<1 ) ) {
3883             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3884         }
3885         return packFloat32( aSign, 0xFF, 0 );
3886     }
3887     shift64RightJamming( aSig, 33, &aSig );
3888     if ( aExp || aSig ) aExp -= 0x3F81;
3889     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3890 
3891 }
3892 
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the extended double-precision floating-
3895 | point value `a' to the double-precision floating-point format.  The
3896 | conversion is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic.
3898 *----------------------------------------------------------------------------*/
3899 
3900 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3901 {
3902     flag aSign;
3903     int32 aExp;
3904     uint64_t aSig, zSig;
3905 
3906     aSig = extractFloatx80Frac( a );
3907     aExp = extractFloatx80Exp( a );
3908     aSign = extractFloatx80Sign( a );
3909     if ( aExp == 0x7FFF ) {
3910         if ( (uint64_t) ( aSig<<1 ) ) {
3911             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3912         }
3913         return packFloat64( aSign, 0x7FF, 0 );
3914     }
3915     shift64RightJamming( aSig, 1, &zSig );
3916     if ( aExp || aSig ) aExp -= 0x3C01;
3917     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3918 
3919 }
3920 
3921 #ifdef FLOAT128
3922 
3923 /*----------------------------------------------------------------------------
3924 | Returns the result of converting the extended double-precision floating-
3925 | point value `a' to the quadruple-precision floating-point format.  The
3926 | conversion is performed according to the IEC/IEEE Standard for Binary
3927 | Floating-Point Arithmetic.
3928 *----------------------------------------------------------------------------*/
3929 
3930 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3931 {
3932     flag aSign;
3933     int16 aExp;
3934     uint64_t aSig, zSig0, zSig1;
3935 
3936     aSig = extractFloatx80Frac( a );
3937     aExp = extractFloatx80Exp( a );
3938     aSign = extractFloatx80Sign( a );
3939     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
3940         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3941     }
3942     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3943     return packFloat128( aSign, aExp, zSig0, zSig1 );
3944 
3945 }
3946 
3947 #endif
3948 
3949 /*----------------------------------------------------------------------------
3950 | Rounds the extended double-precision floating-point value `a' to an integer,
3951 | and returns the result as an extended quadruple-precision floating-point
3952 | value.  The operation is performed according to the IEC/IEEE Standard for
3953 | Binary Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955 
3956 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3957 {
3958     flag aSign;
3959     int32 aExp;
3960     uint64_t lastBitMask, roundBitsMask;
3961     int8 roundingMode;
3962     floatx80 z;
3963 
3964     aExp = extractFloatx80Exp( a );
3965     if ( 0x403E <= aExp ) {
3966         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
3967             return propagateFloatx80NaN( a, a STATUS_VAR );
3968         }
3969         return a;
3970     }
3971     if ( aExp < 0x3FFF ) {
3972         if (    ( aExp == 0 )
3973              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3974             return a;
3975         }
3976         STATUS(float_exception_flags) |= float_flag_inexact;
3977         aSign = extractFloatx80Sign( a );
3978         switch ( STATUS(float_rounding_mode) ) {
3979          case float_round_nearest_even:
3980             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
3981                ) {
3982                 return
3983                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3984             }
3985             break;
3986          case float_round_down:
3987             return
3988                   aSign ?
3989                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3990                 : packFloatx80( 0, 0, 0 );
3991          case float_round_up:
3992             return
3993                   aSign ? packFloatx80( 1, 0, 0 )
3994                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3995         }
3996         return packFloatx80( aSign, 0, 0 );
3997     }
3998     lastBitMask = 1;
3999     lastBitMask <<= 0x403E - aExp;
4000     roundBitsMask = lastBitMask - 1;
4001     z = a;
4002     roundingMode = STATUS(float_rounding_mode);
4003     if ( roundingMode == float_round_nearest_even ) {
4004         z.low += lastBitMask>>1;
4005         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4006     }
4007     else if ( roundingMode != float_round_to_zero ) {
4008         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4009             z.low += roundBitsMask;
4010         }
4011     }
4012     z.low &= ~ roundBitsMask;
4013     if ( z.low == 0 ) {
4014         ++z.high;
4015         z.low = LIT64( 0x8000000000000000 );
4016     }
4017     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4018     return z;
4019 
4020 }
4021 
4022 /*----------------------------------------------------------------------------
4023 | Returns the result of adding the absolute values of the extended double-
4024 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4025 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4026 | The addition is performed according to the IEC/IEEE Standard for Binary
4027 | Floating-Point Arithmetic.
4028 *----------------------------------------------------------------------------*/
4029 
4030 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4031 {
4032     int32 aExp, bExp, zExp;
4033     uint64_t aSig, bSig, zSig0, zSig1;
4034     int32 expDiff;
4035 
4036     aSig = extractFloatx80Frac( a );
4037     aExp = extractFloatx80Exp( a );
4038     bSig = extractFloatx80Frac( b );
4039     bExp = extractFloatx80Exp( b );
4040     expDiff = aExp - bExp;
4041     if ( 0 < expDiff ) {
4042         if ( aExp == 0x7FFF ) {
4043             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4044             return a;
4045         }
4046         if ( bExp == 0 ) --expDiff;
4047         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4048         zExp = aExp;
4049     }
4050     else if ( expDiff < 0 ) {
4051         if ( bExp == 0x7FFF ) {
4052             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4054         }
4055         if ( aExp == 0 ) ++expDiff;
4056         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4057         zExp = bExp;
4058     }
4059     else {
4060         if ( aExp == 0x7FFF ) {
4061             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4062                 return propagateFloatx80NaN( a, b STATUS_VAR );
4063             }
4064             return a;
4065         }
4066         zSig1 = 0;
4067         zSig0 = aSig + bSig;
4068         if ( aExp == 0 ) {
4069             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4070             goto roundAndPack;
4071         }
4072         zExp = aExp;
4073         goto shiftRight1;
4074     }
4075     zSig0 = aSig + bSig;
4076     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4077  shiftRight1:
4078     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4079     zSig0 |= LIT64( 0x8000000000000000 );
4080     ++zExp;
4081  roundAndPack:
4082     return
4083         roundAndPackFloatx80(
4084             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4085 
4086 }
4087 
4088 /*----------------------------------------------------------------------------
4089 | Returns the result of subtracting the absolute values of the extended
4090 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4091 | difference is negated before being returned.  `zSign' is ignored if the
4092 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4093 | Standard for Binary Floating-Point Arithmetic.
4094 *----------------------------------------------------------------------------*/
4095 
4096 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4097 {
4098     int32 aExp, bExp, zExp;
4099     uint64_t aSig, bSig, zSig0, zSig1;
4100     int32 expDiff;
4101     floatx80 z;
4102 
4103     aSig = extractFloatx80Frac( a );
4104     aExp = extractFloatx80Exp( a );
4105     bSig = extractFloatx80Frac( b );
4106     bExp = extractFloatx80Exp( b );
4107     expDiff = aExp - bExp;
4108     if ( 0 < expDiff ) goto aExpBigger;
4109     if ( expDiff < 0 ) goto bExpBigger;
4110     if ( aExp == 0x7FFF ) {
4111         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4112             return propagateFloatx80NaN( a, b STATUS_VAR );
4113         }
4114         float_raise( float_flag_invalid STATUS_VAR);
4115         z.low = floatx80_default_nan_low;
4116         z.high = floatx80_default_nan_high;
4117         return z;
4118     }
4119     if ( aExp == 0 ) {
4120         aExp = 1;
4121         bExp = 1;
4122     }
4123     zSig1 = 0;
4124     if ( bSig < aSig ) goto aBigger;
4125     if ( aSig < bSig ) goto bBigger;
4126     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4127  bExpBigger:
4128     if ( bExp == 0x7FFF ) {
4129         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4130         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4131     }
4132     if ( aExp == 0 ) ++expDiff;
4133     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4134  bBigger:
4135     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4136     zExp = bExp;
4137     zSign ^= 1;
4138     goto normalizeRoundAndPack;
4139  aExpBigger:
4140     if ( aExp == 0x7FFF ) {
4141         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4142         return a;
4143     }
4144     if ( bExp == 0 ) --expDiff;
4145     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4146  aBigger:
4147     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4148     zExp = aExp;
4149  normalizeRoundAndPack:
4150     return
4151         normalizeRoundAndPackFloatx80(
4152             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4153 
4154 }
4155 
4156 /*----------------------------------------------------------------------------
4157 | Returns the result of adding the extended double-precision floating-point
4158 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4159 | Standard for Binary Floating-Point Arithmetic.
4160 *----------------------------------------------------------------------------*/
4161 
4162 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4163 {
4164     flag aSign, bSign;
4165 
4166     aSign = extractFloatx80Sign( a );
4167     bSign = extractFloatx80Sign( b );
4168     if ( aSign == bSign ) {
4169         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4170     }
4171     else {
4172         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4173     }
4174 
4175 }
4176 
4177 /*----------------------------------------------------------------------------
4178 | Returns the result of subtracting the extended double-precision floating-
4179 | point values `a' and `b'.  The operation is performed according to the
4180 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 *----------------------------------------------------------------------------*/
4182 
4183 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4184 {
4185     flag aSign, bSign;
4186 
4187     aSign = extractFloatx80Sign( a );
4188     bSign = extractFloatx80Sign( b );
4189     if ( aSign == bSign ) {
4190         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4191     }
4192     else {
4193         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4194     }
4195 
4196 }
4197 
4198 /*----------------------------------------------------------------------------
4199 | Returns the result of multiplying the extended double-precision floating-
4200 | point values `a' and `b'.  The operation is performed according to the
4201 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4202 *----------------------------------------------------------------------------*/
4203 
4204 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4205 {
4206     flag aSign, bSign, zSign;
4207     int32 aExp, bExp, zExp;
4208     uint64_t aSig, bSig, zSig0, zSig1;
4209     floatx80 z;
4210 
4211     aSig = extractFloatx80Frac( a );
4212     aExp = extractFloatx80Exp( a );
4213     aSign = extractFloatx80Sign( a );
4214     bSig = extractFloatx80Frac( b );
4215     bExp = extractFloatx80Exp( b );
4216     bSign = extractFloatx80Sign( b );
4217     zSign = aSign ^ bSign;
4218     if ( aExp == 0x7FFF ) {
4219         if (    (uint64_t) ( aSig<<1 )
4220              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4221             return propagateFloatx80NaN( a, b STATUS_VAR );
4222         }
4223         if ( ( bExp | bSig ) == 0 ) goto invalid;
4224         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4225     }
4226     if ( bExp == 0x7FFF ) {
4227         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4228         if ( ( aExp | aSig ) == 0 ) {
4229  invalid:
4230             float_raise( float_flag_invalid STATUS_VAR);
4231             z.low = floatx80_default_nan_low;
4232             z.high = floatx80_default_nan_high;
4233             return z;
4234         }
4235         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4236     }
4237     if ( aExp == 0 ) {
4238         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4239         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4240     }
4241     if ( bExp == 0 ) {
4242         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4243         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4244     }
4245     zExp = aExp + bExp - 0x3FFE;
4246     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4247     if ( 0 < (int64_t) zSig0 ) {
4248         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4249         --zExp;
4250     }
4251     return
4252         roundAndPackFloatx80(
4253             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4254 
4255 }
4256 
4257 /*----------------------------------------------------------------------------
4258 | Returns the result of dividing the extended double-precision floating-point
4259 | value `a' by the corresponding value `b'.  The operation is performed
4260 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4261 *----------------------------------------------------------------------------*/
4262 
4263 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4264 {
4265     flag aSign, bSign, zSign;
4266     int32 aExp, bExp, zExp;
4267     uint64_t aSig, bSig, zSig0, zSig1;
4268     uint64_t rem0, rem1, rem2, term0, term1, term2;
4269     floatx80 z;
4270 
4271     aSig = extractFloatx80Frac( a );
4272     aExp = extractFloatx80Exp( a );
4273     aSign = extractFloatx80Sign( a );
4274     bSig = extractFloatx80Frac( b );
4275     bExp = extractFloatx80Exp( b );
4276     bSign = extractFloatx80Sign( b );
4277     zSign = aSign ^ bSign;
4278     if ( aExp == 0x7FFF ) {
4279         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4280         if ( bExp == 0x7FFF ) {
4281             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4282             goto invalid;
4283         }
4284         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4285     }
4286     if ( bExp == 0x7FFF ) {
4287         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4288         return packFloatx80( zSign, 0, 0 );
4289     }
4290     if ( bExp == 0 ) {
4291         if ( bSig == 0 ) {
4292             if ( ( aExp | aSig ) == 0 ) {
4293  invalid:
4294                 float_raise( float_flag_invalid STATUS_VAR);
4295                 z.low = floatx80_default_nan_low;
4296                 z.high = floatx80_default_nan_high;
4297                 return z;
4298             }
4299             float_raise( float_flag_divbyzero STATUS_VAR);
4300             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4301         }
4302         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4303     }
4304     if ( aExp == 0 ) {
4305         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4306         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4307     }
4308     zExp = aExp - bExp + 0x3FFE;
4309     rem1 = 0;
4310     if ( bSig <= aSig ) {
4311         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4312         ++zExp;
4313     }
4314     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4315     mul64To128( bSig, zSig0, &term0, &term1 );
4316     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4317     while ( (int64_t) rem0 < 0 ) {
4318         --zSig0;
4319         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4320     }
4321     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4322     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4323         mul64To128( bSig, zSig1, &term1, &term2 );
4324         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4325         while ( (int64_t) rem1 < 0 ) {
4326             --zSig1;
4327             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4328         }
4329         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4330     }
4331     return
4332         roundAndPackFloatx80(
4333             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4334 
4335 }
4336 
4337 /*----------------------------------------------------------------------------
4338 | Returns the remainder of the extended double-precision floating-point value
4339 | `a' with respect to the corresponding value `b'.  The operation is performed
4340 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342 
4343 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4344 {
4345     flag aSign, zSign;
4346     int32 aExp, bExp, expDiff;
4347     uint64_t aSig0, aSig1, bSig;
4348     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4349     floatx80 z;
4350 
4351     aSig0 = extractFloatx80Frac( a );
4352     aExp = extractFloatx80Exp( a );
4353     aSign = extractFloatx80Sign( a );
4354     bSig = extractFloatx80Frac( b );
4355     bExp = extractFloatx80Exp( b );
4356     if ( aExp == 0x7FFF ) {
4357         if (    (uint64_t) ( aSig0<<1 )
4358              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4359             return propagateFloatx80NaN( a, b STATUS_VAR );
4360         }
4361         goto invalid;
4362     }
4363     if ( bExp == 0x7FFF ) {
4364         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4365         return a;
4366     }
4367     if ( bExp == 0 ) {
4368         if ( bSig == 0 ) {
4369  invalid:
4370             float_raise( float_flag_invalid STATUS_VAR);
4371             z.low = floatx80_default_nan_low;
4372             z.high = floatx80_default_nan_high;
4373             return z;
4374         }
4375         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4376     }
4377     if ( aExp == 0 ) {
4378         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4379         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4380     }
4381     bSig |= LIT64( 0x8000000000000000 );
4382     zSign = aSign;
4383     expDiff = aExp - bExp;
4384     aSig1 = 0;
4385     if ( expDiff < 0 ) {
4386         if ( expDiff < -1 ) return a;
4387         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4388         expDiff = 0;
4389     }
4390     q = ( bSig <= aSig0 );
4391     if ( q ) aSig0 -= bSig;
4392     expDiff -= 64;
4393     while ( 0 < expDiff ) {
4394         q = estimateDiv128To64( aSig0, aSig1, bSig );
4395         q = ( 2 < q ) ? q - 2 : 0;
4396         mul64To128( bSig, q, &term0, &term1 );
4397         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4398         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4399         expDiff -= 62;
4400     }
4401     expDiff += 64;
4402     if ( 0 < expDiff ) {
4403         q = estimateDiv128To64( aSig0, aSig1, bSig );
4404         q = ( 2 < q ) ? q - 2 : 0;
4405         q >>= 64 - expDiff;
4406         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4407         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4408         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4409         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4410             ++q;
4411             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4412         }
4413     }
4414     else {
4415         term1 = 0;
4416         term0 = bSig;
4417     }
4418     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4419     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4420          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4421               && ( q & 1 ) )
4422        ) {
4423         aSig0 = alternateASig0;
4424         aSig1 = alternateASig1;
4425         zSign = ! zSign;
4426     }
4427     return
4428         normalizeRoundAndPackFloatx80(
4429             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4430 
4431 }
4432 
4433 /*----------------------------------------------------------------------------
4434 | Returns the square root of the extended double-precision floating-point
4435 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4436 | for Binary Floating-Point Arithmetic.
4437 *----------------------------------------------------------------------------*/
4438 
4439 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4440 {
4441     flag aSign;
4442     int32 aExp, zExp;
4443     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4444     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4445     floatx80 z;
4446 
4447     aSig0 = extractFloatx80Frac( a );
4448     aExp = extractFloatx80Exp( a );
4449     aSign = extractFloatx80Sign( a );
4450     if ( aExp == 0x7FFF ) {
4451         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4452         if ( ! aSign ) return a;
4453         goto invalid;
4454     }
4455     if ( aSign ) {
4456         if ( ( aExp | aSig0 ) == 0 ) return a;
4457  invalid:
4458         float_raise( float_flag_invalid STATUS_VAR);
4459         z.low = floatx80_default_nan_low;
4460         z.high = floatx80_default_nan_high;
4461         return z;
4462     }
4463     if ( aExp == 0 ) {
4464         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4465         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4466     }
4467     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4468     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4469     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4470     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4471     doubleZSig0 = zSig0<<1;
4472     mul64To128( zSig0, zSig0, &term0, &term1 );
4473     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4474     while ( (int64_t) rem0 < 0 ) {
4475         --zSig0;
4476         doubleZSig0 -= 2;
4477         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4478     }
4479     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4480     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4481         if ( zSig1 == 0 ) zSig1 = 1;
4482         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4483         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4484         mul64To128( zSig1, zSig1, &term2, &term3 );
4485         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4486         while ( (int64_t) rem1 < 0 ) {
4487             --zSig1;
4488             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4489             term3 |= 1;
4490             term2 |= doubleZSig0;
4491             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4492         }
4493         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4494     }
4495     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4496     zSig0 |= doubleZSig0;
4497     return
4498         roundAndPackFloatx80(
4499             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4500 
4501 }
4502 
4503 /*----------------------------------------------------------------------------
4504 | Returns 1 if the extended double-precision floating-point value `a' is
4505 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4506 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4507 | Arithmetic.
4508 *----------------------------------------------------------------------------*/
4509 
4510 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4511 {
4512 
4513     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4514               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4515          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4516               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4517        ) {
4518         if (    floatx80_is_signaling_nan( a )
4519              || floatx80_is_signaling_nan( b ) ) {
4520             float_raise( float_flag_invalid STATUS_VAR);
4521         }
4522         return 0;
4523     }
4524     return
4525            ( a.low == b.low )
4526         && (    ( a.high == b.high )
4527              || (    ( a.low == 0 )
4528                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4529            );
4530 
4531 }
4532 
4533 /*----------------------------------------------------------------------------
4534 | Returns 1 if the extended double-precision floating-point value `a' is
4535 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4536 | comparison is performed according to the IEC/IEEE Standard for Binary
4537 | Floating-Point Arithmetic.
4538 *----------------------------------------------------------------------------*/
4539 
4540 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4541 {
4542     flag aSign, bSign;
4543 
4544     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4545               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4546          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4547               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4548        ) {
4549         float_raise( float_flag_invalid STATUS_VAR);
4550         return 0;
4551     }
4552     aSign = extractFloatx80Sign( a );
4553     bSign = extractFloatx80Sign( b );
4554     if ( aSign != bSign ) {
4555         return
4556                aSign
4557             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4558                  == 0 );
4559     }
4560     return
4561           aSign ? le128( b.high, b.low, a.high, a.low )
4562         : le128( a.high, a.low, b.high, b.low );
4563 
4564 }
4565 
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the extended double-precision floating-point value `a' is
4568 | less than the corresponding value `b', and 0 otherwise.  The comparison
4569 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4570 | Arithmetic.
4571 *----------------------------------------------------------------------------*/
4572 
4573 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4574 {
4575     flag aSign, bSign;
4576 
4577     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4578               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4580               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4581        ) {
4582         float_raise( float_flag_invalid STATUS_VAR);
4583         return 0;
4584     }
4585     aSign = extractFloatx80Sign( a );
4586     bSign = extractFloatx80Sign( b );
4587     if ( aSign != bSign ) {
4588         return
4589                aSign
4590             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4591                  != 0 );
4592     }
4593     return
4594           aSign ? lt128( b.high, b.low, a.high, a.low )
4595         : lt128( a.high, a.low, b.high, b.low );
4596 
4597 }
4598 
4599 /*----------------------------------------------------------------------------
4600 | Returns 1 if the extended double-precision floating-point value `a' is equal
4601 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4602 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4603 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4604 *----------------------------------------------------------------------------*/
4605 
4606 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4607 {
4608 
4609     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4610               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4611          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4612               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4613        ) {
4614         float_raise( float_flag_invalid STATUS_VAR);
4615         return 0;
4616     }
4617     return
4618            ( a.low == b.low )
4619         && (    ( a.high == b.high )
4620              || (    ( a.low == 0 )
4621                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4622            );
4623 
4624 }
4625 
4626 /*----------------------------------------------------------------------------
4627 | Returns 1 if the extended double-precision floating-point value `a' is less
4628 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4629 | do not cause an exception.  Otherwise, the comparison is performed according
4630 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4631 *----------------------------------------------------------------------------*/
4632 
4633 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4634 {
4635     flag aSign, bSign;
4636 
4637     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4638               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4639          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4640               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4641        ) {
4642         if (    floatx80_is_signaling_nan( a )
4643              || floatx80_is_signaling_nan( b ) ) {
4644             float_raise( float_flag_invalid STATUS_VAR);
4645         }
4646         return 0;
4647     }
4648     aSign = extractFloatx80Sign( a );
4649     bSign = extractFloatx80Sign( b );
4650     if ( aSign != bSign ) {
4651         return
4652                aSign
4653             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4654                  == 0 );
4655     }
4656     return
4657           aSign ? le128( b.high, b.low, a.high, a.low )
4658         : le128( a.high, a.low, b.high, b.low );
4659 
4660 }
4661 
4662 /*----------------------------------------------------------------------------
4663 | Returns 1 if the extended double-precision floating-point value `a' is less
4664 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4665 | an exception.  Otherwise, the comparison is performed according to the
4666 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4667 *----------------------------------------------------------------------------*/
4668 
4669 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4670 {
4671     flag aSign, bSign;
4672 
4673     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4674               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4675          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4676               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4677        ) {
4678         if (    floatx80_is_signaling_nan( a )
4679              || floatx80_is_signaling_nan( b ) ) {
4680             float_raise( float_flag_invalid STATUS_VAR);
4681         }
4682         return 0;
4683     }
4684     aSign = extractFloatx80Sign( a );
4685     bSign = extractFloatx80Sign( b );
4686     if ( aSign != bSign ) {
4687         return
4688                aSign
4689             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4690                  != 0 );
4691     }
4692     return
4693           aSign ? lt128( b.high, b.low, a.high, a.low )
4694         : lt128( a.high, a.low, b.high, b.low );
4695 
4696 }
4697 
4698 #endif
4699 
4700 #ifdef FLOAT128
4701 
4702 /*----------------------------------------------------------------------------
4703 | Returns the result of converting the quadruple-precision floating-point
4704 | value `a' to the 32-bit two's complement integer format.  The conversion
4705 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4706 | Arithmetic---which means in particular that the conversion is rounded
4707 | according to the current rounding mode.  If `a' is a NaN, the largest
4708 | positive integer is returned.  Otherwise, if the conversion overflows, the
4709 | largest integer with the same sign as `a' is returned.
4710 *----------------------------------------------------------------------------*/
4711 
4712 int32 float128_to_int32( float128 a STATUS_PARAM )
4713 {
4714     flag aSign;
4715     int32 aExp, shiftCount;
4716     uint64_t aSig0, aSig1;
4717 
4718     aSig1 = extractFloat128Frac1( a );
4719     aSig0 = extractFloat128Frac0( a );
4720     aExp = extractFloat128Exp( a );
4721     aSign = extractFloat128Sign( a );
4722     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4723     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4724     aSig0 |= ( aSig1 != 0 );
4725     shiftCount = 0x4028 - aExp;
4726     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4727     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4728 
4729 }
4730 
4731 /*----------------------------------------------------------------------------
4732 | Returns the result of converting the quadruple-precision floating-point
4733 | value `a' to the 32-bit two's complement integer format.  The conversion
4734 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4735 | Arithmetic, except that the conversion is always rounded toward zero.  If
4736 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4737 | conversion overflows, the largest integer with the same sign as `a' is
4738 | returned.
4739 *----------------------------------------------------------------------------*/
4740 
4741 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4742 {
4743     flag aSign;
4744     int32 aExp, shiftCount;
4745     uint64_t aSig0, aSig1, savedASig;
4746     int32 z;
4747 
4748     aSig1 = extractFloat128Frac1( a );
4749     aSig0 = extractFloat128Frac0( a );
4750     aExp = extractFloat128Exp( a );
4751     aSign = extractFloat128Sign( a );
4752     aSig0 |= ( aSig1 != 0 );
4753     if ( 0x401E < aExp ) {
4754         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4755         goto invalid;
4756     }
4757     else if ( aExp < 0x3FFF ) {
4758         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4759         return 0;
4760     }
4761     aSig0 |= LIT64( 0x0001000000000000 );
4762     shiftCount = 0x402F - aExp;
4763     savedASig = aSig0;
4764     aSig0 >>= shiftCount;
4765     z = aSig0;
4766     if ( aSign ) z = - z;
4767     if ( ( z < 0 ) ^ aSign ) {
4768  invalid:
4769         float_raise( float_flag_invalid STATUS_VAR);
4770         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4771     }
4772     if ( ( aSig0<<shiftCount ) != savedASig ) {
4773         STATUS(float_exception_flags) |= float_flag_inexact;
4774     }
4775     return z;
4776 
4777 }
4778 
4779 /*----------------------------------------------------------------------------
4780 | Returns the result of converting the quadruple-precision floating-point
4781 | value `a' to the 64-bit two's complement integer format.  The conversion
4782 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4783 | Arithmetic---which means in particular that the conversion is rounded
4784 | according to the current rounding mode.  If `a' is a NaN, the largest
4785 | positive integer is returned.  Otherwise, if the conversion overflows, the
4786 | largest integer with the same sign as `a' is returned.
4787 *----------------------------------------------------------------------------*/
4788 
4789 int64 float128_to_int64( float128 a STATUS_PARAM )
4790 {
4791     flag aSign;
4792     int32 aExp, shiftCount;
4793     uint64_t aSig0, aSig1;
4794 
4795     aSig1 = extractFloat128Frac1( a );
4796     aSig0 = extractFloat128Frac0( a );
4797     aExp = extractFloat128Exp( a );
4798     aSign = extractFloat128Sign( a );
4799     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4800     shiftCount = 0x402F - aExp;
4801     if ( shiftCount <= 0 ) {
4802         if ( 0x403E < aExp ) {
4803             float_raise( float_flag_invalid STATUS_VAR);
4804             if (    ! aSign
4805                  || (    ( aExp == 0x7FFF )
4806                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4807                     )
4808                ) {
4809                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4810             }
4811             return (int64_t) LIT64( 0x8000000000000000 );
4812         }
4813         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4814     }
4815     else {
4816         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4817     }
4818     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4819 
4820 }
4821 
4822 /*----------------------------------------------------------------------------
4823 | Returns the result of converting the quadruple-precision floating-point
4824 | value `a' to the 64-bit two's complement integer format.  The conversion
4825 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4826 | Arithmetic, except that the conversion is always rounded toward zero.
4827 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4828 | the conversion overflows, the largest integer with the same sign as `a' is
4829 | returned.
4830 *----------------------------------------------------------------------------*/
4831 
4832 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4833 {
4834     flag aSign;
4835     int32 aExp, shiftCount;
4836     uint64_t aSig0, aSig1;
4837     int64 z;
4838 
4839     aSig1 = extractFloat128Frac1( a );
4840     aSig0 = extractFloat128Frac0( a );
4841     aExp = extractFloat128Exp( a );
4842     aSign = extractFloat128Sign( a );
4843     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4844     shiftCount = aExp - 0x402F;
4845     if ( 0 < shiftCount ) {
4846         if ( 0x403E <= aExp ) {
4847             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4848             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4849                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4850                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4851             }
4852             else {
4853                 float_raise( float_flag_invalid STATUS_VAR);
4854                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4855                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4856                 }
4857             }
4858             return (int64_t) LIT64( 0x8000000000000000 );
4859         }
4860         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4861         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4862             STATUS(float_exception_flags) |= float_flag_inexact;
4863         }
4864     }
4865     else {
4866         if ( aExp < 0x3FFF ) {
4867             if ( aExp | aSig0 | aSig1 ) {
4868                 STATUS(float_exception_flags) |= float_flag_inexact;
4869             }
4870             return 0;
4871         }
4872         z = aSig0>>( - shiftCount );
4873         if (    aSig1
4874              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4875             STATUS(float_exception_flags) |= float_flag_inexact;
4876         }
4877     }
4878     if ( aSign ) z = - z;
4879     return z;
4880 
4881 }
4882 
4883 /*----------------------------------------------------------------------------
4884 | Returns the result of converting the quadruple-precision floating-point
4885 | value `a' to the single-precision floating-point format.  The conversion
4886 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4887 | Arithmetic.
4888 *----------------------------------------------------------------------------*/
4889 
4890 float32 float128_to_float32( float128 a STATUS_PARAM )
4891 {
4892     flag aSign;
4893     int32 aExp;
4894     uint64_t aSig0, aSig1;
4895     uint32_t zSig;
4896 
4897     aSig1 = extractFloat128Frac1( a );
4898     aSig0 = extractFloat128Frac0( a );
4899     aExp = extractFloat128Exp( a );
4900     aSign = extractFloat128Sign( a );
4901     if ( aExp == 0x7FFF ) {
4902         if ( aSig0 | aSig1 ) {
4903             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4904         }
4905         return packFloat32( aSign, 0xFF, 0 );
4906     }
4907     aSig0 |= ( aSig1 != 0 );
4908     shift64RightJamming( aSig0, 18, &aSig0 );
4909     zSig = aSig0;
4910     if ( aExp || zSig ) {
4911         zSig |= 0x40000000;
4912         aExp -= 0x3F81;
4913     }
4914     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4915 
4916 }
4917 
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of converting the quadruple-precision floating-point
4920 | value `a' to the double-precision floating-point format.  The conversion
4921 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4922 | Arithmetic.
4923 *----------------------------------------------------------------------------*/
4924 
4925 float64 float128_to_float64( float128 a STATUS_PARAM )
4926 {
4927     flag aSign;
4928     int32 aExp;
4929     uint64_t aSig0, aSig1;
4930 
4931     aSig1 = extractFloat128Frac1( a );
4932     aSig0 = extractFloat128Frac0( a );
4933     aExp = extractFloat128Exp( a );
4934     aSign = extractFloat128Sign( a );
4935     if ( aExp == 0x7FFF ) {
4936         if ( aSig0 | aSig1 ) {
4937             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4938         }
4939         return packFloat64( aSign, 0x7FF, 0 );
4940     }
4941     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4942     aSig0 |= ( aSig1 != 0 );
4943     if ( aExp || aSig0 ) {
4944         aSig0 |= LIT64( 0x4000000000000000 );
4945         aExp -= 0x3C01;
4946     }
4947     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4948 
4949 }
4950 
4951 #ifdef FLOATX80
4952 
4953 /*----------------------------------------------------------------------------
4954 | Returns the result of converting the quadruple-precision floating-point
4955 | value `a' to the extended double-precision floating-point format.  The
4956 | conversion is performed according to the IEC/IEEE Standard for Binary
4957 | Floating-Point Arithmetic.
4958 *----------------------------------------------------------------------------*/
4959 
4960 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4961 {
4962     flag aSign;
4963     int32 aExp;
4964     uint64_t aSig0, aSig1;
4965 
4966     aSig1 = extractFloat128Frac1( a );
4967     aSig0 = extractFloat128Frac0( a );
4968     aExp = extractFloat128Exp( a );
4969     aSign = extractFloat128Sign( a );
4970     if ( aExp == 0x7FFF ) {
4971         if ( aSig0 | aSig1 ) {
4972             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4973         }
4974         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4975     }
4976     if ( aExp == 0 ) {
4977         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4978         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4979     }
4980     else {
4981         aSig0 |= LIT64( 0x0001000000000000 );
4982     }
4983     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4984     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4985 
4986 }
4987 
4988 #endif
4989 
4990 /*----------------------------------------------------------------------------
4991 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4992 | returns the result as a quadruple-precision floating-point value.  The
4993 | operation is performed according to the IEC/IEEE Standard for Binary
4994 | Floating-Point Arithmetic.
4995 *----------------------------------------------------------------------------*/
4996 
4997 float128 float128_round_to_int( float128 a STATUS_PARAM )
4998 {
4999     flag aSign;
5000     int32 aExp;
5001     uint64_t lastBitMask, roundBitsMask;
5002     int8 roundingMode;
5003     float128 z;
5004 
5005     aExp = extractFloat128Exp( a );
5006     if ( 0x402F <= aExp ) {
5007         if ( 0x406F <= aExp ) {
5008             if (    ( aExp == 0x7FFF )
5009                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5010                ) {
5011                 return propagateFloat128NaN( a, a STATUS_VAR );
5012             }
5013             return a;
5014         }
5015         lastBitMask = 1;
5016         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5017         roundBitsMask = lastBitMask - 1;
5018         z = a;
5019         roundingMode = STATUS(float_rounding_mode);
5020         if ( roundingMode == float_round_nearest_even ) {
5021             if ( lastBitMask ) {
5022                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5023                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5024             }
5025             else {
5026                 if ( (int64_t) z.low < 0 ) {
5027                     ++z.high;
5028                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5029                 }
5030             }
5031         }
5032         else if ( roundingMode != float_round_to_zero ) {
5033             if (   extractFloat128Sign( z )
5034                  ^ ( roundingMode == float_round_up ) ) {
5035                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5036             }
5037         }
5038         z.low &= ~ roundBitsMask;
5039     }
5040     else {
5041         if ( aExp < 0x3FFF ) {
5042             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5043             STATUS(float_exception_flags) |= float_flag_inexact;
5044             aSign = extractFloat128Sign( a );
5045             switch ( STATUS(float_rounding_mode) ) {
5046              case float_round_nearest_even:
5047                 if (    ( aExp == 0x3FFE )
5048                      && (   extractFloat128Frac0( a )
5049                           | extractFloat128Frac1( a ) )
5050                    ) {
5051                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5052                 }
5053                 break;
5054              case float_round_down:
5055                 return
5056                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5057                     : packFloat128( 0, 0, 0, 0 );
5058              case float_round_up:
5059                 return
5060                       aSign ? packFloat128( 1, 0, 0, 0 )
5061                     : packFloat128( 0, 0x3FFF, 0, 0 );
5062             }
5063             return packFloat128( aSign, 0, 0, 0 );
5064         }
5065         lastBitMask = 1;
5066         lastBitMask <<= 0x402F - aExp;
5067         roundBitsMask = lastBitMask - 1;
5068         z.low = 0;
5069         z.high = a.high;
5070         roundingMode = STATUS(float_rounding_mode);
5071         if ( roundingMode == float_round_nearest_even ) {
5072             z.high += lastBitMask>>1;
5073             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5074                 z.high &= ~ lastBitMask;
5075             }
5076         }
5077         else if ( roundingMode != float_round_to_zero ) {
5078             if (   extractFloat128Sign( z )
5079                  ^ ( roundingMode == float_round_up ) ) {
5080                 z.high |= ( a.low != 0 );
5081                 z.high += roundBitsMask;
5082             }
5083         }
5084         z.high &= ~ roundBitsMask;
5085     }
5086     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5087         STATUS(float_exception_flags) |= float_flag_inexact;
5088     }
5089     return z;
5090 
5091 }
5092 
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of adding the absolute values of the quadruple-precision
5095 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5096 | before being returned.  `zSign' is ignored if the result is a NaN.
5097 | The addition is performed according to the IEC/IEEE Standard for Binary
5098 | Floating-Point Arithmetic.
5099 *----------------------------------------------------------------------------*/
5100 
5101 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5102 {
5103     int32 aExp, bExp, zExp;
5104     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5105     int32 expDiff;
5106 
5107     aSig1 = extractFloat128Frac1( a );
5108     aSig0 = extractFloat128Frac0( a );
5109     aExp = extractFloat128Exp( a );
5110     bSig1 = extractFloat128Frac1( b );
5111     bSig0 = extractFloat128Frac0( b );
5112     bExp = extractFloat128Exp( b );
5113     expDiff = aExp - bExp;
5114     if ( 0 < expDiff ) {
5115         if ( aExp == 0x7FFF ) {
5116             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5117             return a;
5118         }
5119         if ( bExp == 0 ) {
5120             --expDiff;
5121         }
5122         else {
5123             bSig0 |= LIT64( 0x0001000000000000 );
5124         }
5125         shift128ExtraRightJamming(
5126             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5127         zExp = aExp;
5128     }
5129     else if ( expDiff < 0 ) {
5130         if ( bExp == 0x7FFF ) {
5131             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5132             return packFloat128( zSign, 0x7FFF, 0, 0 );
5133         }
5134         if ( aExp == 0 ) {
5135             ++expDiff;
5136         }
5137         else {
5138             aSig0 |= LIT64( 0x0001000000000000 );
5139         }
5140         shift128ExtraRightJamming(
5141             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5142         zExp = bExp;
5143     }
5144     else {
5145         if ( aExp == 0x7FFF ) {
5146             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5147                 return propagateFloat128NaN( a, b STATUS_VAR );
5148             }
5149             return a;
5150         }
5151         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5152         if ( aExp == 0 ) {
5153             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5154             return packFloat128( zSign, 0, zSig0, zSig1 );
5155         }
5156         zSig2 = 0;
5157         zSig0 |= LIT64( 0x0002000000000000 );
5158         zExp = aExp;
5159         goto shiftRight1;
5160     }
5161     aSig0 |= LIT64( 0x0001000000000000 );
5162     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5163     --zExp;
5164     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5165     ++zExp;
5166  shiftRight1:
5167     shift128ExtraRightJamming(
5168         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5169  roundAndPack:
5170     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5171 
5172 }
5173 
5174 /*----------------------------------------------------------------------------
5175 | Returns the result of subtracting the absolute values of the quadruple-
5176 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5177 | difference is negated before being returned.  `zSign' is ignored if the
5178 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5179 | Standard for Binary Floating-Point Arithmetic.
5180 *----------------------------------------------------------------------------*/
5181 
5182 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5183 {
5184     int32 aExp, bExp, zExp;
5185     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5186     int32 expDiff;
5187     float128 z;
5188 
5189     aSig1 = extractFloat128Frac1( a );
5190     aSig0 = extractFloat128Frac0( a );
5191     aExp = extractFloat128Exp( a );
5192     bSig1 = extractFloat128Frac1( b );
5193     bSig0 = extractFloat128Frac0( b );
5194     bExp = extractFloat128Exp( b );
5195     expDiff = aExp - bExp;
5196     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5197     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5198     if ( 0 < expDiff ) goto aExpBigger;
5199     if ( expDiff < 0 ) goto bExpBigger;
5200     if ( aExp == 0x7FFF ) {
5201         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5202             return propagateFloat128NaN( a, b STATUS_VAR );
5203         }
5204         float_raise( float_flag_invalid STATUS_VAR);
5205         z.low = float128_default_nan_low;
5206         z.high = float128_default_nan_high;
5207         return z;
5208     }
5209     if ( aExp == 0 ) {
5210         aExp = 1;
5211         bExp = 1;
5212     }
5213     if ( bSig0 < aSig0 ) goto aBigger;
5214     if ( aSig0 < bSig0 ) goto bBigger;
5215     if ( bSig1 < aSig1 ) goto aBigger;
5216     if ( aSig1 < bSig1 ) goto bBigger;
5217     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5218  bExpBigger:
5219     if ( bExp == 0x7FFF ) {
5220         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5221         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5222     }
5223     if ( aExp == 0 ) {
5224         ++expDiff;
5225     }
5226     else {
5227         aSig0 |= LIT64( 0x4000000000000000 );
5228     }
5229     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5230     bSig0 |= LIT64( 0x4000000000000000 );
5231  bBigger:
5232     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5233     zExp = bExp;
5234     zSign ^= 1;
5235     goto normalizeRoundAndPack;
5236  aExpBigger:
5237     if ( aExp == 0x7FFF ) {
5238         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5239         return a;
5240     }
5241     if ( bExp == 0 ) {
5242         --expDiff;
5243     }
5244     else {
5245         bSig0 |= LIT64( 0x4000000000000000 );
5246     }
5247     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5248     aSig0 |= LIT64( 0x4000000000000000 );
5249  aBigger:
5250     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5251     zExp = aExp;
5252  normalizeRoundAndPack:
5253     --zExp;
5254     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5255 
5256 }
5257 
5258 /*----------------------------------------------------------------------------
5259 | Returns the result of adding the quadruple-precision floating-point values
5260 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5261 | for Binary Floating-Point Arithmetic.
5262 *----------------------------------------------------------------------------*/
5263 
5264 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5265 {
5266     flag aSign, bSign;
5267 
5268     aSign = extractFloat128Sign( a );
5269     bSign = extractFloat128Sign( b );
5270     if ( aSign == bSign ) {
5271         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5272     }
5273     else {
5274         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5275     }
5276 
5277 }
5278 
5279 /*----------------------------------------------------------------------------
5280 | Returns the result of subtracting the quadruple-precision floating-point
5281 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284 
5285 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5286 {
5287     flag aSign, bSign;
5288 
5289     aSign = extractFloat128Sign( a );
5290     bSign = extractFloat128Sign( b );
5291     if ( aSign == bSign ) {
5292         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5293     }
5294     else {
5295         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5296     }
5297 
5298 }
5299 
5300 /*----------------------------------------------------------------------------
5301 | Returns the result of multiplying the quadruple-precision floating-point
5302 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5303 | Standard for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5305 
5306 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5307 {
5308     flag aSign, bSign, zSign;
5309     int32 aExp, bExp, zExp;
5310     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5311     float128 z;
5312 
5313     aSig1 = extractFloat128Frac1( a );
5314     aSig0 = extractFloat128Frac0( a );
5315     aExp = extractFloat128Exp( a );
5316     aSign = extractFloat128Sign( a );
5317     bSig1 = extractFloat128Frac1( b );
5318     bSig0 = extractFloat128Frac0( b );
5319     bExp = extractFloat128Exp( b );
5320     bSign = extractFloat128Sign( b );
5321     zSign = aSign ^ bSign;
5322     if ( aExp == 0x7FFF ) {
5323         if (    ( aSig0 | aSig1 )
5324              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5325             return propagateFloat128NaN( a, b STATUS_VAR );
5326         }
5327         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5328         return packFloat128( zSign, 0x7FFF, 0, 0 );
5329     }
5330     if ( bExp == 0x7FFF ) {
5331         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5332         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5333  invalid:
5334             float_raise( float_flag_invalid STATUS_VAR);
5335             z.low = float128_default_nan_low;
5336             z.high = float128_default_nan_high;
5337             return z;
5338         }
5339         return packFloat128( zSign, 0x7FFF, 0, 0 );
5340     }
5341     if ( aExp == 0 ) {
5342         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5343         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5344     }
5345     if ( bExp == 0 ) {
5346         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5347         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5348     }
5349     zExp = aExp + bExp - 0x4000;
5350     aSig0 |= LIT64( 0x0001000000000000 );
5351     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5352     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5353     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5354     zSig2 |= ( zSig3 != 0 );
5355     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5356         shift128ExtraRightJamming(
5357             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5358         ++zExp;
5359     }
5360     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5361 
5362 }
5363 
5364 /*----------------------------------------------------------------------------
5365 | Returns the result of dividing the quadruple-precision floating-point value
5366 | `a' by the corresponding value `b'.  The operation is performed according to
5367 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368 *----------------------------------------------------------------------------*/
5369 
5370 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5371 {
5372     flag aSign, bSign, zSign;
5373     int32 aExp, bExp, zExp;
5374     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5375     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5376     float128 z;
5377 
5378     aSig1 = extractFloat128Frac1( a );
5379     aSig0 = extractFloat128Frac0( a );
5380     aExp = extractFloat128Exp( a );
5381     aSign = extractFloat128Sign( a );
5382     bSig1 = extractFloat128Frac1( b );
5383     bSig0 = extractFloat128Frac0( b );
5384     bExp = extractFloat128Exp( b );
5385     bSign = extractFloat128Sign( b );
5386     zSign = aSign ^ bSign;
5387     if ( aExp == 0x7FFF ) {
5388         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5389         if ( bExp == 0x7FFF ) {
5390             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5391             goto invalid;
5392         }
5393         return packFloat128( zSign, 0x7FFF, 0, 0 );
5394     }
5395     if ( bExp == 0x7FFF ) {
5396         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5397         return packFloat128( zSign, 0, 0, 0 );
5398     }
5399     if ( bExp == 0 ) {
5400         if ( ( bSig0 | bSig1 ) == 0 ) {
5401             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5402  invalid:
5403                 float_raise( float_flag_invalid STATUS_VAR);
5404                 z.low = float128_default_nan_low;
5405                 z.high = float128_default_nan_high;
5406                 return z;
5407             }
5408             float_raise( float_flag_divbyzero STATUS_VAR);
5409             return packFloat128( zSign, 0x7FFF, 0, 0 );
5410         }
5411         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5412     }
5413     if ( aExp == 0 ) {
5414         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5415         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5416     }
5417     zExp = aExp - bExp + 0x3FFD;
5418     shortShift128Left(
5419         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5420     shortShift128Left(
5421         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5422     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5423         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5424         ++zExp;
5425     }
5426     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5427     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5428     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5429     while ( (int64_t) rem0 < 0 ) {
5430         --zSig0;
5431         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5432     }
5433     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5434     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5435         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5436         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5437         while ( (int64_t) rem1 < 0 ) {
5438             --zSig1;
5439             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5440         }
5441         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5442     }
5443     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5444     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5445 
5446 }
5447 
5448 /*----------------------------------------------------------------------------
5449 | Returns the remainder of the quadruple-precision floating-point value `a'
5450 | with respect to the corresponding value `b'.  The operation is performed
5451 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5452 *----------------------------------------------------------------------------*/
5453 
5454 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5455 {
5456     flag aSign, zSign;
5457     int32 aExp, bExp, expDiff;
5458     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5459     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5460     int64_t sigMean0;
5461     float128 z;
5462 
5463     aSig1 = extractFloat128Frac1( a );
5464     aSig0 = extractFloat128Frac0( a );
5465     aExp = extractFloat128Exp( a );
5466     aSign = extractFloat128Sign( a );
5467     bSig1 = extractFloat128Frac1( b );
5468     bSig0 = extractFloat128Frac0( b );
5469     bExp = extractFloat128Exp( b );
5470     if ( aExp == 0x7FFF ) {
5471         if (    ( aSig0 | aSig1 )
5472              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5473             return propagateFloat128NaN( a, b STATUS_VAR );
5474         }
5475         goto invalid;
5476     }
5477     if ( bExp == 0x7FFF ) {
5478         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5479         return a;
5480     }
5481     if ( bExp == 0 ) {
5482         if ( ( bSig0 | bSig1 ) == 0 ) {
5483  invalid:
5484             float_raise( float_flag_invalid STATUS_VAR);
5485             z.low = float128_default_nan_low;
5486             z.high = float128_default_nan_high;
5487             return z;
5488         }
5489         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5490     }
5491     if ( aExp == 0 ) {
5492         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5493         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5494     }
5495     expDiff = aExp - bExp;
5496     if ( expDiff < -1 ) return a;
5497     shortShift128Left(
5498         aSig0 | LIT64( 0x0001000000000000 ),
5499         aSig1,
5500         15 - ( expDiff < 0 ),
5501         &aSig0,
5502         &aSig1
5503     );
5504     shortShift128Left(
5505         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5506     q = le128( bSig0, bSig1, aSig0, aSig1 );
5507     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5508     expDiff -= 64;
5509     while ( 0 < expDiff ) {
5510         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5511         q = ( 4 < q ) ? q - 4 : 0;
5512         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5513         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5514         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5515         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5516         expDiff -= 61;
5517     }
5518     if ( -64 < expDiff ) {
5519         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5520         q = ( 4 < q ) ? q - 4 : 0;
5521         q >>= - expDiff;
5522         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5523         expDiff += 52;
5524         if ( expDiff < 0 ) {
5525             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5526         }
5527         else {
5528             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5529         }
5530         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5531         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5532     }
5533     else {
5534         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5535         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5536     }
5537     do {
5538         alternateASig0 = aSig0;
5539         alternateASig1 = aSig1;
5540         ++q;
5541         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5542     } while ( 0 <= (int64_t) aSig0 );
5543     add128(
5544         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5545     if (    ( sigMean0 < 0 )
5546          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5547         aSig0 = alternateASig0;
5548         aSig1 = alternateASig1;
5549     }
5550     zSign = ( (int64_t) aSig0 < 0 );
5551     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5552     return
5553         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5554 
5555 }
5556 
5557 /*----------------------------------------------------------------------------
5558 | Returns the square root of the quadruple-precision floating-point value `a'.
5559 | The operation is performed according to the IEC/IEEE Standard for Binary
5560 | Floating-Point Arithmetic.
5561 *----------------------------------------------------------------------------*/
5562 
5563 float128 float128_sqrt( float128 a STATUS_PARAM )
5564 {
5565     flag aSign;
5566     int32 aExp, zExp;
5567     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5568     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5569     float128 z;
5570 
5571     aSig1 = extractFloat128Frac1( a );
5572     aSig0 = extractFloat128Frac0( a );
5573     aExp = extractFloat128Exp( a );
5574     aSign = extractFloat128Sign( a );
5575     if ( aExp == 0x7FFF ) {
5576         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5577         if ( ! aSign ) return a;
5578         goto invalid;
5579     }
5580     if ( aSign ) {
5581         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5582  invalid:
5583         float_raise( float_flag_invalid STATUS_VAR);
5584         z.low = float128_default_nan_low;
5585         z.high = float128_default_nan_high;
5586         return z;
5587     }
5588     if ( aExp == 0 ) {
5589         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5590         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5591     }
5592     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5593     aSig0 |= LIT64( 0x0001000000000000 );
5594     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5595     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5596     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5597     doubleZSig0 = zSig0<<1;
5598     mul64To128( zSig0, zSig0, &term0, &term1 );
5599     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5600     while ( (int64_t) rem0 < 0 ) {
5601         --zSig0;
5602         doubleZSig0 -= 2;
5603         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5604     }
5605     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5606     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5607         if ( zSig1 == 0 ) zSig1 = 1;
5608         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5609         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5610         mul64To128( zSig1, zSig1, &term2, &term3 );
5611         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5612         while ( (int64_t) rem1 < 0 ) {
5613             --zSig1;
5614             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5615             term3 |= 1;
5616             term2 |= doubleZSig0;
5617             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5618         }
5619         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5620     }
5621     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5622     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5623 
5624 }
5625 
5626 /*----------------------------------------------------------------------------
5627 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5628 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5630 *----------------------------------------------------------------------------*/
5631 
5632 int float128_eq( float128 a, float128 b STATUS_PARAM )
5633 {
5634 
5635     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5636               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5637          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5638               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5639        ) {
5640         if (    float128_is_signaling_nan( a )
5641              || float128_is_signaling_nan( b ) ) {
5642             float_raise( float_flag_invalid STATUS_VAR);
5643         }
5644         return 0;
5645     }
5646     return
5647            ( a.low == b.low )
5648         && (    ( a.high == b.high )
5649              || (    ( a.low == 0 )
5650                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5651            );
5652 
5653 }
5654 
5655 /*----------------------------------------------------------------------------
5656 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5657 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic.
5660 *----------------------------------------------------------------------------*/
5661 
5662 int float128_le( float128 a, float128 b STATUS_PARAM )
5663 {
5664     flag aSign, bSign;
5665 
5666     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5667               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5668          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5669               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5670        ) {
5671         float_raise( float_flag_invalid STATUS_VAR);
5672         return 0;
5673     }
5674     aSign = extractFloat128Sign( a );
5675     bSign = extractFloat128Sign( b );
5676     if ( aSign != bSign ) {
5677         return
5678                aSign
5679             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5680                  == 0 );
5681     }
5682     return
5683           aSign ? le128( b.high, b.low, a.high, a.low )
5684         : le128( a.high, a.low, b.high, b.low );
5685 
5686 }
5687 
5688 /*----------------------------------------------------------------------------
5689 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5690 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5692 *----------------------------------------------------------------------------*/
5693 
5694 int float128_lt( float128 a, float128 b STATUS_PARAM )
5695 {
5696     flag aSign, bSign;
5697 
5698     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5699               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5700          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5701               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5702        ) {
5703         float_raise( float_flag_invalid STATUS_VAR);
5704         return 0;
5705     }
5706     aSign = extractFloat128Sign( a );
5707     bSign = extractFloat128Sign( b );
5708     if ( aSign != bSign ) {
5709         return
5710                aSign
5711             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5712                  != 0 );
5713     }
5714     return
5715           aSign ? lt128( b.high, b.low, a.high, a.low )
5716         : lt128( a.high, a.low, b.high, b.low );
5717 
5718 }
5719 
5720 /*----------------------------------------------------------------------------
5721 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5722 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5723 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5724 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5725 *----------------------------------------------------------------------------*/
5726 
5727 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5728 {
5729 
5730     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5731               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5732          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5733               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5734        ) {
5735         float_raise( float_flag_invalid STATUS_VAR);
5736         return 0;
5737     }
5738     return
5739            ( a.low == b.low )
5740         && (    ( a.high == b.high )
5741              || (    ( a.low == 0 )
5742                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5743            );
5744 
5745 }
5746 
5747 /*----------------------------------------------------------------------------
5748 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5749 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5750 | cause an exception.  Otherwise, the comparison is performed according to the
5751 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5752 *----------------------------------------------------------------------------*/
5753 
5754 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5755 {
5756     flag aSign, bSign;
5757 
5758     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5759               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5760          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5761               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5762        ) {
5763         if (    float128_is_signaling_nan( a )
5764              || float128_is_signaling_nan( b ) ) {
5765             float_raise( float_flag_invalid STATUS_VAR);
5766         }
5767         return 0;
5768     }
5769     aSign = extractFloat128Sign( a );
5770     bSign = extractFloat128Sign( b );
5771     if ( aSign != bSign ) {
5772         return
5773                aSign
5774             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5775                  == 0 );
5776     }
5777     return
5778           aSign ? le128( b.high, b.low, a.high, a.low )
5779         : le128( a.high, a.low, b.high, b.low );
5780 
5781 }
5782 
5783 /*----------------------------------------------------------------------------
5784 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5785 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5786 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5787 | Standard for Binary Floating-Point Arithmetic.
5788 *----------------------------------------------------------------------------*/
5789 
5790 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5791 {
5792     flag aSign, bSign;
5793 
5794     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5795               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5796          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5797               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5798        ) {
5799         if (    float128_is_signaling_nan( a )
5800              || float128_is_signaling_nan( b ) ) {
5801             float_raise( float_flag_invalid STATUS_VAR);
5802         }
5803         return 0;
5804     }
5805     aSign = extractFloat128Sign( a );
5806     bSign = extractFloat128Sign( b );
5807     if ( aSign != bSign ) {
5808         return
5809                aSign
5810             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5811                  != 0 );
5812     }
5813     return
5814           aSign ? lt128( b.high, b.low, a.high, a.low )
5815         : lt128( a.high, a.low, b.high, b.low );
5816 
5817 }
5818 
5819 #endif
5820 
5821 /* misc functions */
5822 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5823 {
5824     return int64_to_float32(a STATUS_VAR);
5825 }
5826 
5827 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5828 {
5829     return int64_to_float64(a STATUS_VAR);
5830 }
5831 
5832 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5833 {
5834     int64_t v;
5835     unsigned int res;
5836 
5837     v = float32_to_int64(a STATUS_VAR);
5838     if (v < 0) {
5839         res = 0;
5840         float_raise( float_flag_invalid STATUS_VAR);
5841     } else if (v > 0xffffffff) {
5842         res = 0xffffffff;
5843         float_raise( float_flag_invalid STATUS_VAR);
5844     } else {
5845         res = v;
5846     }
5847     return res;
5848 }
5849 
5850 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5851 {
5852     int64_t v;
5853     unsigned int res;
5854 
5855     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5856     if (v < 0) {
5857         res = 0;
5858         float_raise( float_flag_invalid STATUS_VAR);
5859     } else if (v > 0xffffffff) {
5860         res = 0xffffffff;
5861         float_raise( float_flag_invalid STATUS_VAR);
5862     } else {
5863         res = v;
5864     }
5865     return res;
5866 }
5867 
5868 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5869 {
5870     int64_t v;
5871     unsigned int res;
5872 
5873     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5874     if (v < 0) {
5875         res = 0;
5876         float_raise( float_flag_invalid STATUS_VAR);
5877     } else if (v > 0xffff) {
5878         res = 0xffff;
5879         float_raise( float_flag_invalid STATUS_VAR);
5880     } else {
5881         res = v;
5882     }
5883     return res;
5884 }
5885 
5886 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5887 {
5888     int64_t v;
5889     unsigned int res;
5890 
5891     v = float64_to_int64(a STATUS_VAR);
5892     if (v < 0) {
5893         res = 0;
5894         float_raise( float_flag_invalid STATUS_VAR);
5895     } else if (v > 0xffffffff) {
5896         res = 0xffffffff;
5897         float_raise( float_flag_invalid STATUS_VAR);
5898     } else {
5899         res = v;
5900     }
5901     return res;
5902 }
5903 
5904 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5905 {
5906     int64_t v;
5907     unsigned int res;
5908 
5909     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5910     if (v < 0) {
5911         res = 0;
5912         float_raise( float_flag_invalid STATUS_VAR);
5913     } else if (v > 0xffffffff) {
5914         res = 0xffffffff;
5915         float_raise( float_flag_invalid STATUS_VAR);
5916     } else {
5917         res = v;
5918     }
5919     return res;
5920 }
5921 
5922 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5923 {
5924     int64_t v;
5925     unsigned int res;
5926 
5927     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5928     if (v < 0) {
5929         res = 0;
5930         float_raise( float_flag_invalid STATUS_VAR);
5931     } else if (v > 0xffff) {
5932         res = 0xffff;
5933         float_raise( float_flag_invalid STATUS_VAR);
5934     } else {
5935         res = v;
5936     }
5937     return res;
5938 }
5939 
5940 /* FIXME: This looks broken.  */
5941 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5942 {
5943     int64_t v;
5944 
5945     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5946     v += float64_val(a);
5947     v = float64_to_int64(make_float64(v) STATUS_VAR);
5948 
5949     return v - INT64_MIN;
5950 }
5951 
5952 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5953 {
5954     int64_t v;
5955 
5956     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5957     v += float64_val(a);
5958     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5959 
5960     return v - INT64_MIN;
5961 }
5962 
5963 #define COMPARE(s, nan_exp)                                                  \
5964 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5965                                       int is_quiet STATUS_PARAM )            \
5966 {                                                                            \
5967     flag aSign, bSign;                                                       \
5968     uint ## s ## _t av, bv;                                                  \
5969     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5970     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5971                                                                              \
5972     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5973          extractFloat ## s ## Frac( a ) ) ||                                 \
5974         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5975           extractFloat ## s ## Frac( b ) )) {                                \
5976         if (!is_quiet ||                                                     \
5977             float ## s ## _is_signaling_nan( a ) ||                          \
5978             float ## s ## _is_signaling_nan( b ) ) {                         \
5979             float_raise( float_flag_invalid STATUS_VAR);                     \
5980         }                                                                    \
5981         return float_relation_unordered;                                     \
5982     }                                                                        \
5983     aSign = extractFloat ## s ## Sign( a );                                  \
5984     bSign = extractFloat ## s ## Sign( b );                                  \
5985     av = float ## s ## _val(a);                                              \
5986     bv = float ## s ## _val(b);                                              \
5987     if ( aSign != bSign ) {                                                  \
5988         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
5989             /* zero case */                                                  \
5990             return float_relation_equal;                                     \
5991         } else {                                                             \
5992             return 1 - (2 * aSign);                                          \
5993         }                                                                    \
5994     } else {                                                                 \
5995         if (av == bv) {                                                      \
5996             return float_relation_equal;                                     \
5997         } else {                                                             \
5998             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5999         }                                                                    \
6000     }                                                                        \
6001 }                                                                            \
6002                                                                              \
6003 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6004 {                                                                            \
6005     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6006 }                                                                            \
6007                                                                              \
6008 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6009 {                                                                            \
6010     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6011 }
6012 
6013 COMPARE(32, 0xff)
6014 COMPARE(64, 0x7ff)
6015 
6016 INLINE int float128_compare_internal( float128 a, float128 b,
6017                                       int is_quiet STATUS_PARAM )
6018 {
6019     flag aSign, bSign;
6020 
6021     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6022           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6023         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6024           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6025         if (!is_quiet ||
6026             float128_is_signaling_nan( a ) ||
6027             float128_is_signaling_nan( b ) ) {
6028             float_raise( float_flag_invalid STATUS_VAR);
6029         }
6030         return float_relation_unordered;
6031     }
6032     aSign = extractFloat128Sign( a );
6033     bSign = extractFloat128Sign( b );
6034     if ( aSign != bSign ) {
6035         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6036             /* zero case */
6037             return float_relation_equal;
6038         } else {
6039             return 1 - (2 * aSign);
6040         }
6041     } else {
6042         if (a.low == b.low && a.high == b.high) {
6043             return float_relation_equal;
6044         } else {
6045             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6046         }
6047     }
6048 }
6049 
6050 int float128_compare( float128 a, float128 b STATUS_PARAM )
6051 {
6052     return float128_compare_internal(a, b, 0 STATUS_VAR);
6053 }
6054 
6055 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6056 {
6057     return float128_compare_internal(a, b, 1 STATUS_VAR);
6058 }
6059 
6060 /* min() and max() functions. These can't be implemented as
6061  * 'compare and pick one input' because that would mishandle
6062  * NaNs and +0 vs -0.
6063  */
6064 #define MINMAX(s, nan_exp)                                              \
6065 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6066                                         int ismin STATUS_PARAM )        \
6067 {                                                                       \
6068     flag aSign, bSign;                                                  \
6069     uint ## s ## _t av, bv;                                             \
6070     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6071     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6072     if (float ## s ## _is_any_nan(a) ||                                 \
6073         float ## s ## _is_any_nan(b)) {                                 \
6074         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6075     }                                                                   \
6076     aSign = extractFloat ## s ## Sign(a);                               \
6077     bSign = extractFloat ## s ## Sign(b);                               \
6078     av = float ## s ## _val(a);                                         \
6079     bv = float ## s ## _val(b);                                         \
6080     if (aSign != bSign) {                                               \
6081         if (ismin) {                                                    \
6082             return aSign ? a : b;                                       \
6083         } else {                                                        \
6084             return aSign ? b : a;                                       \
6085         }                                                               \
6086     } else {                                                            \
6087         if (ismin) {                                                    \
6088             return (aSign ^ (av < bv)) ? a : b;                         \
6089         } else {                                                        \
6090             return (aSign ^ (av < bv)) ? b : a;                         \
6091         }                                                               \
6092     }                                                                   \
6093 }                                                                       \
6094                                                                         \
6095 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6096 {                                                                       \
6097     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6098 }                                                                       \
6099                                                                         \
6100 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6101 {                                                                       \
6102     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6103 }
6104 
6105 MINMAX(32, 0xff)
6106 MINMAX(64, 0x7ff)
6107 
6108 
6109 /* Multiply A by 2 raised to the power N.  */
6110 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6111 {
6112     flag aSign;
6113     int16 aExp;
6114     uint32_t aSig;
6115 
6116     a = float32_squash_input_denormal(a STATUS_VAR);
6117     aSig = extractFloat32Frac( a );
6118     aExp = extractFloat32Exp( a );
6119     aSign = extractFloat32Sign( a );
6120 
6121     if ( aExp == 0xFF ) {
6122         return a;
6123     }
6124     if ( aExp != 0 )
6125         aSig |= 0x00800000;
6126     else if ( aSig == 0 )
6127         return a;
6128 
6129     aExp += n - 1;
6130     aSig <<= 7;
6131     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6132 }
6133 
6134 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6135 {
6136     flag aSign;
6137     int16 aExp;
6138     uint64_t aSig;
6139 
6140     a = float64_squash_input_denormal(a STATUS_VAR);
6141     aSig = extractFloat64Frac( a );
6142     aExp = extractFloat64Exp( a );
6143     aSign = extractFloat64Sign( a );
6144 
6145     if ( aExp == 0x7FF ) {
6146         return a;
6147     }
6148     if ( aExp != 0 )
6149         aSig |= LIT64( 0x0010000000000000 );
6150     else if ( aSig == 0 )
6151         return a;
6152 
6153     aExp += n - 1;
6154     aSig <<= 10;
6155     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6156 }
6157 
6158 #ifdef FLOATX80
6159 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6160 {
6161     flag aSign;
6162     int16 aExp;
6163     uint64_t aSig;
6164 
6165     aSig = extractFloatx80Frac( a );
6166     aExp = extractFloatx80Exp( a );
6167     aSign = extractFloatx80Sign( a );
6168 
6169     if ( aExp == 0x7FF ) {
6170         return a;
6171     }
6172     if (aExp == 0 && aSig == 0)
6173         return a;
6174 
6175     aExp += n;
6176     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6177                                           aSign, aExp, aSig, 0 STATUS_VAR );
6178 }
6179 #endif
6180 
6181 #ifdef FLOAT128
6182 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6183 {
6184     flag aSign;
6185     int32 aExp;
6186     uint64_t aSig0, aSig1;
6187 
6188     aSig1 = extractFloat128Frac1( a );
6189     aSig0 = extractFloat128Frac0( a );
6190     aExp = extractFloat128Exp( a );
6191     aSign = extractFloat128Sign( a );
6192     if ( aExp == 0x7FFF ) {
6193         return a;
6194     }
6195     if ( aExp != 0 )
6196         aSig0 |= LIT64( 0x0001000000000000 );
6197     else if ( aSig0 == 0 && aSig1 == 0 )
6198         return a;
6199 
6200     aExp += n - 1;
6201     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6202                                           STATUS_VAR );
6203 
6204 }
6205 #endif
6206