xref: /openbmc/qemu/fpu/softfloat.c (revision 71af014f1451bec3244e086298813b5aa7b2a0ee)
1 
2 /*============================================================================
3 
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6 
7 Written by John R. Hauser.  This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704.  Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980.  The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16 
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25 
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30 
31 =============================================================================*/
32 
33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
34    be flushed to zero.  */
35 #include "softfloat.h"
36 
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations.  (Can be specialized to target if
40 | desired.)
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
43 
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine:  (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output.  These details are target-
50 | specific.
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
53 
54 void set_float_rounding_mode(int val STATUS_PARAM)
55 {
56     STATUS(float_rounding_mode) = val;
57 }
58 
59 void set_float_exception_flags(int val STATUS_PARAM)
60 {
61     STATUS(float_exception_flags) = val;
62 }
63 
64 #ifdef FLOATX80
65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
66 {
67     STATUS(floatx80_rounding_precision) = val;
68 }
69 #endif
70 
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input.  If `zSign' is 1, the input is negated before being converted to an
75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer.  However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
81 
82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83 {
84     int8 roundingMode;
85     flag roundNearestEven;
86     int8 roundIncrement, roundBits;
87     int32 z;
88 
89     roundingMode = STATUS(float_rounding_mode);
90     roundNearestEven = ( roundingMode == float_round_nearest_even );
91     roundIncrement = 0x40;
92     if ( ! roundNearestEven ) {
93         if ( roundingMode == float_round_to_zero ) {
94             roundIncrement = 0;
95         }
96         else {
97             roundIncrement = 0x7F;
98             if ( zSign ) {
99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
100             }
101             else {
102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
103             }
104         }
105     }
106     roundBits = absZ & 0x7F;
107     absZ = ( absZ + roundIncrement )>>7;
108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109     z = absZ;
110     if ( zSign ) z = - z;
111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112         float_raise( float_flag_invalid STATUS_VAR);
113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114     }
115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116     return z;
117 
118 }
119 
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer.  However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
129 | returned.
130 *----------------------------------------------------------------------------*/
131 
132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133 {
134     int8 roundingMode;
135     flag roundNearestEven, increment;
136     int64 z;
137 
138     roundingMode = STATUS(float_rounding_mode);
139     roundNearestEven = ( roundingMode == float_round_nearest_even );
140     increment = ( (sbits64) absZ1 < 0 );
141     if ( ! roundNearestEven ) {
142         if ( roundingMode == float_round_to_zero ) {
143             increment = 0;
144         }
145         else {
146             if ( zSign ) {
147                 increment = ( roundingMode == float_round_down ) && absZ1;
148             }
149             else {
150                 increment = ( roundingMode == float_round_up ) && absZ1;
151             }
152         }
153     }
154     if ( increment ) {
155         ++absZ0;
156         if ( absZ0 == 0 ) goto overflow;
157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158     }
159     z = absZ0;
160     if ( zSign ) z = - z;
161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
162  overflow:
163         float_raise( float_flag_invalid STATUS_VAR);
164         return
165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
166             : LIT64( 0x7FFFFFFFFFFFFFFF );
167     }
168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169     return z;
170 
171 }
172 
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
176 
177 INLINE bits32 extractFloat32Frac( float32 a )
178 {
179 
180     return float32_val(a) & 0x007FFFFF;
181 
182 }
183 
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
187 
188 INLINE int16 extractFloat32Exp( float32 a )
189 {
190 
191     return ( float32_val(a)>>23 ) & 0xFF;
192 
193 }
194 
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
198 
199 INLINE flag extractFloat32Sign( float32 a )
200 {
201 
202     return float32_val(a)>>31;
203 
204 }
205 
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'.  The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
212 
213 static void
214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
215 {
216     int8 shiftCount;
217 
218     shiftCount = countLeadingZeros32( aSig ) - 8;
219     *zSigPtr = aSig<<shiftCount;
220     *zExpPtr = 1 - shiftCount;
221 
222 }
223 
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result.  After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result.  This means that any integer portion of `zSig'
229 | will be added into the exponent.  Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
232 | significand.
233 *----------------------------------------------------------------------------*/
234 
235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236 {
237 
238     return make_float32(
239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240 
241 }
242 
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input.  Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly.  However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned.  If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 |     The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location.  This shifted
257 | significand must be normalized or smaller.  If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding.  In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
264 
265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266 {
267     int8 roundingMode;
268     flag roundNearestEven;
269     int8 roundIncrement, roundBits;
270     flag isTiny;
271 
272     roundingMode = STATUS(float_rounding_mode);
273     roundNearestEven = ( roundingMode == float_round_nearest_even );
274     roundIncrement = 0x40;
275     if ( ! roundNearestEven ) {
276         if ( roundingMode == float_round_to_zero ) {
277             roundIncrement = 0;
278         }
279         else {
280             roundIncrement = 0x7F;
281             if ( zSign ) {
282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
283             }
284             else {
285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
286             }
287         }
288     }
289     roundBits = zSig & 0x7F;
290     if ( 0xFD <= (bits16) zExp ) {
291         if (    ( 0xFD < zExp )
292              || (    ( zExp == 0xFD )
293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294            ) {
295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297         }
298         if ( zExp < 0 ) {
299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300             isTiny =
301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302                 || ( zExp < -1 )
303                 || ( zSig + roundIncrement < 0x80000000 );
304             shift32RightJamming( zSig, - zExp, &zSig );
305             zExp = 0;
306             roundBits = zSig & 0x7F;
307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308         }
309     }
310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311     zSig = ( zSig + roundIncrement )>>7;
312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313     if ( zSig == 0 ) zExp = 0;
314     return packFloat32( zSign, zExp, zSig );
315 
316 }
317 
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input.  This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
326 
327 static float32
328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329 {
330     int8 shiftCount;
331 
332     shiftCount = countLeadingZeros32( zSig ) - 1;
333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334 
335 }
336 
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
340 
341 INLINE bits64 extractFloat64Frac( float64 a )
342 {
343 
344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345 
346 }
347 
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
351 
352 INLINE int16 extractFloat64Exp( float64 a )
353 {
354 
355     return ( float64_val(a)>>52 ) & 0x7FF;
356 
357 }
358 
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
362 
363 INLINE flag extractFloat64Sign( float64 a )
364 {
365 
366     return float64_val(a)>>63;
367 
368 }
369 
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'.  The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
376 
377 static void
378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
379 {
380     int8 shiftCount;
381 
382     shiftCount = countLeadingZeros64( aSig ) - 11;
383     *zSigPtr = aSig<<shiftCount;
384     *zExpPtr = 1 - shiftCount;
385 
386 }
387 
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result.  After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result.  This means that any integer portion of `zSig'
393 | will be added into the exponent.  Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
396 | significand.
397 *----------------------------------------------------------------------------*/
398 
399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400 {
401 
402     return make_float64(
403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404 
405 }
406 
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input.  Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly.  However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned.  If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 |     The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location.  This shifted
421 | significand must be normalized or smaller.  If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding.  In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
428 
429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430 {
431     int8 roundingMode;
432     flag roundNearestEven;
433     int16 roundIncrement, roundBits;
434     flag isTiny;
435 
436     roundingMode = STATUS(float_rounding_mode);
437     roundNearestEven = ( roundingMode == float_round_nearest_even );
438     roundIncrement = 0x200;
439     if ( ! roundNearestEven ) {
440         if ( roundingMode == float_round_to_zero ) {
441             roundIncrement = 0;
442         }
443         else {
444             roundIncrement = 0x3FF;
445             if ( zSign ) {
446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
447             }
448             else {
449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
450             }
451         }
452     }
453     roundBits = zSig & 0x3FF;
454     if ( 0x7FD <= (bits16) zExp ) {
455         if (    ( 0x7FD < zExp )
456              || (    ( zExp == 0x7FD )
457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458            ) {
459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461         }
462         if ( zExp < 0 ) {
463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464             isTiny =
465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466                 || ( zExp < -1 )
467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468             shift64RightJamming( zSig, - zExp, &zSig );
469             zExp = 0;
470             roundBits = zSig & 0x3FF;
471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472         }
473     }
474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475     zSig = ( zSig + roundIncrement )>>10;
476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477     if ( zSig == 0 ) zExp = 0;
478     return packFloat64( zSign, zExp, zSig );
479 
480 }
481 
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input.  This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
490 
491 static float64
492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493 {
494     int8 shiftCount;
495 
496     shiftCount = countLeadingZeros64( zSig ) - 1;
497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498 
499 }
500 
501 #ifdef FLOATX80
502 
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
505 | value `a'.
506 *----------------------------------------------------------------------------*/
507 
508 INLINE bits64 extractFloatx80Frac( floatx80 a )
509 {
510 
511     return a.low;
512 
513 }
514 
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
517 | value `a'.
518 *----------------------------------------------------------------------------*/
519 
520 INLINE int32 extractFloatx80Exp( floatx80 a )
521 {
522 
523     return a.high & 0x7FFF;
524 
525 }
526 
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
529 | `a'.
530 *----------------------------------------------------------------------------*/
531 
532 INLINE flag extractFloatx80Sign( floatx80 a )
533 {
534 
535     return a.high>>15;
536 
537 }
538 
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'.  The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
545 
546 static void
547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
548 {
549     int8 shiftCount;
550 
551     shiftCount = countLeadingZeros64( aSig );
552     *zSigPtr = aSig<<shiftCount;
553     *zExpPtr = 1 - shiftCount;
554 
555 }
556 
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
561 
562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563 {
564     floatx80 z;
565 
566     z.low = zSig;
567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568     return z;
569 
570 }
571 
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input.  Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly.  However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned.  If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively.  Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
588 | format.
589 |     The input significand must be normalized or smaller.  If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding.  The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
595 
596 static floatx80
597  roundAndPackFloatx80(
598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599  STATUS_PARAM)
600 {
601     int8 roundingMode;
602     flag roundNearestEven, increment, isTiny;
603     int64 roundIncrement, roundMask, roundBits;
604 
605     roundingMode = STATUS(float_rounding_mode);
606     roundNearestEven = ( roundingMode == float_round_nearest_even );
607     if ( roundingPrecision == 80 ) goto precision80;
608     if ( roundingPrecision == 64 ) {
609         roundIncrement = LIT64( 0x0000000000000400 );
610         roundMask = LIT64( 0x00000000000007FF );
611     }
612     else if ( roundingPrecision == 32 ) {
613         roundIncrement = LIT64( 0x0000008000000000 );
614         roundMask = LIT64( 0x000000FFFFFFFFFF );
615     }
616     else {
617         goto precision80;
618     }
619     zSig0 |= ( zSig1 != 0 );
620     if ( ! roundNearestEven ) {
621         if ( roundingMode == float_round_to_zero ) {
622             roundIncrement = 0;
623         }
624         else {
625             roundIncrement = roundMask;
626             if ( zSign ) {
627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
628             }
629             else {
630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
631             }
632         }
633     }
634     roundBits = zSig0 & roundMask;
635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636         if (    ( 0x7FFE < zExp )
637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638            ) {
639             goto overflow;
640         }
641         if ( zExp <= 0 ) {
642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643             isTiny =
644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645                 || ( zExp < 0 )
646                 || ( zSig0 <= zSig0 + roundIncrement );
647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648             zExp = 0;
649             roundBits = zSig0 & roundMask;
650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652             zSig0 += roundIncrement;
653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
654             roundIncrement = roundMask + 1;
655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656                 roundMask |= roundIncrement;
657             }
658             zSig0 &= ~ roundMask;
659             return packFloatx80( zSign, zExp, zSig0 );
660         }
661     }
662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663     zSig0 += roundIncrement;
664     if ( zSig0 < roundIncrement ) {
665         ++zExp;
666         zSig0 = LIT64( 0x8000000000000000 );
667     }
668     roundIncrement = roundMask + 1;
669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670         roundMask |= roundIncrement;
671     }
672     zSig0 &= ~ roundMask;
673     if ( zSig0 == 0 ) zExp = 0;
674     return packFloatx80( zSign, zExp, zSig0 );
675  precision80:
676     increment = ( (sbits64) zSig1 < 0 );
677     if ( ! roundNearestEven ) {
678         if ( roundingMode == float_round_to_zero ) {
679             increment = 0;
680         }
681         else {
682             if ( zSign ) {
683                 increment = ( roundingMode == float_round_down ) && zSig1;
684             }
685             else {
686                 increment = ( roundingMode == float_round_up ) && zSig1;
687             }
688         }
689     }
690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691         if (    ( 0x7FFE < zExp )
692              || (    ( zExp == 0x7FFE )
693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694                   && increment
695                 )
696            ) {
697             roundMask = 0;
698  overflow:
699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700             if (    ( roundingMode == float_round_to_zero )
701                  || ( zSign && ( roundingMode == float_round_up ) )
702                  || ( ! zSign && ( roundingMode == float_round_down ) )
703                ) {
704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705             }
706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707         }
708         if ( zExp <= 0 ) {
709             isTiny =
710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711                 || ( zExp < 0 )
712                 || ! increment
713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715             zExp = 0;
716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718             if ( roundNearestEven ) {
719                 increment = ( (sbits64) zSig1 < 0 );
720             }
721             else {
722                 if ( zSign ) {
723                     increment = ( roundingMode == float_round_down ) && zSig1;
724                 }
725                 else {
726                     increment = ( roundingMode == float_round_up ) && zSig1;
727                 }
728             }
729             if ( increment ) {
730                 ++zSig0;
731                 zSig0 &=
732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
734             }
735             return packFloatx80( zSign, zExp, zSig0 );
736         }
737     }
738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739     if ( increment ) {
740         ++zSig0;
741         if ( zSig0 == 0 ) {
742             ++zExp;
743             zSig0 = LIT64( 0x8000000000000000 );
744         }
745         else {
746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747         }
748     }
749     else {
750         if ( zSig0 == 0 ) zExp = 0;
751     }
752     return packFloatx80( zSign, zExp, zSig0 );
753 
754 }
755 
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input.  This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
762 | normalized.
763 *----------------------------------------------------------------------------*/
764 
765 static floatx80
766  normalizeRoundAndPackFloatx80(
767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768  STATUS_PARAM)
769 {
770     int8 shiftCount;
771 
772     if ( zSig0 == 0 ) {
773         zSig0 = zSig1;
774         zSig1 = 0;
775         zExp -= 64;
776     }
777     shiftCount = countLeadingZeros64( zSig0 );
778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779     zExp -= shiftCount;
780     return
781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782 
783 }
784 
785 #endif
786 
787 #ifdef FLOAT128
788 
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
793 
794 INLINE bits64 extractFloat128Frac1( float128 a )
795 {
796 
797     return a.low;
798 
799 }
800 
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
805 
806 INLINE bits64 extractFloat128Frac0( float128 a )
807 {
808 
809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810 
811 }
812 
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
815 | `a'.
816 *----------------------------------------------------------------------------*/
817 
818 INLINE int32 extractFloat128Exp( float128 a )
819 {
820 
821     return ( a.high>>48 ) & 0x7FFF;
822 
823 }
824 
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
828 
829 INLINE flag extractFloat128Sign( float128 a )
830 {
831 
832     return a.high>>63;
833 
834 }
835 
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
845 
846 static void
847  normalizeFloat128Subnormal(
848      bits64 aSig0,
849      bits64 aSig1,
850      int32 *zExpPtr,
851      bits64 *zSig0Ptr,
852      bits64 *zSig1Ptr
853  )
854 {
855     int8 shiftCount;
856 
857     if ( aSig0 == 0 ) {
858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
859         if ( shiftCount < 0 ) {
860             *zSig0Ptr = aSig1>>( - shiftCount );
861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
862         }
863         else {
864             *zSig0Ptr = aSig1<<shiftCount;
865             *zSig1Ptr = 0;
866         }
867         *zExpPtr = - shiftCount - 63;
868     }
869     else {
870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872         *zExpPtr = 1 - shiftCount;
873     }
874 
875 }
876 
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result.  After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result.  This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887 | significand.
888 *----------------------------------------------------------------------------*/
889 
890 INLINE float128
891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892 {
893     float128 z;
894 
895     z.low = zSig1;
896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897     return z;
898 
899 }
900 
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input.  Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly.  However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned.  If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 |     The input significand must be normalized or smaller.  If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding.  In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent.  The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
921 
922 static float128
923  roundAndPackFloat128(
924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925 {
926     int8 roundingMode;
927     flag roundNearestEven, increment, isTiny;
928 
929     roundingMode = STATUS(float_rounding_mode);
930     roundNearestEven = ( roundingMode == float_round_nearest_even );
931     increment = ( (sbits64) zSig2 < 0 );
932     if ( ! roundNearestEven ) {
933         if ( roundingMode == float_round_to_zero ) {
934             increment = 0;
935         }
936         else {
937             if ( zSign ) {
938                 increment = ( roundingMode == float_round_down ) && zSig2;
939             }
940             else {
941                 increment = ( roundingMode == float_round_up ) && zSig2;
942             }
943         }
944     }
945     if ( 0x7FFD <= (bits32) zExp ) {
946         if (    ( 0x7FFD < zExp )
947              || (    ( zExp == 0x7FFD )
948                   && eq128(
949                          LIT64( 0x0001FFFFFFFFFFFF ),
950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
951                          zSig0,
952                          zSig1
953                      )
954                   && increment
955                 )
956            ) {
957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958             if (    ( roundingMode == float_round_to_zero )
959                  || ( zSign && ( roundingMode == float_round_up ) )
960                  || ( ! zSign && ( roundingMode == float_round_down ) )
961                ) {
962                 return
963                     packFloat128(
964                         zSign,
965                         0x7FFE,
966                         LIT64( 0x0000FFFFFFFFFFFF ),
967                         LIT64( 0xFFFFFFFFFFFFFFFF )
968                     );
969             }
970             return packFloat128( zSign, 0x7FFF, 0, 0 );
971         }
972         if ( zExp < 0 ) {
973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974             isTiny =
975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976                 || ( zExp < -1 )
977                 || ! increment
978                 || lt128(
979                        zSig0,
980                        zSig1,
981                        LIT64( 0x0001FFFFFFFFFFFF ),
982                        LIT64( 0xFFFFFFFFFFFFFFFF )
983                    );
984             shift128ExtraRightJamming(
985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986             zExp = 0;
987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988             if ( roundNearestEven ) {
989                 increment = ( (sbits64) zSig2 < 0 );
990             }
991             else {
992                 if ( zSign ) {
993                     increment = ( roundingMode == float_round_down ) && zSig2;
994                 }
995                 else {
996                     increment = ( roundingMode == float_round_up ) && zSig2;
997                 }
998             }
999         }
1000     }
1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002     if ( increment ) {
1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005     }
1006     else {
1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008     }
1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
1010 
1011 }
1012 
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1022 
1023 static float128
1024  normalizeRoundAndPackFloat128(
1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026 {
1027     int8 shiftCount;
1028     bits64 zSig2;
1029 
1030     if ( zSig0 == 0 ) {
1031         zSig0 = zSig1;
1032         zSig1 = 0;
1033         zExp -= 64;
1034     }
1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036     if ( 0 <= shiftCount ) {
1037         zSig2 = 0;
1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039     }
1040     else {
1041         shift128ExtraRightJamming(
1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043     }
1044     zExp -= shiftCount;
1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046 
1047 }
1048 
1049 #endif
1050 
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format.  The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1056 
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1058 {
1059     flag zSign;
1060 
1061     if ( a == 0 ) return float32_zero;
1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063     zSign = ( a < 0 );
1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065 
1066 }
1067 
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format.  The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1073 
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1075 {
1076     flag zSign;
1077     uint32 absA;
1078     int8 shiftCount;
1079     bits64 zSig;
1080 
1081     if ( a == 0 ) return float64_zero;
1082     zSign = ( a < 0 );
1083     absA = zSign ? - a : a;
1084     shiftCount = countLeadingZeros32( absA ) + 21;
1085     zSig = absA;
1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087 
1088 }
1089 
1090 #ifdef FLOATX80
1091 
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format.  The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098 
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100 {
1101     flag zSign;
1102     uint32 absA;
1103     int8 shiftCount;
1104     bits64 zSig;
1105 
1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107     zSign = ( a < 0 );
1108     absA = zSign ? - a : a;
1109     shiftCount = countLeadingZeros32( absA ) + 32;
1110     zSig = absA;
1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112 
1113 }
1114 
1115 #endif
1116 
1117 #ifdef FLOAT128
1118 
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format.  The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1124 
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1126 {
1127     flag zSign;
1128     uint32 absA;
1129     int8 shiftCount;
1130     bits64 zSig0;
1131 
1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133     zSign = ( a < 0 );
1134     absA = zSign ? - a : a;
1135     shiftCount = countLeadingZeros32( absA ) + 17;
1136     zSig0 = absA;
1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138 
1139 }
1140 
1141 #endif
1142 
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format.  The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1148 
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1150 {
1151     flag zSign;
1152     uint64 absA;
1153     int8 shiftCount;
1154 
1155     if ( a == 0 ) return float32_zero;
1156     zSign = ( a < 0 );
1157     absA = zSign ? - a : a;
1158     shiftCount = countLeadingZeros64( absA ) - 40;
1159     if ( 0 <= shiftCount ) {
1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161     }
1162     else {
1163         shiftCount += 7;
1164         if ( shiftCount < 0 ) {
1165             shift64RightJamming( absA, - shiftCount, &absA );
1166         }
1167         else {
1168             absA <<= shiftCount;
1169         }
1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171     }
1172 
1173 }
1174 
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176 {
1177     int8 shiftCount;
1178 
1179     if ( a == 0 ) return float32_zero;
1180     shiftCount = countLeadingZeros64( a ) - 40;
1181     if ( 0 <= shiftCount ) {
1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183     }
1184     else {
1185         shiftCount += 7;
1186         if ( shiftCount < 0 ) {
1187             shift64RightJamming( a, - shiftCount, &a );
1188         }
1189         else {
1190             a <<= shiftCount;
1191         }
1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193     }
1194 }
1195 
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format.  The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1201 
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1203 {
1204     flag zSign;
1205 
1206     if ( a == 0 ) return float64_zero;
1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208         return packFloat64( 1, 0x43E, 0 );
1209     }
1210     zSign = ( a < 0 );
1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212 
1213 }
1214 
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216 {
1217     if ( a == 0 ) return float64_zero;
1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219 
1220 }
1221 
1222 #ifdef FLOATX80
1223 
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format.  The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230 
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232 {
1233     flag zSign;
1234     uint64 absA;
1235     int8 shiftCount;
1236 
1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238     zSign = ( a < 0 );
1239     absA = zSign ? - a : a;
1240     shiftCount = countLeadingZeros64( absA );
1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242 
1243 }
1244 
1245 #endif
1246 
1247 #ifdef FLOAT128
1248 
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format.  The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1254 
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1256 {
1257     flag zSign;
1258     uint64 absA;
1259     int8 shiftCount;
1260     int32 zExp;
1261     bits64 zSig0, zSig1;
1262 
1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264     zSign = ( a < 0 );
1265     absA = zSign ? - a : a;
1266     shiftCount = countLeadingZeros64( absA ) + 49;
1267     zExp = 0x406E - shiftCount;
1268     if ( 64 <= shiftCount ) {
1269         zSig1 = 0;
1270         zSig0 = absA;
1271         shiftCount -= 64;
1272     }
1273     else {
1274         zSig1 = absA;
1275         zSig0 = 0;
1276     }
1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
1279 
1280 }
1281 
1282 #endif
1283 
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format.  The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode.  If `a' is a NaN, the largest
1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1293 
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1295 {
1296     flag aSign;
1297     int16 aExp, shiftCount;
1298     bits32 aSig;
1299     bits64 aSig64;
1300 
1301     aSig = extractFloat32Frac( a );
1302     aExp = extractFloat32Exp( a );
1303     aSign = extractFloat32Sign( a );
1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305     if ( aExp ) aSig |= 0x00800000;
1306     shiftCount = 0xAF - aExp;
1307     aSig64 = aSig;
1308     aSig64 <<= 32;
1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311 
1312 }
1313 
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format.  The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1323 
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325 {
1326     flag aSign;
1327     int16 aExp, shiftCount;
1328     bits32 aSig;
1329     int32 z;
1330 
1331     aSig = extractFloat32Frac( a );
1332     aExp = extractFloat32Exp( a );
1333     aSign = extractFloat32Sign( a );
1334     shiftCount = aExp - 0x9E;
1335     if ( 0 <= shiftCount ) {
1336         if ( float32_val(a) != 0xCF000000 ) {
1337             float_raise( float_flag_invalid STATUS_VAR);
1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339         }
1340         return (sbits32) 0x80000000;
1341     }
1342     else if ( aExp <= 0x7E ) {
1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344         return 0;
1345     }
1346     aSig = ( aSig | 0x00800000 )<<8;
1347     z = aSig>>( - shiftCount );
1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349         STATUS(float_exception_flags) |= float_flag_inexact;
1350     }
1351     if ( aSign ) z = - z;
1352     return z;
1353 
1354 }
1355 
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 16-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic, except that the conversion is always rounded toward zero.
1361 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1362 | the conversion overflows, the largest integer with the same sign as `a' is
1363 | returned.
1364 *----------------------------------------------------------------------------*/
1365 
1366 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     int32 z;
1372 
1373     aSig = extractFloat32Frac( a );
1374     aExp = extractFloat32Exp( a );
1375     aSign = extractFloat32Sign( a );
1376     shiftCount = aExp - 0x8E;
1377     if ( 0 <= shiftCount ) {
1378         if ( float32_val(a) != 0xC7000000 ) {
1379             float_raise( float_flag_invalid STATUS_VAR);
1380             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1381                 return 0x7FFF;
1382             }
1383         }
1384         return (sbits32) 0xffff8000;
1385     }
1386     else if ( aExp <= 0x7E ) {
1387         if ( aExp | aSig ) {
1388             STATUS(float_exception_flags) |= float_flag_inexact;
1389         }
1390         return 0;
1391     }
1392     shiftCount -= 0x10;
1393     aSig = ( aSig | 0x00800000 )<<8;
1394     z = aSig>>( - shiftCount );
1395     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1396         STATUS(float_exception_flags) |= float_flag_inexact;
1397     }
1398     if ( aSign ) {
1399         z = - z;
1400     }
1401     return z;
1402 
1403 }
1404 
1405 /*----------------------------------------------------------------------------
1406 | Returns the result of converting the single-precision floating-point value
1407 | `a' to the 64-bit two's complement integer format.  The conversion is
1408 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 | Arithmetic---which means in particular that the conversion is rounded
1410 | according to the current rounding mode.  If `a' is a NaN, the largest
1411 | positive integer is returned.  Otherwise, if the conversion overflows, the
1412 | largest integer with the same sign as `a' is returned.
1413 *----------------------------------------------------------------------------*/
1414 
1415 int64 float32_to_int64( float32 a STATUS_PARAM )
1416 {
1417     flag aSign;
1418     int16 aExp, shiftCount;
1419     bits32 aSig;
1420     bits64 aSig64, aSigExtra;
1421 
1422     aSig = extractFloat32Frac( a );
1423     aExp = extractFloat32Exp( a );
1424     aSign = extractFloat32Sign( a );
1425     shiftCount = 0xBE - aExp;
1426     if ( shiftCount < 0 ) {
1427         float_raise( float_flag_invalid STATUS_VAR);
1428         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429             return LIT64( 0x7FFFFFFFFFFFFFFF );
1430         }
1431         return (sbits64) LIT64( 0x8000000000000000 );
1432     }
1433     if ( aExp ) aSig |= 0x00800000;
1434     aSig64 = aSig;
1435     aSig64 <<= 40;
1436     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1437     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1438 
1439 }
1440 
1441 /*----------------------------------------------------------------------------
1442 | Returns the result of converting the single-precision floating-point value
1443 | `a' to the 64-bit two's complement integer format.  The conversion is
1444 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1445 | Arithmetic, except that the conversion is always rounded toward zero.  If
1446 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1447 | conversion overflows, the largest integer with the same sign as `a' is
1448 | returned.
1449 *----------------------------------------------------------------------------*/
1450 
1451 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1452 {
1453     flag aSign;
1454     int16 aExp, shiftCount;
1455     bits32 aSig;
1456     bits64 aSig64;
1457     int64 z;
1458 
1459     aSig = extractFloat32Frac( a );
1460     aExp = extractFloat32Exp( a );
1461     aSign = extractFloat32Sign( a );
1462     shiftCount = aExp - 0xBE;
1463     if ( 0 <= shiftCount ) {
1464         if ( float32_val(a) != 0xDF000000 ) {
1465             float_raise( float_flag_invalid STATUS_VAR);
1466             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1467                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1468             }
1469         }
1470         return (sbits64) LIT64( 0x8000000000000000 );
1471     }
1472     else if ( aExp <= 0x7E ) {
1473         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1474         return 0;
1475     }
1476     aSig64 = aSig | 0x00800000;
1477     aSig64 <<= 40;
1478     z = aSig64>>( - shiftCount );
1479     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1480         STATUS(float_exception_flags) |= float_flag_inexact;
1481     }
1482     if ( aSign ) z = - z;
1483     return z;
1484 
1485 }
1486 
1487 /*----------------------------------------------------------------------------
1488 | Returns the result of converting the single-precision floating-point value
1489 | `a' to the double-precision floating-point format.  The conversion is
1490 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1491 | Arithmetic.
1492 *----------------------------------------------------------------------------*/
1493 
1494 float64 float32_to_float64( float32 a STATUS_PARAM )
1495 {
1496     flag aSign;
1497     int16 aExp;
1498     bits32 aSig;
1499 
1500     aSig = extractFloat32Frac( a );
1501     aExp = extractFloat32Exp( a );
1502     aSign = extractFloat32Sign( a );
1503     if ( aExp == 0xFF ) {
1504         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1505         return packFloat64( aSign, 0x7FF, 0 );
1506     }
1507     if ( aExp == 0 ) {
1508         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1509         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1510         --aExp;
1511     }
1512     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1513 
1514 }
1515 
1516 #ifdef FLOATX80
1517 
1518 /*----------------------------------------------------------------------------
1519 | Returns the result of converting the single-precision floating-point value
1520 | `a' to the extended double-precision floating-point format.  The conversion
1521 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1522 | Arithmetic.
1523 *----------------------------------------------------------------------------*/
1524 
1525 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1526 {
1527     flag aSign;
1528     int16 aExp;
1529     bits32 aSig;
1530 
1531     aSig = extractFloat32Frac( a );
1532     aExp = extractFloat32Exp( a );
1533     aSign = extractFloat32Sign( a );
1534     if ( aExp == 0xFF ) {
1535         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1536         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1537     }
1538     if ( aExp == 0 ) {
1539         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1540         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1541     }
1542     aSig |= 0x00800000;
1543     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1544 
1545 }
1546 
1547 #endif
1548 
1549 #ifdef FLOAT128
1550 
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format.  The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557 
1558 float128 float32_to_float128( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     bits32 aSig;
1563 
1564     aSig = extractFloat32Frac( a );
1565     aExp = extractFloat32Exp( a );
1566     aSign = extractFloat32Sign( a );
1567     if ( aExp == 0xFF ) {
1568         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1569         return packFloat128( aSign, 0x7FFF, 0, 0 );
1570     }
1571     if ( aExp == 0 ) {
1572         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1573         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1574         --aExp;
1575     }
1576     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1577 
1578 }
1579 
1580 #endif
1581 
1582 /*----------------------------------------------------------------------------
1583 | Rounds the single-precision floating-point value `a' to an integer, and
1584 | returns the result as a single-precision floating-point value.  The
1585 | operation is performed according to the IEC/IEEE Standard for Binary
1586 | Floating-Point Arithmetic.
1587 *----------------------------------------------------------------------------*/
1588 
1589 float32 float32_round_to_int( float32 a STATUS_PARAM)
1590 {
1591     flag aSign;
1592     int16 aExp;
1593     bits32 lastBitMask, roundBitsMask;
1594     int8 roundingMode;
1595     bits32 z;
1596 
1597     aExp = extractFloat32Exp( a );
1598     if ( 0x96 <= aExp ) {
1599         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1600             return propagateFloat32NaN( a, a STATUS_VAR );
1601         }
1602         return a;
1603     }
1604     if ( aExp <= 0x7E ) {
1605         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1606         STATUS(float_exception_flags) |= float_flag_inexact;
1607         aSign = extractFloat32Sign( a );
1608         switch ( STATUS(float_rounding_mode) ) {
1609          case float_round_nearest_even:
1610             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1611                 return packFloat32( aSign, 0x7F, 0 );
1612             }
1613             break;
1614          case float_round_down:
1615             return make_float32(aSign ? 0xBF800000 : 0);
1616          case float_round_up:
1617             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1618         }
1619         return packFloat32( aSign, 0, 0 );
1620     }
1621     lastBitMask = 1;
1622     lastBitMask <<= 0x96 - aExp;
1623     roundBitsMask = lastBitMask - 1;
1624     z = float32_val(a);
1625     roundingMode = STATUS(float_rounding_mode);
1626     if ( roundingMode == float_round_nearest_even ) {
1627         z += lastBitMask>>1;
1628         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1629     }
1630     else if ( roundingMode != float_round_to_zero ) {
1631         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1632             z += roundBitsMask;
1633         }
1634     }
1635     z &= ~ roundBitsMask;
1636     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1637     return make_float32(z);
1638 
1639 }
1640 
1641 /*----------------------------------------------------------------------------
1642 | Returns the result of adding the absolute values of the single-precision
1643 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1644 | before being returned.  `zSign' is ignored if the result is a NaN.
1645 | The addition is performed according to the IEC/IEEE Standard for Binary
1646 | Floating-Point Arithmetic.
1647 *----------------------------------------------------------------------------*/
1648 
1649 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1650 {
1651     int16 aExp, bExp, zExp;
1652     bits32 aSig, bSig, zSig;
1653     int16 expDiff;
1654 
1655     aSig = extractFloat32Frac( a );
1656     aExp = extractFloat32Exp( a );
1657     bSig = extractFloat32Frac( b );
1658     bExp = extractFloat32Exp( b );
1659     expDiff = aExp - bExp;
1660     aSig <<= 6;
1661     bSig <<= 6;
1662     if ( 0 < expDiff ) {
1663         if ( aExp == 0xFF ) {
1664             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665             return a;
1666         }
1667         if ( bExp == 0 ) {
1668             --expDiff;
1669         }
1670         else {
1671             bSig |= 0x20000000;
1672         }
1673         shift32RightJamming( bSig, expDiff, &bSig );
1674         zExp = aExp;
1675     }
1676     else if ( expDiff < 0 ) {
1677         if ( bExp == 0xFF ) {
1678             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1679             return packFloat32( zSign, 0xFF, 0 );
1680         }
1681         if ( aExp == 0 ) {
1682             ++expDiff;
1683         }
1684         else {
1685             aSig |= 0x20000000;
1686         }
1687         shift32RightJamming( aSig, - expDiff, &aSig );
1688         zExp = bExp;
1689     }
1690     else {
1691         if ( aExp == 0xFF ) {
1692             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1693             return a;
1694         }
1695         if ( aExp == 0 ) {
1696             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1697             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1698         }
1699         zSig = 0x40000000 + aSig + bSig;
1700         zExp = aExp;
1701         goto roundAndPack;
1702     }
1703     aSig |= 0x20000000;
1704     zSig = ( aSig + bSig )<<1;
1705     --zExp;
1706     if ( (sbits32) zSig < 0 ) {
1707         zSig = aSig + bSig;
1708         ++zExp;
1709     }
1710  roundAndPack:
1711     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1712 
1713 }
1714 
1715 /*----------------------------------------------------------------------------
1716 | Returns the result of subtracting the absolute values of the single-
1717 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1718 | difference is negated before being returned.  `zSign' is ignored if the
1719 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1720 | Standard for Binary Floating-Point Arithmetic.
1721 *----------------------------------------------------------------------------*/
1722 
1723 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1724 {
1725     int16 aExp, bExp, zExp;
1726     bits32 aSig, bSig, zSig;
1727     int16 expDiff;
1728 
1729     aSig = extractFloat32Frac( a );
1730     aExp = extractFloat32Exp( a );
1731     bSig = extractFloat32Frac( b );
1732     bExp = extractFloat32Exp( b );
1733     expDiff = aExp - bExp;
1734     aSig <<= 7;
1735     bSig <<= 7;
1736     if ( 0 < expDiff ) goto aExpBigger;
1737     if ( expDiff < 0 ) goto bExpBigger;
1738     if ( aExp == 0xFF ) {
1739         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1740         float_raise( float_flag_invalid STATUS_VAR);
1741         return float32_default_nan;
1742     }
1743     if ( aExp == 0 ) {
1744         aExp = 1;
1745         bExp = 1;
1746     }
1747     if ( bSig < aSig ) goto aBigger;
1748     if ( aSig < bSig ) goto bBigger;
1749     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1750  bExpBigger:
1751     if ( bExp == 0xFF ) {
1752         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1753         return packFloat32( zSign ^ 1, 0xFF, 0 );
1754     }
1755     if ( aExp == 0 ) {
1756         ++expDiff;
1757     }
1758     else {
1759         aSig |= 0x40000000;
1760     }
1761     shift32RightJamming( aSig, - expDiff, &aSig );
1762     bSig |= 0x40000000;
1763  bBigger:
1764     zSig = bSig - aSig;
1765     zExp = bExp;
1766     zSign ^= 1;
1767     goto normalizeRoundAndPack;
1768  aExpBigger:
1769     if ( aExp == 0xFF ) {
1770         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1771         return a;
1772     }
1773     if ( bExp == 0 ) {
1774         --expDiff;
1775     }
1776     else {
1777         bSig |= 0x40000000;
1778     }
1779     shift32RightJamming( bSig, expDiff, &bSig );
1780     aSig |= 0x40000000;
1781  aBigger:
1782     zSig = aSig - bSig;
1783     zExp = aExp;
1784  normalizeRoundAndPack:
1785     --zExp;
1786     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1787 
1788 }
1789 
1790 /*----------------------------------------------------------------------------
1791 | Returns the result of adding the single-precision floating-point values `a'
1792 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1793 | Binary Floating-Point Arithmetic.
1794 *----------------------------------------------------------------------------*/
1795 
1796 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1797 {
1798     flag aSign, bSign;
1799 
1800     aSign = extractFloat32Sign( a );
1801     bSign = extractFloat32Sign( b );
1802     if ( aSign == bSign ) {
1803         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1804     }
1805     else {
1806         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1807     }
1808 
1809 }
1810 
1811 /*----------------------------------------------------------------------------
1812 | Returns the result of subtracting the single-precision floating-point values
1813 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1814 | for Binary Floating-Point Arithmetic.
1815 *----------------------------------------------------------------------------*/
1816 
1817 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1818 {
1819     flag aSign, bSign;
1820 
1821     aSign = extractFloat32Sign( a );
1822     bSign = extractFloat32Sign( b );
1823     if ( aSign == bSign ) {
1824         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1825     }
1826     else {
1827         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1828     }
1829 
1830 }
1831 
1832 /*----------------------------------------------------------------------------
1833 | Returns the result of multiplying the single-precision floating-point values
1834 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1835 | for Binary Floating-Point Arithmetic.
1836 *----------------------------------------------------------------------------*/
1837 
1838 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1839 {
1840     flag aSign, bSign, zSign;
1841     int16 aExp, bExp, zExp;
1842     bits32 aSig, bSig;
1843     bits64 zSig64;
1844     bits32 zSig;
1845 
1846     aSig = extractFloat32Frac( a );
1847     aExp = extractFloat32Exp( a );
1848     aSign = extractFloat32Sign( a );
1849     bSig = extractFloat32Frac( b );
1850     bExp = extractFloat32Exp( b );
1851     bSign = extractFloat32Sign( b );
1852     zSign = aSign ^ bSign;
1853     if ( aExp == 0xFF ) {
1854         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1855             return propagateFloat32NaN( a, b STATUS_VAR );
1856         }
1857         if ( ( bExp | bSig ) == 0 ) {
1858             float_raise( float_flag_invalid STATUS_VAR);
1859             return float32_default_nan;
1860         }
1861         return packFloat32( zSign, 0xFF, 0 );
1862     }
1863     if ( bExp == 0xFF ) {
1864         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1865         if ( ( aExp | aSig ) == 0 ) {
1866             float_raise( float_flag_invalid STATUS_VAR);
1867             return float32_default_nan;
1868         }
1869         return packFloat32( zSign, 0xFF, 0 );
1870     }
1871     if ( aExp == 0 ) {
1872         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1873         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1874     }
1875     if ( bExp == 0 ) {
1876         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1877         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1878     }
1879     zExp = aExp + bExp - 0x7F;
1880     aSig = ( aSig | 0x00800000 )<<7;
1881     bSig = ( bSig | 0x00800000 )<<8;
1882     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1883     zSig = zSig64;
1884     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1885         zSig <<= 1;
1886         --zExp;
1887     }
1888     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1889 
1890 }
1891 
1892 /*----------------------------------------------------------------------------
1893 | Returns the result of dividing the single-precision floating-point value `a'
1894 | by the corresponding value `b'.  The operation is performed according to the
1895 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1896 *----------------------------------------------------------------------------*/
1897 
1898 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1899 {
1900     flag aSign, bSign, zSign;
1901     int16 aExp, bExp, zExp;
1902     bits32 aSig, bSig, zSig;
1903 
1904     aSig = extractFloat32Frac( a );
1905     aExp = extractFloat32Exp( a );
1906     aSign = extractFloat32Sign( a );
1907     bSig = extractFloat32Frac( b );
1908     bExp = extractFloat32Exp( b );
1909     bSign = extractFloat32Sign( b );
1910     zSign = aSign ^ bSign;
1911     if ( aExp == 0xFF ) {
1912         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1913         if ( bExp == 0xFF ) {
1914             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1915             float_raise( float_flag_invalid STATUS_VAR);
1916             return float32_default_nan;
1917         }
1918         return packFloat32( zSign, 0xFF, 0 );
1919     }
1920     if ( bExp == 0xFF ) {
1921         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1922         return packFloat32( zSign, 0, 0 );
1923     }
1924     if ( bExp == 0 ) {
1925         if ( bSig == 0 ) {
1926             if ( ( aExp | aSig ) == 0 ) {
1927                 float_raise( float_flag_invalid STATUS_VAR);
1928                 return float32_default_nan;
1929             }
1930             float_raise( float_flag_divbyzero STATUS_VAR);
1931             return packFloat32( zSign, 0xFF, 0 );
1932         }
1933         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1934     }
1935     if ( aExp == 0 ) {
1936         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1937         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1938     }
1939     zExp = aExp - bExp + 0x7D;
1940     aSig = ( aSig | 0x00800000 )<<7;
1941     bSig = ( bSig | 0x00800000 )<<8;
1942     if ( bSig <= ( aSig + aSig ) ) {
1943         aSig >>= 1;
1944         ++zExp;
1945     }
1946     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1947     if ( ( zSig & 0x3F ) == 0 ) {
1948         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1949     }
1950     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1951 
1952 }
1953 
1954 /*----------------------------------------------------------------------------
1955 | Returns the remainder of the single-precision floating-point value `a'
1956 | with respect to the corresponding value `b'.  The operation is performed
1957 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1958 *----------------------------------------------------------------------------*/
1959 
1960 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1961 {
1962     flag aSign, zSign;
1963     int16 aExp, bExp, expDiff;
1964     bits32 aSig, bSig;
1965     bits32 q;
1966     bits64 aSig64, bSig64, q64;
1967     bits32 alternateASig;
1968     sbits32 sigMean;
1969 
1970     aSig = extractFloat32Frac( a );
1971     aExp = extractFloat32Exp( a );
1972     aSign = extractFloat32Sign( a );
1973     bSig = extractFloat32Frac( b );
1974     bExp = extractFloat32Exp( b );
1975     if ( aExp == 0xFF ) {
1976         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1977             return propagateFloat32NaN( a, b STATUS_VAR );
1978         }
1979         float_raise( float_flag_invalid STATUS_VAR);
1980         return float32_default_nan;
1981     }
1982     if ( bExp == 0xFF ) {
1983         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1984         return a;
1985     }
1986     if ( bExp == 0 ) {
1987         if ( bSig == 0 ) {
1988             float_raise( float_flag_invalid STATUS_VAR);
1989             return float32_default_nan;
1990         }
1991         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1992     }
1993     if ( aExp == 0 ) {
1994         if ( aSig == 0 ) return a;
1995         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996     }
1997     expDiff = aExp - bExp;
1998     aSig |= 0x00800000;
1999     bSig |= 0x00800000;
2000     if ( expDiff < 32 ) {
2001         aSig <<= 8;
2002         bSig <<= 8;
2003         if ( expDiff < 0 ) {
2004             if ( expDiff < -1 ) return a;
2005             aSig >>= 1;
2006         }
2007         q = ( bSig <= aSig );
2008         if ( q ) aSig -= bSig;
2009         if ( 0 < expDiff ) {
2010             q = ( ( (bits64) aSig )<<32 ) / bSig;
2011             q >>= 32 - expDiff;
2012             bSig >>= 2;
2013             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2014         }
2015         else {
2016             aSig >>= 2;
2017             bSig >>= 2;
2018         }
2019     }
2020     else {
2021         if ( bSig <= aSig ) aSig -= bSig;
2022         aSig64 = ( (bits64) aSig )<<40;
2023         bSig64 = ( (bits64) bSig )<<40;
2024         expDiff -= 64;
2025         while ( 0 < expDiff ) {
2026             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2027             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2028             aSig64 = - ( ( bSig * q64 )<<38 );
2029             expDiff -= 62;
2030         }
2031         expDiff += 64;
2032         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2033         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2034         q = q64>>( 64 - expDiff );
2035         bSig <<= 6;
2036         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2037     }
2038     do {
2039         alternateASig = aSig;
2040         ++q;
2041         aSig -= bSig;
2042     } while ( 0 <= (sbits32) aSig );
2043     sigMean = aSig + alternateASig;
2044     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2045         aSig = alternateASig;
2046     }
2047     zSign = ( (sbits32) aSig < 0 );
2048     if ( zSign ) aSig = - aSig;
2049     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2050 
2051 }
2052 
2053 /*----------------------------------------------------------------------------
2054 | Returns the square root of the single-precision floating-point value `a'.
2055 | The operation is performed according to the IEC/IEEE Standard for Binary
2056 | Floating-Point Arithmetic.
2057 *----------------------------------------------------------------------------*/
2058 
2059 float32 float32_sqrt( float32 a STATUS_PARAM )
2060 {
2061     flag aSign;
2062     int16 aExp, zExp;
2063     bits32 aSig, zSig;
2064     bits64 rem, term;
2065 
2066     aSig = extractFloat32Frac( a );
2067     aExp = extractFloat32Exp( a );
2068     aSign = extractFloat32Sign( a );
2069     if ( aExp == 0xFF ) {
2070         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2071         if ( ! aSign ) return a;
2072         float_raise( float_flag_invalid STATUS_VAR);
2073         return float32_default_nan;
2074     }
2075     if ( aSign ) {
2076         if ( ( aExp | aSig ) == 0 ) return a;
2077         float_raise( float_flag_invalid STATUS_VAR);
2078         return float32_default_nan;
2079     }
2080     if ( aExp == 0 ) {
2081         if ( aSig == 0 ) return float32_zero;
2082         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2083     }
2084     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2085     aSig = ( aSig | 0x00800000 )<<8;
2086     zSig = estimateSqrt32( aExp, aSig ) + 2;
2087     if ( ( zSig & 0x7F ) <= 5 ) {
2088         if ( zSig < 2 ) {
2089             zSig = 0x7FFFFFFF;
2090             goto roundAndPack;
2091         }
2092         aSig >>= aExp & 1;
2093         term = ( (bits64) zSig ) * zSig;
2094         rem = ( ( (bits64) aSig )<<32 ) - term;
2095         while ( (sbits64) rem < 0 ) {
2096             --zSig;
2097             rem += ( ( (bits64) zSig )<<1 ) | 1;
2098         }
2099         zSig |= ( rem != 0 );
2100     }
2101     shift32RightJamming( zSig, 1, &zSig );
2102  roundAndPack:
2103     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2104 
2105 }
2106 
2107 /*----------------------------------------------------------------------------
2108 | Returns the binary exponential of the single-precision floating-point value
2109 | `a'. The operation is performed according to the IEC/IEEE Standard for
2110 | Binary Floating-Point Arithmetic.
2111 |
2112 | Uses the following identities:
2113 |
2114 | 1. -------------------------------------------------------------------------
2115 |      x    x*ln(2)
2116 |     2  = e
2117 |
2118 | 2. -------------------------------------------------------------------------
2119 |                      2     3     4     5           n
2120 |      x        x     x     x     x     x           x
2121 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2122 |               1!    2!    3!    4!    5!          n!
2123 *----------------------------------------------------------------------------*/
2124 
2125 static const float64 float32_exp2_coefficients[15] =
2126 {
2127     make_float64( 0x3ff0000000000000ll ), /*  1 */
2128     make_float64( 0x3fe0000000000000ll ), /*  2 */
2129     make_float64( 0x3fc5555555555555ll ), /*  3 */
2130     make_float64( 0x3fa5555555555555ll ), /*  4 */
2131     make_float64( 0x3f81111111111111ll ), /*  5 */
2132     make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2133     make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2134     make_float64( 0x3efa01a01a01a01all ), /*  8 */
2135     make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2136     make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2137     make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2138     make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2139     make_float64( 0x3de6124613a86d09ll ), /* 13 */
2140     make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2141     make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2142 };
2143 
2144 float32 float32_exp2( float32 a STATUS_PARAM )
2145 {
2146     flag aSign;
2147     int16 aExp;
2148     bits32 aSig;
2149     float64 r, x, xn;
2150     int i;
2151 
2152     aSig = extractFloat32Frac( a );
2153     aExp = extractFloat32Exp( a );
2154     aSign = extractFloat32Sign( a );
2155 
2156     if ( aExp == 0xFF) {
2157         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2158         return (aSign) ? float32_zero : a;
2159     }
2160     if (aExp == 0) {
2161         if (aSig == 0) return float32_one;
2162     }
2163 
2164     float_raise( float_flag_inexact STATUS_VAR);
2165 
2166     /* ******************************* */
2167     /* using float64 for approximation */
2168     /* ******************************* */
2169     x = float32_to_float64(a STATUS_VAR);
2170     x = float64_mul(x, float64_ln2 STATUS_VAR);
2171 
2172     xn = x;
2173     r = float64_one;
2174     for (i = 0 ; i < 15 ; i++) {
2175         float64 f;
2176 
2177         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2178         r = float64_add(r, f STATUS_VAR);
2179 
2180         xn = float64_mul(xn, x STATUS_VAR);
2181     }
2182 
2183     return float64_to_float32(r, status);
2184 }
2185 
2186 /*----------------------------------------------------------------------------
2187 | Returns the binary log of the single-precision floating-point value `a'.
2188 | The operation is performed according to the IEC/IEEE Standard for Binary
2189 | Floating-Point Arithmetic.
2190 *----------------------------------------------------------------------------*/
2191 float32 float32_log2( float32 a STATUS_PARAM )
2192 {
2193     flag aSign, zSign;
2194     int16 aExp;
2195     bits32 aSig, zSig, i;
2196 
2197     aSig = extractFloat32Frac( a );
2198     aExp = extractFloat32Exp( a );
2199     aSign = extractFloat32Sign( a );
2200 
2201     if ( aExp == 0 ) {
2202         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2203         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2204     }
2205     if ( aSign ) {
2206         float_raise( float_flag_invalid STATUS_VAR);
2207         return float32_default_nan;
2208     }
2209     if ( aExp == 0xFF ) {
2210         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2211         return a;
2212     }
2213 
2214     aExp -= 0x7F;
2215     aSig |= 0x00800000;
2216     zSign = aExp < 0;
2217     zSig = aExp << 23;
2218 
2219     for (i = 1 << 22; i > 0; i >>= 1) {
2220         aSig = ( (bits64)aSig * aSig ) >> 23;
2221         if ( aSig & 0x01000000 ) {
2222             aSig >>= 1;
2223             zSig |= i;
2224         }
2225     }
2226 
2227     if ( zSign )
2228         zSig = -zSig;
2229 
2230     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2231 }
2232 
2233 /*----------------------------------------------------------------------------
2234 | Returns 1 if the single-precision floating-point value `a' is equal to
2235 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2238 
2239 int float32_eq( float32 a, float32 b STATUS_PARAM )
2240 {
2241 
2242     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2243          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2244        ) {
2245         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2246             float_raise( float_flag_invalid STATUS_VAR);
2247         }
2248         return 0;
2249     }
2250     return ( float32_val(a) == float32_val(b) ) ||
2251             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2252 
2253 }
2254 
2255 /*----------------------------------------------------------------------------
2256 | Returns 1 if the single-precision floating-point value `a' is less than
2257 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2258 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2259 | Arithmetic.
2260 *----------------------------------------------------------------------------*/
2261 
2262 int float32_le( float32 a, float32 b STATUS_PARAM )
2263 {
2264     flag aSign, bSign;
2265     bits32 av, bv;
2266 
2267     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2268          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2269        ) {
2270         float_raise( float_flag_invalid STATUS_VAR);
2271         return 0;
2272     }
2273     aSign = extractFloat32Sign( a );
2274     bSign = extractFloat32Sign( b );
2275     av = float32_val(a);
2276     bv = float32_val(b);
2277     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2278     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2279 
2280 }
2281 
2282 /*----------------------------------------------------------------------------
2283 | Returns 1 if the single-precision floating-point value `a' is less than
2284 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2285 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2286 *----------------------------------------------------------------------------*/
2287 
2288 int float32_lt( float32 a, float32 b STATUS_PARAM )
2289 {
2290     flag aSign, bSign;
2291     bits32 av, bv;
2292 
2293     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2294          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2295        ) {
2296         float_raise( float_flag_invalid STATUS_VAR);
2297         return 0;
2298     }
2299     aSign = extractFloat32Sign( a );
2300     bSign = extractFloat32Sign( b );
2301     av = float32_val(a);
2302     bv = float32_val(b);
2303     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2304     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2305 
2306 }
2307 
2308 /*----------------------------------------------------------------------------
2309 | Returns 1 if the single-precision floating-point value `a' is equal to
2310 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2311 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2313 *----------------------------------------------------------------------------*/
2314 
2315 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2316 {
2317     bits32 av, bv;
2318 
2319     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2320          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2321        ) {
2322         float_raise( float_flag_invalid STATUS_VAR);
2323         return 0;
2324     }
2325     av = float32_val(a);
2326     bv = float32_val(b);
2327     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2328 
2329 }
2330 
2331 /*----------------------------------------------------------------------------
2332 | Returns 1 if the single-precision floating-point value `a' is less than or
2333 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2334 | cause an exception.  Otherwise, the comparison is performed according to the
2335 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2336 *----------------------------------------------------------------------------*/
2337 
2338 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2339 {
2340     flag aSign, bSign;
2341     bits32 av, bv;
2342 
2343     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2344          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2345        ) {
2346         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2347             float_raise( float_flag_invalid STATUS_VAR);
2348         }
2349         return 0;
2350     }
2351     aSign = extractFloat32Sign( a );
2352     bSign = extractFloat32Sign( b );
2353     av = float32_val(a);
2354     bv = float32_val(b);
2355     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2356     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2357 
2358 }
2359 
2360 /*----------------------------------------------------------------------------
2361 | Returns 1 if the single-precision floating-point value `a' is less than
2362 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2363 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2364 | Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366 
2367 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2368 {
2369     flag aSign, bSign;
2370     bits32 av, bv;
2371 
2372     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2373          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2374        ) {
2375         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2376             float_raise( float_flag_invalid STATUS_VAR);
2377         }
2378         return 0;
2379     }
2380     aSign = extractFloat32Sign( a );
2381     bSign = extractFloat32Sign( b );
2382     av = float32_val(a);
2383     bv = float32_val(b);
2384     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2385     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2386 
2387 }
2388 
2389 /*----------------------------------------------------------------------------
2390 | Returns the result of converting the double-precision floating-point value
2391 | `a' to the 32-bit two's complement integer format.  The conversion is
2392 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2393 | Arithmetic---which means in particular that the conversion is rounded
2394 | according to the current rounding mode.  If `a' is a NaN, the largest
2395 | positive integer is returned.  Otherwise, if the conversion overflows, the
2396 | largest integer with the same sign as `a' is returned.
2397 *----------------------------------------------------------------------------*/
2398 
2399 int32 float64_to_int32( float64 a STATUS_PARAM )
2400 {
2401     flag aSign;
2402     int16 aExp, shiftCount;
2403     bits64 aSig;
2404 
2405     aSig = extractFloat64Frac( a );
2406     aExp = extractFloat64Exp( a );
2407     aSign = extractFloat64Sign( a );
2408     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2409     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2410     shiftCount = 0x42C - aExp;
2411     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2412     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2413 
2414 }
2415 
2416 /*----------------------------------------------------------------------------
2417 | Returns the result of converting the double-precision floating-point value
2418 | `a' to the 32-bit two's complement integer format.  The conversion is
2419 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2420 | Arithmetic, except that the conversion is always rounded toward zero.
2421 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2422 | the conversion overflows, the largest integer with the same sign as `a' is
2423 | returned.
2424 *----------------------------------------------------------------------------*/
2425 
2426 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2427 {
2428     flag aSign;
2429     int16 aExp, shiftCount;
2430     bits64 aSig, savedASig;
2431     int32 z;
2432 
2433     aSig = extractFloat64Frac( a );
2434     aExp = extractFloat64Exp( a );
2435     aSign = extractFloat64Sign( a );
2436     if ( 0x41E < aExp ) {
2437         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2438         goto invalid;
2439     }
2440     else if ( aExp < 0x3FF ) {
2441         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2442         return 0;
2443     }
2444     aSig |= LIT64( 0x0010000000000000 );
2445     shiftCount = 0x433 - aExp;
2446     savedASig = aSig;
2447     aSig >>= shiftCount;
2448     z = aSig;
2449     if ( aSign ) z = - z;
2450     if ( ( z < 0 ) ^ aSign ) {
2451  invalid:
2452         float_raise( float_flag_invalid STATUS_VAR);
2453         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2454     }
2455     if ( ( aSig<<shiftCount ) != savedASig ) {
2456         STATUS(float_exception_flags) |= float_flag_inexact;
2457     }
2458     return z;
2459 
2460 }
2461 
2462 /*----------------------------------------------------------------------------
2463 | Returns the result of converting the double-precision floating-point value
2464 | `a' to the 16-bit two's complement integer format.  The conversion is
2465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2466 | Arithmetic, except that the conversion is always rounded toward zero.
2467 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2468 | the conversion overflows, the largest integer with the same sign as `a' is
2469 | returned.
2470 *----------------------------------------------------------------------------*/
2471 
2472 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2473 {
2474     flag aSign;
2475     int16 aExp, shiftCount;
2476     bits64 aSig, savedASig;
2477     int32 z;
2478 
2479     aSig = extractFloat64Frac( a );
2480     aExp = extractFloat64Exp( a );
2481     aSign = extractFloat64Sign( a );
2482     if ( 0x40E < aExp ) {
2483         if ( ( aExp == 0x7FF ) && aSig ) {
2484             aSign = 0;
2485         }
2486         goto invalid;
2487     }
2488     else if ( aExp < 0x3FF ) {
2489         if ( aExp || aSig ) {
2490             STATUS(float_exception_flags) |= float_flag_inexact;
2491         }
2492         return 0;
2493     }
2494     aSig |= LIT64( 0x0010000000000000 );
2495     shiftCount = 0x433 - aExp;
2496     savedASig = aSig;
2497     aSig >>= shiftCount;
2498     z = aSig;
2499     if ( aSign ) {
2500         z = - z;
2501     }
2502     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2503  invalid:
2504         float_raise( float_flag_invalid STATUS_VAR);
2505         return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2506     }
2507     if ( ( aSig<<shiftCount ) != savedASig ) {
2508         STATUS(float_exception_flags) |= float_flag_inexact;
2509     }
2510     return z;
2511 }
2512 
2513 /*----------------------------------------------------------------------------
2514 | Returns the result of converting the double-precision floating-point value
2515 | `a' to the 64-bit two's complement integer format.  The conversion is
2516 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2517 | Arithmetic---which means in particular that the conversion is rounded
2518 | according to the current rounding mode.  If `a' is a NaN, the largest
2519 | positive integer is returned.  Otherwise, if the conversion overflows, the
2520 | largest integer with the same sign as `a' is returned.
2521 *----------------------------------------------------------------------------*/
2522 
2523 int64 float64_to_int64( float64 a STATUS_PARAM )
2524 {
2525     flag aSign;
2526     int16 aExp, shiftCount;
2527     bits64 aSig, aSigExtra;
2528 
2529     aSig = extractFloat64Frac( a );
2530     aExp = extractFloat64Exp( a );
2531     aSign = extractFloat64Sign( a );
2532     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2533     shiftCount = 0x433 - aExp;
2534     if ( shiftCount <= 0 ) {
2535         if ( 0x43E < aExp ) {
2536             float_raise( float_flag_invalid STATUS_VAR);
2537             if (    ! aSign
2538                  || (    ( aExp == 0x7FF )
2539                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2540                ) {
2541                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2542             }
2543             return (sbits64) LIT64( 0x8000000000000000 );
2544         }
2545         aSigExtra = 0;
2546         aSig <<= - shiftCount;
2547     }
2548     else {
2549         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2550     }
2551     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2552 
2553 }
2554 
2555 /*----------------------------------------------------------------------------
2556 | Returns the result of converting the double-precision floating-point value
2557 | `a' to the 64-bit two's complement integer format.  The conversion is
2558 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2559 | Arithmetic, except that the conversion is always rounded toward zero.
2560 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2561 | the conversion overflows, the largest integer with the same sign as `a' is
2562 | returned.
2563 *----------------------------------------------------------------------------*/
2564 
2565 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2566 {
2567     flag aSign;
2568     int16 aExp, shiftCount;
2569     bits64 aSig;
2570     int64 z;
2571 
2572     aSig = extractFloat64Frac( a );
2573     aExp = extractFloat64Exp( a );
2574     aSign = extractFloat64Sign( a );
2575     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2576     shiftCount = aExp - 0x433;
2577     if ( 0 <= shiftCount ) {
2578         if ( 0x43E <= aExp ) {
2579             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2580                 float_raise( float_flag_invalid STATUS_VAR);
2581                 if (    ! aSign
2582                      || (    ( aExp == 0x7FF )
2583                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2584                    ) {
2585                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2586                 }
2587             }
2588             return (sbits64) LIT64( 0x8000000000000000 );
2589         }
2590         z = aSig<<shiftCount;
2591     }
2592     else {
2593         if ( aExp < 0x3FE ) {
2594             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2595             return 0;
2596         }
2597         z = aSig>>( - shiftCount );
2598         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2599             STATUS(float_exception_flags) |= float_flag_inexact;
2600         }
2601     }
2602     if ( aSign ) z = - z;
2603     return z;
2604 
2605 }
2606 
2607 /*----------------------------------------------------------------------------
2608 | Returns the result of converting the double-precision floating-point value
2609 | `a' to the single-precision floating-point format.  The conversion is
2610 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2611 | Arithmetic.
2612 *----------------------------------------------------------------------------*/
2613 
2614 float32 float64_to_float32( float64 a STATUS_PARAM )
2615 {
2616     flag aSign;
2617     int16 aExp;
2618     bits64 aSig;
2619     bits32 zSig;
2620 
2621     aSig = extractFloat64Frac( a );
2622     aExp = extractFloat64Exp( a );
2623     aSign = extractFloat64Sign( a );
2624     if ( aExp == 0x7FF ) {
2625         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2626         return packFloat32( aSign, 0xFF, 0 );
2627     }
2628     shift64RightJamming( aSig, 22, &aSig );
2629     zSig = aSig;
2630     if ( aExp || zSig ) {
2631         zSig |= 0x40000000;
2632         aExp -= 0x381;
2633     }
2634     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2635 
2636 }
2637 
2638 
2639 /*----------------------------------------------------------------------------
2640 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2641 | half-precision floating-point value, returning the result.  After being
2642 | shifted into the proper positions, the three fields are simply added
2643 | together to form the result.  This means that any integer portion of `zSig'
2644 | will be added into the exponent.  Since a properly normalized significand
2645 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2646 | than the desired result exponent whenever `zSig' is a complete, normalized
2647 | significand.
2648 *----------------------------------------------------------------------------*/
2649 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2650 {
2651     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2652 }
2653 
2654 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2655    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2656 
2657 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2658 {
2659     flag aSign;
2660     int16 aExp;
2661     bits32 aSig;
2662 
2663     aSign = a >> 15;
2664     aExp = (a >> 10) & 0x1f;
2665     aSig = a & 0x3ff;
2666 
2667     if (aExp == 0x1f && ieee) {
2668         if (aSig) {
2669             /* Make sure correct exceptions are raised.  */
2670             float32ToCommonNaN(a STATUS_VAR);
2671             aSig |= 0x200;
2672         }
2673         return packFloat32(aSign, 0xff, aSig << 13);
2674     }
2675     if (aExp == 0) {
2676         int8 shiftCount;
2677 
2678         if (aSig == 0) {
2679             return packFloat32(aSign, 0, 0);
2680         }
2681 
2682         shiftCount = countLeadingZeros32( aSig ) - 21;
2683         aSig = aSig << shiftCount;
2684         aExp = -shiftCount;
2685     }
2686     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2687 }
2688 
2689 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2690 {
2691     flag aSign;
2692     int16 aExp;
2693     bits32 aSig;
2694     bits32 mask;
2695     bits32 increment;
2696     int8 roundingMode;
2697 
2698     aSig = extractFloat32Frac( a );
2699     aExp = extractFloat32Exp( a );
2700     aSign = extractFloat32Sign( a );
2701     if ( aExp == 0xFF ) {
2702         if (aSig) {
2703             /* Make sure correct exceptions are raised.  */
2704             float32ToCommonNaN(a STATUS_VAR);
2705             aSig |= 0x00400000;
2706         }
2707         return packFloat16(aSign, 0x1f, aSig >> 13);
2708     }
2709     if (aExp == 0 && aSign == 0) {
2710         return packFloat16(aSign, 0, 0);
2711     }
2712     /* Decimal point between bits 22 and 23.  */
2713     aSig |= 0x00800000;
2714     aExp -= 0x7f;
2715     if (aExp < -14) {
2716         mask = 0x007fffff;
2717         if (aExp < -24) {
2718             aExp = -25;
2719         } else {
2720             mask >>= 24 + aExp;
2721         }
2722     } else {
2723         mask = 0x00001fff;
2724     }
2725     if (aSig & mask) {
2726         float_raise( float_flag_underflow STATUS_VAR );
2727         roundingMode = STATUS(float_rounding_mode);
2728         switch (roundingMode) {
2729         case float_round_nearest_even:
2730             increment = (mask + 1) >> 1;
2731             if ((aSig & mask) == increment) {
2732                 increment = aSig & (increment << 1);
2733             }
2734             break;
2735         case float_round_up:
2736             increment = aSign ? 0 : mask;
2737             break;
2738         case float_round_down:
2739             increment = aSign ? mask : 0;
2740             break;
2741         default: /* round_to_zero */
2742             increment = 0;
2743             break;
2744         }
2745         aSig += increment;
2746         if (aSig >= 0x01000000) {
2747             aSig >>= 1;
2748             aExp++;
2749         }
2750     } else if (aExp < -14
2751           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2752         float_raise( float_flag_underflow STATUS_VAR);
2753     }
2754 
2755     if (ieee) {
2756         if (aExp > 15) {
2757             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2758             return packFloat16(aSign, 0x1f, 0);
2759         }
2760     } else {
2761         if (aExp > 16) {
2762             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2763             return packFloat16(aSign, 0x1f, 0x3ff);
2764         }
2765     }
2766     if (aExp < -24) {
2767         return packFloat16(aSign, 0, 0);
2768     }
2769     if (aExp < -14) {
2770         aSig >>= -14 - aExp;
2771         aExp = -14;
2772     }
2773     return packFloat16(aSign, aExp + 14, aSig >> 13);
2774 }
2775 
2776 #ifdef FLOATX80
2777 
2778 /*----------------------------------------------------------------------------
2779 | Returns the result of converting the double-precision floating-point value
2780 | `a' to the extended double-precision floating-point format.  The conversion
2781 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2782 | Arithmetic.
2783 *----------------------------------------------------------------------------*/
2784 
2785 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2786 {
2787     flag aSign;
2788     int16 aExp;
2789     bits64 aSig;
2790 
2791     aSig = extractFloat64Frac( a );
2792     aExp = extractFloat64Exp( a );
2793     aSign = extractFloat64Sign( a );
2794     if ( aExp == 0x7FF ) {
2795         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2796         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2797     }
2798     if ( aExp == 0 ) {
2799         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2800         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2801     }
2802     return
2803         packFloatx80(
2804             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2805 
2806 }
2807 
2808 #endif
2809 
2810 #ifdef FLOAT128
2811 
2812 /*----------------------------------------------------------------------------
2813 | Returns the result of converting the double-precision floating-point value
2814 | `a' to the quadruple-precision floating-point format.  The conversion is
2815 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2816 | Arithmetic.
2817 *----------------------------------------------------------------------------*/
2818 
2819 float128 float64_to_float128( float64 a STATUS_PARAM )
2820 {
2821     flag aSign;
2822     int16 aExp;
2823     bits64 aSig, zSig0, zSig1;
2824 
2825     aSig = extractFloat64Frac( a );
2826     aExp = extractFloat64Exp( a );
2827     aSign = extractFloat64Sign( a );
2828     if ( aExp == 0x7FF ) {
2829         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2830         return packFloat128( aSign, 0x7FFF, 0, 0 );
2831     }
2832     if ( aExp == 0 ) {
2833         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2834         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2835         --aExp;
2836     }
2837     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2838     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2839 
2840 }
2841 
2842 #endif
2843 
2844 /*----------------------------------------------------------------------------
2845 | Rounds the double-precision floating-point value `a' to an integer, and
2846 | returns the result as a double-precision floating-point value.  The
2847 | operation is performed according to the IEC/IEEE Standard for Binary
2848 | Floating-Point Arithmetic.
2849 *----------------------------------------------------------------------------*/
2850 
2851 float64 float64_round_to_int( float64 a STATUS_PARAM )
2852 {
2853     flag aSign;
2854     int16 aExp;
2855     bits64 lastBitMask, roundBitsMask;
2856     int8 roundingMode;
2857     bits64 z;
2858 
2859     aExp = extractFloat64Exp( a );
2860     if ( 0x433 <= aExp ) {
2861         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2862             return propagateFloat64NaN( a, a STATUS_VAR );
2863         }
2864         return a;
2865     }
2866     if ( aExp < 0x3FF ) {
2867         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2868         STATUS(float_exception_flags) |= float_flag_inexact;
2869         aSign = extractFloat64Sign( a );
2870         switch ( STATUS(float_rounding_mode) ) {
2871          case float_round_nearest_even:
2872             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2873                 return packFloat64( aSign, 0x3FF, 0 );
2874             }
2875             break;
2876          case float_round_down:
2877             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2878          case float_round_up:
2879             return make_float64(
2880             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2881         }
2882         return packFloat64( aSign, 0, 0 );
2883     }
2884     lastBitMask = 1;
2885     lastBitMask <<= 0x433 - aExp;
2886     roundBitsMask = lastBitMask - 1;
2887     z = float64_val(a);
2888     roundingMode = STATUS(float_rounding_mode);
2889     if ( roundingMode == float_round_nearest_even ) {
2890         z += lastBitMask>>1;
2891         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2892     }
2893     else if ( roundingMode != float_round_to_zero ) {
2894         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2895             z += roundBitsMask;
2896         }
2897     }
2898     z &= ~ roundBitsMask;
2899     if ( z != float64_val(a) )
2900         STATUS(float_exception_flags) |= float_flag_inexact;
2901     return make_float64(z);
2902 
2903 }
2904 
2905 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2906 {
2907     int oldmode;
2908     float64 res;
2909     oldmode = STATUS(float_rounding_mode);
2910     STATUS(float_rounding_mode) = float_round_to_zero;
2911     res = float64_round_to_int(a STATUS_VAR);
2912     STATUS(float_rounding_mode) = oldmode;
2913     return res;
2914 }
2915 
2916 /*----------------------------------------------------------------------------
2917 | Returns the result of adding the absolute values of the double-precision
2918 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2919 | before being returned.  `zSign' is ignored if the result is a NaN.
2920 | The addition is performed according to the IEC/IEEE Standard for Binary
2921 | Floating-Point Arithmetic.
2922 *----------------------------------------------------------------------------*/
2923 
2924 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2925 {
2926     int16 aExp, bExp, zExp;
2927     bits64 aSig, bSig, zSig;
2928     int16 expDiff;
2929 
2930     aSig = extractFloat64Frac( a );
2931     aExp = extractFloat64Exp( a );
2932     bSig = extractFloat64Frac( b );
2933     bExp = extractFloat64Exp( b );
2934     expDiff = aExp - bExp;
2935     aSig <<= 9;
2936     bSig <<= 9;
2937     if ( 0 < expDiff ) {
2938         if ( aExp == 0x7FF ) {
2939             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2940             return a;
2941         }
2942         if ( bExp == 0 ) {
2943             --expDiff;
2944         }
2945         else {
2946             bSig |= LIT64( 0x2000000000000000 );
2947         }
2948         shift64RightJamming( bSig, expDiff, &bSig );
2949         zExp = aExp;
2950     }
2951     else if ( expDiff < 0 ) {
2952         if ( bExp == 0x7FF ) {
2953             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2954             return packFloat64( zSign, 0x7FF, 0 );
2955         }
2956         if ( aExp == 0 ) {
2957             ++expDiff;
2958         }
2959         else {
2960             aSig |= LIT64( 0x2000000000000000 );
2961         }
2962         shift64RightJamming( aSig, - expDiff, &aSig );
2963         zExp = bExp;
2964     }
2965     else {
2966         if ( aExp == 0x7FF ) {
2967             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2968             return a;
2969         }
2970         if ( aExp == 0 ) {
2971             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2972             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2973         }
2974         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2975         zExp = aExp;
2976         goto roundAndPack;
2977     }
2978     aSig |= LIT64( 0x2000000000000000 );
2979     zSig = ( aSig + bSig )<<1;
2980     --zExp;
2981     if ( (sbits64) zSig < 0 ) {
2982         zSig = aSig + bSig;
2983         ++zExp;
2984     }
2985  roundAndPack:
2986     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2987 
2988 }
2989 
2990 /*----------------------------------------------------------------------------
2991 | Returns the result of subtracting the absolute values of the double-
2992 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2993 | difference is negated before being returned.  `zSign' is ignored if the
2994 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2995 | Standard for Binary Floating-Point Arithmetic.
2996 *----------------------------------------------------------------------------*/
2997 
2998 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2999 {
3000     int16 aExp, bExp, zExp;
3001     bits64 aSig, bSig, zSig;
3002     int16 expDiff;
3003 
3004     aSig = extractFloat64Frac( a );
3005     aExp = extractFloat64Exp( a );
3006     bSig = extractFloat64Frac( b );
3007     bExp = extractFloat64Exp( b );
3008     expDiff = aExp - bExp;
3009     aSig <<= 10;
3010     bSig <<= 10;
3011     if ( 0 < expDiff ) goto aExpBigger;
3012     if ( expDiff < 0 ) goto bExpBigger;
3013     if ( aExp == 0x7FF ) {
3014         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3015         float_raise( float_flag_invalid STATUS_VAR);
3016         return float64_default_nan;
3017     }
3018     if ( aExp == 0 ) {
3019         aExp = 1;
3020         bExp = 1;
3021     }
3022     if ( bSig < aSig ) goto aBigger;
3023     if ( aSig < bSig ) goto bBigger;
3024     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3025  bExpBigger:
3026     if ( bExp == 0x7FF ) {
3027         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3028         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3029     }
3030     if ( aExp == 0 ) {
3031         ++expDiff;
3032     }
3033     else {
3034         aSig |= LIT64( 0x4000000000000000 );
3035     }
3036     shift64RightJamming( aSig, - expDiff, &aSig );
3037     bSig |= LIT64( 0x4000000000000000 );
3038  bBigger:
3039     zSig = bSig - aSig;
3040     zExp = bExp;
3041     zSign ^= 1;
3042     goto normalizeRoundAndPack;
3043  aExpBigger:
3044     if ( aExp == 0x7FF ) {
3045         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3046         return a;
3047     }
3048     if ( bExp == 0 ) {
3049         --expDiff;
3050     }
3051     else {
3052         bSig |= LIT64( 0x4000000000000000 );
3053     }
3054     shift64RightJamming( bSig, expDiff, &bSig );
3055     aSig |= LIT64( 0x4000000000000000 );
3056  aBigger:
3057     zSig = aSig - bSig;
3058     zExp = aExp;
3059  normalizeRoundAndPack:
3060     --zExp;
3061     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3062 
3063 }
3064 
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of adding the double-precision floating-point values `a'
3067 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3068 | Binary Floating-Point Arithmetic.
3069 *----------------------------------------------------------------------------*/
3070 
3071 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3072 {
3073     flag aSign, bSign;
3074 
3075     aSign = extractFloat64Sign( a );
3076     bSign = extractFloat64Sign( b );
3077     if ( aSign == bSign ) {
3078         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3079     }
3080     else {
3081         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3082     }
3083 
3084 }
3085 
3086 /*----------------------------------------------------------------------------
3087 | Returns the result of subtracting the double-precision floating-point values
3088 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3089 | for Binary Floating-Point Arithmetic.
3090 *----------------------------------------------------------------------------*/
3091 
3092 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3093 {
3094     flag aSign, bSign;
3095 
3096     aSign = extractFloat64Sign( a );
3097     bSign = extractFloat64Sign( b );
3098     if ( aSign == bSign ) {
3099         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3100     }
3101     else {
3102         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3103     }
3104 
3105 }
3106 
3107 /*----------------------------------------------------------------------------
3108 | Returns the result of multiplying the double-precision floating-point values
3109 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3110 | for Binary Floating-Point Arithmetic.
3111 *----------------------------------------------------------------------------*/
3112 
3113 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3114 {
3115     flag aSign, bSign, zSign;
3116     int16 aExp, bExp, zExp;
3117     bits64 aSig, bSig, zSig0, zSig1;
3118 
3119     aSig = extractFloat64Frac( a );
3120     aExp = extractFloat64Exp( a );
3121     aSign = extractFloat64Sign( a );
3122     bSig = extractFloat64Frac( b );
3123     bExp = extractFloat64Exp( b );
3124     bSign = extractFloat64Sign( b );
3125     zSign = aSign ^ bSign;
3126     if ( aExp == 0x7FF ) {
3127         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3128             return propagateFloat64NaN( a, b STATUS_VAR );
3129         }
3130         if ( ( bExp | bSig ) == 0 ) {
3131             float_raise( float_flag_invalid STATUS_VAR);
3132             return float64_default_nan;
3133         }
3134         return packFloat64( zSign, 0x7FF, 0 );
3135     }
3136     if ( bExp == 0x7FF ) {
3137         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3138         if ( ( aExp | aSig ) == 0 ) {
3139             float_raise( float_flag_invalid STATUS_VAR);
3140             return float64_default_nan;
3141         }
3142         return packFloat64( zSign, 0x7FF, 0 );
3143     }
3144     if ( aExp == 0 ) {
3145         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3146         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3147     }
3148     if ( bExp == 0 ) {
3149         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3150         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3151     }
3152     zExp = aExp + bExp - 0x3FF;
3153     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3154     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3155     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3156     zSig0 |= ( zSig1 != 0 );
3157     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3158         zSig0 <<= 1;
3159         --zExp;
3160     }
3161     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3162 
3163 }
3164 
3165 /*----------------------------------------------------------------------------
3166 | Returns the result of dividing the double-precision floating-point value `a'
3167 | by the corresponding value `b'.  The operation is performed according to
3168 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3169 *----------------------------------------------------------------------------*/
3170 
3171 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3172 {
3173     flag aSign, bSign, zSign;
3174     int16 aExp, bExp, zExp;
3175     bits64 aSig, bSig, zSig;
3176     bits64 rem0, rem1;
3177     bits64 term0, term1;
3178 
3179     aSig = extractFloat64Frac( a );
3180     aExp = extractFloat64Exp( a );
3181     aSign = extractFloat64Sign( a );
3182     bSig = extractFloat64Frac( b );
3183     bExp = extractFloat64Exp( b );
3184     bSign = extractFloat64Sign( b );
3185     zSign = aSign ^ bSign;
3186     if ( aExp == 0x7FF ) {
3187         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3188         if ( bExp == 0x7FF ) {
3189             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3190             float_raise( float_flag_invalid STATUS_VAR);
3191             return float64_default_nan;
3192         }
3193         return packFloat64( zSign, 0x7FF, 0 );
3194     }
3195     if ( bExp == 0x7FF ) {
3196         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3197         return packFloat64( zSign, 0, 0 );
3198     }
3199     if ( bExp == 0 ) {
3200         if ( bSig == 0 ) {
3201             if ( ( aExp | aSig ) == 0 ) {
3202                 float_raise( float_flag_invalid STATUS_VAR);
3203                 return float64_default_nan;
3204             }
3205             float_raise( float_flag_divbyzero STATUS_VAR);
3206             return packFloat64( zSign, 0x7FF, 0 );
3207         }
3208         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3209     }
3210     if ( aExp == 0 ) {
3211         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3212         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3213     }
3214     zExp = aExp - bExp + 0x3FD;
3215     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3216     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3217     if ( bSig <= ( aSig + aSig ) ) {
3218         aSig >>= 1;
3219         ++zExp;
3220     }
3221     zSig = estimateDiv128To64( aSig, 0, bSig );
3222     if ( ( zSig & 0x1FF ) <= 2 ) {
3223         mul64To128( bSig, zSig, &term0, &term1 );
3224         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3225         while ( (sbits64) rem0 < 0 ) {
3226             --zSig;
3227             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3228         }
3229         zSig |= ( rem1 != 0 );
3230     }
3231     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3232 
3233 }
3234 
3235 /*----------------------------------------------------------------------------
3236 | Returns the remainder of the double-precision floating-point value `a'
3237 | with respect to the corresponding value `b'.  The operation is performed
3238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3240 
3241 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3242 {
3243     flag aSign, zSign;
3244     int16 aExp, bExp, expDiff;
3245     bits64 aSig, bSig;
3246     bits64 q, alternateASig;
3247     sbits64 sigMean;
3248 
3249     aSig = extractFloat64Frac( a );
3250     aExp = extractFloat64Exp( a );
3251     aSign = extractFloat64Sign( a );
3252     bSig = extractFloat64Frac( b );
3253     bExp = extractFloat64Exp( b );
3254     if ( aExp == 0x7FF ) {
3255         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3256             return propagateFloat64NaN( a, b STATUS_VAR );
3257         }
3258         float_raise( float_flag_invalid STATUS_VAR);
3259         return float64_default_nan;
3260     }
3261     if ( bExp == 0x7FF ) {
3262         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3263         return a;
3264     }
3265     if ( bExp == 0 ) {
3266         if ( bSig == 0 ) {
3267             float_raise( float_flag_invalid STATUS_VAR);
3268             return float64_default_nan;
3269         }
3270         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3271     }
3272     if ( aExp == 0 ) {
3273         if ( aSig == 0 ) return a;
3274         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3275     }
3276     expDiff = aExp - bExp;
3277     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3278     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3279     if ( expDiff < 0 ) {
3280         if ( expDiff < -1 ) return a;
3281         aSig >>= 1;
3282     }
3283     q = ( bSig <= aSig );
3284     if ( q ) aSig -= bSig;
3285     expDiff -= 64;
3286     while ( 0 < expDiff ) {
3287         q = estimateDiv128To64( aSig, 0, bSig );
3288         q = ( 2 < q ) ? q - 2 : 0;
3289         aSig = - ( ( bSig>>2 ) * q );
3290         expDiff -= 62;
3291     }
3292     expDiff += 64;
3293     if ( 0 < expDiff ) {
3294         q = estimateDiv128To64( aSig, 0, bSig );
3295         q = ( 2 < q ) ? q - 2 : 0;
3296         q >>= 64 - expDiff;
3297         bSig >>= 2;
3298         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3299     }
3300     else {
3301         aSig >>= 2;
3302         bSig >>= 2;
3303     }
3304     do {
3305         alternateASig = aSig;
3306         ++q;
3307         aSig -= bSig;
3308     } while ( 0 <= (sbits64) aSig );
3309     sigMean = aSig + alternateASig;
3310     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3311         aSig = alternateASig;
3312     }
3313     zSign = ( (sbits64) aSig < 0 );
3314     if ( zSign ) aSig = - aSig;
3315     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3316 
3317 }
3318 
3319 /*----------------------------------------------------------------------------
3320 | Returns the square root of the double-precision floating-point value `a'.
3321 | The operation is performed according to the IEC/IEEE Standard for Binary
3322 | Floating-Point Arithmetic.
3323 *----------------------------------------------------------------------------*/
3324 
3325 float64 float64_sqrt( float64 a STATUS_PARAM )
3326 {
3327     flag aSign;
3328     int16 aExp, zExp;
3329     bits64 aSig, zSig, doubleZSig;
3330     bits64 rem0, rem1, term0, term1;
3331 
3332     aSig = extractFloat64Frac( a );
3333     aExp = extractFloat64Exp( a );
3334     aSign = extractFloat64Sign( a );
3335     if ( aExp == 0x7FF ) {
3336         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3337         if ( ! aSign ) return a;
3338         float_raise( float_flag_invalid STATUS_VAR);
3339         return float64_default_nan;
3340     }
3341     if ( aSign ) {
3342         if ( ( aExp | aSig ) == 0 ) return a;
3343         float_raise( float_flag_invalid STATUS_VAR);
3344         return float64_default_nan;
3345     }
3346     if ( aExp == 0 ) {
3347         if ( aSig == 0 ) return float64_zero;
3348         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3349     }
3350     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3351     aSig |= LIT64( 0x0010000000000000 );
3352     zSig = estimateSqrt32( aExp, aSig>>21 );
3353     aSig <<= 9 - ( aExp & 1 );
3354     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3355     if ( ( zSig & 0x1FF ) <= 5 ) {
3356         doubleZSig = zSig<<1;
3357         mul64To128( zSig, zSig, &term0, &term1 );
3358         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3359         while ( (sbits64) rem0 < 0 ) {
3360             --zSig;
3361             doubleZSig -= 2;
3362             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3363         }
3364         zSig |= ( ( rem0 | rem1 ) != 0 );
3365     }
3366     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3367 
3368 }
3369 
3370 /*----------------------------------------------------------------------------
3371 | Returns the binary log of the double-precision floating-point value `a'.
3372 | The operation is performed according to the IEC/IEEE Standard for Binary
3373 | Floating-Point Arithmetic.
3374 *----------------------------------------------------------------------------*/
3375 float64 float64_log2( float64 a STATUS_PARAM )
3376 {
3377     flag aSign, zSign;
3378     int16 aExp;
3379     bits64 aSig, aSig0, aSig1, zSig, i;
3380 
3381     aSig = extractFloat64Frac( a );
3382     aExp = extractFloat64Exp( a );
3383     aSign = extractFloat64Sign( a );
3384 
3385     if ( aExp == 0 ) {
3386         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3387         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3388     }
3389     if ( aSign ) {
3390         float_raise( float_flag_invalid STATUS_VAR);
3391         return float64_default_nan;
3392     }
3393     if ( aExp == 0x7FF ) {
3394         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3395         return a;
3396     }
3397 
3398     aExp -= 0x3FF;
3399     aSig |= LIT64( 0x0010000000000000 );
3400     zSign = aExp < 0;
3401     zSig = (bits64)aExp << 52;
3402     for (i = 1LL << 51; i > 0; i >>= 1) {
3403         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3404         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3405         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3406             aSig >>= 1;
3407             zSig |= i;
3408         }
3409     }
3410 
3411     if ( zSign )
3412         zSig = -zSig;
3413     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3414 }
3415 
3416 /*----------------------------------------------------------------------------
3417 | Returns 1 if the double-precision floating-point value `a' is equal to the
3418 | corresponding value `b', and 0 otherwise.  The comparison is performed
3419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3420 *----------------------------------------------------------------------------*/
3421 
3422 int float64_eq( float64 a, float64 b STATUS_PARAM )
3423 {
3424     bits64 av, bv;
3425 
3426     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3427          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3428        ) {
3429         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3430             float_raise( float_flag_invalid STATUS_VAR);
3431         }
3432         return 0;
3433     }
3434     av = float64_val(a);
3435     bv = float64_val(b);
3436     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3437 
3438 }
3439 
3440 /*----------------------------------------------------------------------------
3441 | Returns 1 if the double-precision floating-point value `a' is less than or
3442 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3443 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3444 | Arithmetic.
3445 *----------------------------------------------------------------------------*/
3446 
3447 int float64_le( float64 a, float64 b STATUS_PARAM )
3448 {
3449     flag aSign, bSign;
3450     bits64 av, bv;
3451 
3452     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3453          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3454        ) {
3455         float_raise( float_flag_invalid STATUS_VAR);
3456         return 0;
3457     }
3458     aSign = extractFloat64Sign( a );
3459     bSign = extractFloat64Sign( b );
3460     av = float64_val(a);
3461     bv = float64_val(b);
3462     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3463     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3464 
3465 }
3466 
3467 /*----------------------------------------------------------------------------
3468 | Returns 1 if the double-precision floating-point value `a' is less than
3469 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3471 *----------------------------------------------------------------------------*/
3472 
3473 int float64_lt( float64 a, float64 b STATUS_PARAM )
3474 {
3475     flag aSign, bSign;
3476     bits64 av, bv;
3477 
3478     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3479          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3480        ) {
3481         float_raise( float_flag_invalid STATUS_VAR);
3482         return 0;
3483     }
3484     aSign = extractFloat64Sign( a );
3485     bSign = extractFloat64Sign( b );
3486     av = float64_val(a);
3487     bv = float64_val(b);
3488     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3489     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3490 
3491 }
3492 
3493 /*----------------------------------------------------------------------------
3494 | Returns 1 if the double-precision floating-point value `a' is equal to the
3495 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3496 | if either operand is a NaN.  Otherwise, the comparison is performed
3497 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3498 *----------------------------------------------------------------------------*/
3499 
3500 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3501 {
3502     bits64 av, bv;
3503 
3504     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3505          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3506        ) {
3507         float_raise( float_flag_invalid STATUS_VAR);
3508         return 0;
3509     }
3510     av = float64_val(a);
3511     bv = float64_val(b);
3512     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3513 
3514 }
3515 
3516 /*----------------------------------------------------------------------------
3517 | Returns 1 if the double-precision floating-point value `a' is less than or
3518 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3519 | cause an exception.  Otherwise, the comparison is performed according to the
3520 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3521 *----------------------------------------------------------------------------*/
3522 
3523 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3524 {
3525     flag aSign, bSign;
3526     bits64 av, bv;
3527 
3528     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3529          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3530        ) {
3531         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3532             float_raise( float_flag_invalid STATUS_VAR);
3533         }
3534         return 0;
3535     }
3536     aSign = extractFloat64Sign( a );
3537     bSign = extractFloat64Sign( b );
3538     av = float64_val(a);
3539     bv = float64_val(b);
3540     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3541     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3542 
3543 }
3544 
3545 /*----------------------------------------------------------------------------
3546 | Returns 1 if the double-precision floating-point value `a' is less than
3547 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3548 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3549 | Standard for Binary Floating-Point Arithmetic.
3550 *----------------------------------------------------------------------------*/
3551 
3552 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3553 {
3554     flag aSign, bSign;
3555     bits64 av, bv;
3556 
3557     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3558          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3559        ) {
3560         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3561             float_raise( float_flag_invalid STATUS_VAR);
3562         }
3563         return 0;
3564     }
3565     aSign = extractFloat64Sign( a );
3566     bSign = extractFloat64Sign( b );
3567     av = float64_val(a);
3568     bv = float64_val(b);
3569     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3570     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3571 
3572 }
3573 
3574 #ifdef FLOATX80
3575 
3576 /*----------------------------------------------------------------------------
3577 | Returns the result of converting the extended double-precision floating-
3578 | point value `a' to the 32-bit two's complement integer format.  The
3579 | conversion is performed according to the IEC/IEEE Standard for Binary
3580 | Floating-Point Arithmetic---which means in particular that the conversion
3581 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3582 | largest positive integer is returned.  Otherwise, if the conversion
3583 | overflows, the largest integer with the same sign as `a' is returned.
3584 *----------------------------------------------------------------------------*/
3585 
3586 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3587 {
3588     flag aSign;
3589     int32 aExp, shiftCount;
3590     bits64 aSig;
3591 
3592     aSig = extractFloatx80Frac( a );
3593     aExp = extractFloatx80Exp( a );
3594     aSign = extractFloatx80Sign( a );
3595     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3596     shiftCount = 0x4037 - aExp;
3597     if ( shiftCount <= 0 ) shiftCount = 1;
3598     shift64RightJamming( aSig, shiftCount, &aSig );
3599     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3600 
3601 }
3602 
3603 /*----------------------------------------------------------------------------
3604 | Returns the result of converting the extended double-precision floating-
3605 | point value `a' to the 32-bit two's complement integer format.  The
3606 | conversion is performed according to the IEC/IEEE Standard for Binary
3607 | Floating-Point Arithmetic, except that the conversion is always rounded
3608 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3609 | Otherwise, if the conversion overflows, the largest integer with the same
3610 | sign as `a' is returned.
3611 *----------------------------------------------------------------------------*/
3612 
3613 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3614 {
3615     flag aSign;
3616     int32 aExp, shiftCount;
3617     bits64 aSig, savedASig;
3618     int32 z;
3619 
3620     aSig = extractFloatx80Frac( a );
3621     aExp = extractFloatx80Exp( a );
3622     aSign = extractFloatx80Sign( a );
3623     if ( 0x401E < aExp ) {
3624         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3625         goto invalid;
3626     }
3627     else if ( aExp < 0x3FFF ) {
3628         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3629         return 0;
3630     }
3631     shiftCount = 0x403E - aExp;
3632     savedASig = aSig;
3633     aSig >>= shiftCount;
3634     z = aSig;
3635     if ( aSign ) z = - z;
3636     if ( ( z < 0 ) ^ aSign ) {
3637  invalid:
3638         float_raise( float_flag_invalid STATUS_VAR);
3639         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3640     }
3641     if ( ( aSig<<shiftCount ) != savedASig ) {
3642         STATUS(float_exception_flags) |= float_flag_inexact;
3643     }
3644     return z;
3645 
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the extended double-precision floating-
3650 | point value `a' to the 64-bit two's complement integer format.  The
3651 | conversion is performed according to the IEC/IEEE Standard for Binary
3652 | Floating-Point Arithmetic---which means in particular that the conversion
3653 | is rounded according to the current rounding mode.  If `a' is a NaN,
3654 | the largest positive integer is returned.  Otherwise, if the conversion
3655 | overflows, the largest integer with the same sign as `a' is returned.
3656 *----------------------------------------------------------------------------*/
3657 
3658 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3659 {
3660     flag aSign;
3661     int32 aExp, shiftCount;
3662     bits64 aSig, aSigExtra;
3663 
3664     aSig = extractFloatx80Frac( a );
3665     aExp = extractFloatx80Exp( a );
3666     aSign = extractFloatx80Sign( a );
3667     shiftCount = 0x403E - aExp;
3668     if ( shiftCount <= 0 ) {
3669         if ( shiftCount ) {
3670             float_raise( float_flag_invalid STATUS_VAR);
3671             if (    ! aSign
3672                  || (    ( aExp == 0x7FFF )
3673                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3674                ) {
3675                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3676             }
3677             return (sbits64) LIT64( 0x8000000000000000 );
3678         }
3679         aSigExtra = 0;
3680     }
3681     else {
3682         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3683     }
3684     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3685 
3686 }
3687 
3688 /*----------------------------------------------------------------------------
3689 | Returns the result of converting the extended double-precision floating-
3690 | point value `a' to the 64-bit two's complement integer format.  The
3691 | conversion is performed according to the IEC/IEEE Standard for Binary
3692 | Floating-Point Arithmetic, except that the conversion is always rounded
3693 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3694 | Otherwise, if the conversion overflows, the largest integer with the same
3695 | sign as `a' is returned.
3696 *----------------------------------------------------------------------------*/
3697 
3698 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3699 {
3700     flag aSign;
3701     int32 aExp, shiftCount;
3702     bits64 aSig;
3703     int64 z;
3704 
3705     aSig = extractFloatx80Frac( a );
3706     aExp = extractFloatx80Exp( a );
3707     aSign = extractFloatx80Sign( a );
3708     shiftCount = aExp - 0x403E;
3709     if ( 0 <= shiftCount ) {
3710         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3711         if ( ( a.high != 0xC03E ) || aSig ) {
3712             float_raise( float_flag_invalid STATUS_VAR);
3713             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3714                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3715             }
3716         }
3717         return (sbits64) LIT64( 0x8000000000000000 );
3718     }
3719     else if ( aExp < 0x3FFF ) {
3720         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3721         return 0;
3722     }
3723     z = aSig>>( - shiftCount );
3724     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3725         STATUS(float_exception_flags) |= float_flag_inexact;
3726     }
3727     if ( aSign ) z = - z;
3728     return z;
3729 
3730 }
3731 
3732 /*----------------------------------------------------------------------------
3733 | Returns the result of converting the extended double-precision floating-
3734 | point value `a' to the single-precision floating-point format.  The
3735 | conversion is performed according to the IEC/IEEE Standard for Binary
3736 | Floating-Point Arithmetic.
3737 *----------------------------------------------------------------------------*/
3738 
3739 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3740 {
3741     flag aSign;
3742     int32 aExp;
3743     bits64 aSig;
3744 
3745     aSig = extractFloatx80Frac( a );
3746     aExp = extractFloatx80Exp( a );
3747     aSign = extractFloatx80Sign( a );
3748     if ( aExp == 0x7FFF ) {
3749         if ( (bits64) ( aSig<<1 ) ) {
3750             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3751         }
3752         return packFloat32( aSign, 0xFF, 0 );
3753     }
3754     shift64RightJamming( aSig, 33, &aSig );
3755     if ( aExp || aSig ) aExp -= 0x3F81;
3756     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3757 
3758 }
3759 
3760 /*----------------------------------------------------------------------------
3761 | Returns the result of converting the extended double-precision floating-
3762 | point value `a' to the double-precision floating-point format.  The
3763 | conversion is performed according to the IEC/IEEE Standard for Binary
3764 | Floating-Point Arithmetic.
3765 *----------------------------------------------------------------------------*/
3766 
3767 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3768 {
3769     flag aSign;
3770     int32 aExp;
3771     bits64 aSig, zSig;
3772 
3773     aSig = extractFloatx80Frac( a );
3774     aExp = extractFloatx80Exp( a );
3775     aSign = extractFloatx80Sign( a );
3776     if ( aExp == 0x7FFF ) {
3777         if ( (bits64) ( aSig<<1 ) ) {
3778             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3779         }
3780         return packFloat64( aSign, 0x7FF, 0 );
3781     }
3782     shift64RightJamming( aSig, 1, &zSig );
3783     if ( aExp || aSig ) aExp -= 0x3C01;
3784     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3785 
3786 }
3787 
3788 #ifdef FLOAT128
3789 
3790 /*----------------------------------------------------------------------------
3791 | Returns the result of converting the extended double-precision floating-
3792 | point value `a' to the quadruple-precision floating-point format.  The
3793 | conversion is performed according to the IEC/IEEE Standard for Binary
3794 | Floating-Point Arithmetic.
3795 *----------------------------------------------------------------------------*/
3796 
3797 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3798 {
3799     flag aSign;
3800     int16 aExp;
3801     bits64 aSig, zSig0, zSig1;
3802 
3803     aSig = extractFloatx80Frac( a );
3804     aExp = extractFloatx80Exp( a );
3805     aSign = extractFloatx80Sign( a );
3806     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3807         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3808     }
3809     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3810     return packFloat128( aSign, aExp, zSig0, zSig1 );
3811 
3812 }
3813 
3814 #endif
3815 
3816 /*----------------------------------------------------------------------------
3817 | Rounds the extended double-precision floating-point value `a' to an integer,
3818 | and returns the result as an extended quadruple-precision floating-point
3819 | value.  The operation is performed according to the IEC/IEEE Standard for
3820 | Binary Floating-Point Arithmetic.
3821 *----------------------------------------------------------------------------*/
3822 
3823 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3824 {
3825     flag aSign;
3826     int32 aExp;
3827     bits64 lastBitMask, roundBitsMask;
3828     int8 roundingMode;
3829     floatx80 z;
3830 
3831     aExp = extractFloatx80Exp( a );
3832     if ( 0x403E <= aExp ) {
3833         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3834             return propagateFloatx80NaN( a, a STATUS_VAR );
3835         }
3836         return a;
3837     }
3838     if ( aExp < 0x3FFF ) {
3839         if (    ( aExp == 0 )
3840              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3841             return a;
3842         }
3843         STATUS(float_exception_flags) |= float_flag_inexact;
3844         aSign = extractFloatx80Sign( a );
3845         switch ( STATUS(float_rounding_mode) ) {
3846          case float_round_nearest_even:
3847             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3848                ) {
3849                 return
3850                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3851             }
3852             break;
3853          case float_round_down:
3854             return
3855                   aSign ?
3856                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3857                 : packFloatx80( 0, 0, 0 );
3858          case float_round_up:
3859             return
3860                   aSign ? packFloatx80( 1, 0, 0 )
3861                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3862         }
3863         return packFloatx80( aSign, 0, 0 );
3864     }
3865     lastBitMask = 1;
3866     lastBitMask <<= 0x403E - aExp;
3867     roundBitsMask = lastBitMask - 1;
3868     z = a;
3869     roundingMode = STATUS(float_rounding_mode);
3870     if ( roundingMode == float_round_nearest_even ) {
3871         z.low += lastBitMask>>1;
3872         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3873     }
3874     else if ( roundingMode != float_round_to_zero ) {
3875         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3876             z.low += roundBitsMask;
3877         }
3878     }
3879     z.low &= ~ roundBitsMask;
3880     if ( z.low == 0 ) {
3881         ++z.high;
3882         z.low = LIT64( 0x8000000000000000 );
3883     }
3884     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3885     return z;
3886 
3887 }
3888 
3889 /*----------------------------------------------------------------------------
3890 | Returns the result of adding the absolute values of the extended double-
3891 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3892 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3893 | The addition is performed according to the IEC/IEEE Standard for Binary
3894 | Floating-Point Arithmetic.
3895 *----------------------------------------------------------------------------*/
3896 
3897 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3898 {
3899     int32 aExp, bExp, zExp;
3900     bits64 aSig, bSig, zSig0, zSig1;
3901     int32 expDiff;
3902 
3903     aSig = extractFloatx80Frac( a );
3904     aExp = extractFloatx80Exp( a );
3905     bSig = extractFloatx80Frac( b );
3906     bExp = extractFloatx80Exp( b );
3907     expDiff = aExp - bExp;
3908     if ( 0 < expDiff ) {
3909         if ( aExp == 0x7FFF ) {
3910             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3911             return a;
3912         }
3913         if ( bExp == 0 ) --expDiff;
3914         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3915         zExp = aExp;
3916     }
3917     else if ( expDiff < 0 ) {
3918         if ( bExp == 0x7FFF ) {
3919             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3920             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3921         }
3922         if ( aExp == 0 ) ++expDiff;
3923         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3924         zExp = bExp;
3925     }
3926     else {
3927         if ( aExp == 0x7FFF ) {
3928             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3929                 return propagateFloatx80NaN( a, b STATUS_VAR );
3930             }
3931             return a;
3932         }
3933         zSig1 = 0;
3934         zSig0 = aSig + bSig;
3935         if ( aExp == 0 ) {
3936             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3937             goto roundAndPack;
3938         }
3939         zExp = aExp;
3940         goto shiftRight1;
3941     }
3942     zSig0 = aSig + bSig;
3943     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3944  shiftRight1:
3945     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3946     zSig0 |= LIT64( 0x8000000000000000 );
3947     ++zExp;
3948  roundAndPack:
3949     return
3950         roundAndPackFloatx80(
3951             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3952 
3953 }
3954 
3955 /*----------------------------------------------------------------------------
3956 | Returns the result of subtracting the absolute values of the extended
3957 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3958 | difference is negated before being returned.  `zSign' is ignored if the
3959 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3960 | Standard for Binary Floating-Point Arithmetic.
3961 *----------------------------------------------------------------------------*/
3962 
3963 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3964 {
3965     int32 aExp, bExp, zExp;
3966     bits64 aSig, bSig, zSig0, zSig1;
3967     int32 expDiff;
3968     floatx80 z;
3969 
3970     aSig = extractFloatx80Frac( a );
3971     aExp = extractFloatx80Exp( a );
3972     bSig = extractFloatx80Frac( b );
3973     bExp = extractFloatx80Exp( b );
3974     expDiff = aExp - bExp;
3975     if ( 0 < expDiff ) goto aExpBigger;
3976     if ( expDiff < 0 ) goto bExpBigger;
3977     if ( aExp == 0x7FFF ) {
3978         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3979             return propagateFloatx80NaN( a, b STATUS_VAR );
3980         }
3981         float_raise( float_flag_invalid STATUS_VAR);
3982         z.low = floatx80_default_nan_low;
3983         z.high = floatx80_default_nan_high;
3984         return z;
3985     }
3986     if ( aExp == 0 ) {
3987         aExp = 1;
3988         bExp = 1;
3989     }
3990     zSig1 = 0;
3991     if ( bSig < aSig ) goto aBigger;
3992     if ( aSig < bSig ) goto bBigger;
3993     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3994  bExpBigger:
3995     if ( bExp == 0x7FFF ) {
3996         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3997         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3998     }
3999     if ( aExp == 0 ) ++expDiff;
4000     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4001  bBigger:
4002     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4003     zExp = bExp;
4004     zSign ^= 1;
4005     goto normalizeRoundAndPack;
4006  aExpBigger:
4007     if ( aExp == 0x7FFF ) {
4008         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4009         return a;
4010     }
4011     if ( bExp == 0 ) --expDiff;
4012     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4013  aBigger:
4014     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4015     zExp = aExp;
4016  normalizeRoundAndPack:
4017     return
4018         normalizeRoundAndPackFloatx80(
4019             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4020 
4021 }
4022 
4023 /*----------------------------------------------------------------------------
4024 | Returns the result of adding the extended double-precision floating-point
4025 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4026 | Standard for Binary Floating-Point Arithmetic.
4027 *----------------------------------------------------------------------------*/
4028 
4029 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4030 {
4031     flag aSign, bSign;
4032 
4033     aSign = extractFloatx80Sign( a );
4034     bSign = extractFloatx80Sign( b );
4035     if ( aSign == bSign ) {
4036         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4037     }
4038     else {
4039         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4040     }
4041 
4042 }
4043 
4044 /*----------------------------------------------------------------------------
4045 | Returns the result of subtracting the extended double-precision floating-
4046 | point values `a' and `b'.  The operation is performed according to the
4047 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4048 *----------------------------------------------------------------------------*/
4049 
4050 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4051 {
4052     flag aSign, bSign;
4053 
4054     aSign = extractFloatx80Sign( a );
4055     bSign = extractFloatx80Sign( b );
4056     if ( aSign == bSign ) {
4057         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4058     }
4059     else {
4060         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4061     }
4062 
4063 }
4064 
4065 /*----------------------------------------------------------------------------
4066 | Returns the result of multiplying the extended double-precision floating-
4067 | point values `a' and `b'.  The operation is performed according to the
4068 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4069 *----------------------------------------------------------------------------*/
4070 
4071 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4072 {
4073     flag aSign, bSign, zSign;
4074     int32 aExp, bExp, zExp;
4075     bits64 aSig, bSig, zSig0, zSig1;
4076     floatx80 z;
4077 
4078     aSig = extractFloatx80Frac( a );
4079     aExp = extractFloatx80Exp( a );
4080     aSign = extractFloatx80Sign( a );
4081     bSig = extractFloatx80Frac( b );
4082     bExp = extractFloatx80Exp( b );
4083     bSign = extractFloatx80Sign( b );
4084     zSign = aSign ^ bSign;
4085     if ( aExp == 0x7FFF ) {
4086         if (    (bits64) ( aSig<<1 )
4087              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4088             return propagateFloatx80NaN( a, b STATUS_VAR );
4089         }
4090         if ( ( bExp | bSig ) == 0 ) goto invalid;
4091         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4092     }
4093     if ( bExp == 0x7FFF ) {
4094         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4095         if ( ( aExp | aSig ) == 0 ) {
4096  invalid:
4097             float_raise( float_flag_invalid STATUS_VAR);
4098             z.low = floatx80_default_nan_low;
4099             z.high = floatx80_default_nan_high;
4100             return z;
4101         }
4102         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4103     }
4104     if ( aExp == 0 ) {
4105         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4106         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4107     }
4108     if ( bExp == 0 ) {
4109         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4110         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4111     }
4112     zExp = aExp + bExp - 0x3FFE;
4113     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4114     if ( 0 < (sbits64) zSig0 ) {
4115         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4116         --zExp;
4117     }
4118     return
4119         roundAndPackFloatx80(
4120             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4121 
4122 }
4123 
4124 /*----------------------------------------------------------------------------
4125 | Returns the result of dividing the extended double-precision floating-point
4126 | value `a' by the corresponding value `b'.  The operation is performed
4127 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4128 *----------------------------------------------------------------------------*/
4129 
4130 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4131 {
4132     flag aSign, bSign, zSign;
4133     int32 aExp, bExp, zExp;
4134     bits64 aSig, bSig, zSig0, zSig1;
4135     bits64 rem0, rem1, rem2, term0, term1, term2;
4136     floatx80 z;
4137 
4138     aSig = extractFloatx80Frac( a );
4139     aExp = extractFloatx80Exp( a );
4140     aSign = extractFloatx80Sign( a );
4141     bSig = extractFloatx80Frac( b );
4142     bExp = extractFloatx80Exp( b );
4143     bSign = extractFloatx80Sign( b );
4144     zSign = aSign ^ bSign;
4145     if ( aExp == 0x7FFF ) {
4146         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4147         if ( bExp == 0x7FFF ) {
4148             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4149             goto invalid;
4150         }
4151         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4152     }
4153     if ( bExp == 0x7FFF ) {
4154         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4155         return packFloatx80( zSign, 0, 0 );
4156     }
4157     if ( bExp == 0 ) {
4158         if ( bSig == 0 ) {
4159             if ( ( aExp | aSig ) == 0 ) {
4160  invalid:
4161                 float_raise( float_flag_invalid STATUS_VAR);
4162                 z.low = floatx80_default_nan_low;
4163                 z.high = floatx80_default_nan_high;
4164                 return z;
4165             }
4166             float_raise( float_flag_divbyzero STATUS_VAR);
4167             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4168         }
4169         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4170     }
4171     if ( aExp == 0 ) {
4172         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4173         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4174     }
4175     zExp = aExp - bExp + 0x3FFE;
4176     rem1 = 0;
4177     if ( bSig <= aSig ) {
4178         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4179         ++zExp;
4180     }
4181     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4182     mul64To128( bSig, zSig0, &term0, &term1 );
4183     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4184     while ( (sbits64) rem0 < 0 ) {
4185         --zSig0;
4186         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4187     }
4188     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4189     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4190         mul64To128( bSig, zSig1, &term1, &term2 );
4191         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4192         while ( (sbits64) rem1 < 0 ) {
4193             --zSig1;
4194             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4195         }
4196         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4197     }
4198     return
4199         roundAndPackFloatx80(
4200             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4201 
4202 }
4203 
4204 /*----------------------------------------------------------------------------
4205 | Returns the remainder of the extended double-precision floating-point value
4206 | `a' with respect to the corresponding value `b'.  The operation is performed
4207 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4208 *----------------------------------------------------------------------------*/
4209 
4210 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4211 {
4212     flag aSign, zSign;
4213     int32 aExp, bExp, expDiff;
4214     bits64 aSig0, aSig1, bSig;
4215     bits64 q, term0, term1, alternateASig0, alternateASig1;
4216     floatx80 z;
4217 
4218     aSig0 = extractFloatx80Frac( a );
4219     aExp = extractFloatx80Exp( a );
4220     aSign = extractFloatx80Sign( a );
4221     bSig = extractFloatx80Frac( b );
4222     bExp = extractFloatx80Exp( b );
4223     if ( aExp == 0x7FFF ) {
4224         if (    (bits64) ( aSig0<<1 )
4225              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4226             return propagateFloatx80NaN( a, b STATUS_VAR );
4227         }
4228         goto invalid;
4229     }
4230     if ( bExp == 0x7FFF ) {
4231         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4232         return a;
4233     }
4234     if ( bExp == 0 ) {
4235         if ( bSig == 0 ) {
4236  invalid:
4237             float_raise( float_flag_invalid STATUS_VAR);
4238             z.low = floatx80_default_nan_low;
4239             z.high = floatx80_default_nan_high;
4240             return z;
4241         }
4242         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4243     }
4244     if ( aExp == 0 ) {
4245         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4246         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4247     }
4248     bSig |= LIT64( 0x8000000000000000 );
4249     zSign = aSign;
4250     expDiff = aExp - bExp;
4251     aSig1 = 0;
4252     if ( expDiff < 0 ) {
4253         if ( expDiff < -1 ) return a;
4254         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4255         expDiff = 0;
4256     }
4257     q = ( bSig <= aSig0 );
4258     if ( q ) aSig0 -= bSig;
4259     expDiff -= 64;
4260     while ( 0 < expDiff ) {
4261         q = estimateDiv128To64( aSig0, aSig1, bSig );
4262         q = ( 2 < q ) ? q - 2 : 0;
4263         mul64To128( bSig, q, &term0, &term1 );
4264         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4265         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4266         expDiff -= 62;
4267     }
4268     expDiff += 64;
4269     if ( 0 < expDiff ) {
4270         q = estimateDiv128To64( aSig0, aSig1, bSig );
4271         q = ( 2 < q ) ? q - 2 : 0;
4272         q >>= 64 - expDiff;
4273         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4274         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4275         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4276         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4277             ++q;
4278             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4279         }
4280     }
4281     else {
4282         term1 = 0;
4283         term0 = bSig;
4284     }
4285     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4286     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4287          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4288               && ( q & 1 ) )
4289        ) {
4290         aSig0 = alternateASig0;
4291         aSig1 = alternateASig1;
4292         zSign = ! zSign;
4293     }
4294     return
4295         normalizeRoundAndPackFloatx80(
4296             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4297 
4298 }
4299 
4300 /*----------------------------------------------------------------------------
4301 | Returns the square root of the extended double-precision floating-point
4302 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4303 | for Binary Floating-Point Arithmetic.
4304 *----------------------------------------------------------------------------*/
4305 
4306 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4307 {
4308     flag aSign;
4309     int32 aExp, zExp;
4310     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4311     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4312     floatx80 z;
4313 
4314     aSig0 = extractFloatx80Frac( a );
4315     aExp = extractFloatx80Exp( a );
4316     aSign = extractFloatx80Sign( a );
4317     if ( aExp == 0x7FFF ) {
4318         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4319         if ( ! aSign ) return a;
4320         goto invalid;
4321     }
4322     if ( aSign ) {
4323         if ( ( aExp | aSig0 ) == 0 ) return a;
4324  invalid:
4325         float_raise( float_flag_invalid STATUS_VAR);
4326         z.low = floatx80_default_nan_low;
4327         z.high = floatx80_default_nan_high;
4328         return z;
4329     }
4330     if ( aExp == 0 ) {
4331         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4332         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4333     }
4334     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4335     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4336     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4337     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4338     doubleZSig0 = zSig0<<1;
4339     mul64To128( zSig0, zSig0, &term0, &term1 );
4340     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4341     while ( (sbits64) rem0 < 0 ) {
4342         --zSig0;
4343         doubleZSig0 -= 2;
4344         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4345     }
4346     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4347     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4348         if ( zSig1 == 0 ) zSig1 = 1;
4349         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4350         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4351         mul64To128( zSig1, zSig1, &term2, &term3 );
4352         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4353         while ( (sbits64) rem1 < 0 ) {
4354             --zSig1;
4355             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4356             term3 |= 1;
4357             term2 |= doubleZSig0;
4358             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4359         }
4360         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4361     }
4362     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4363     zSig0 |= doubleZSig0;
4364     return
4365         roundAndPackFloatx80(
4366             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4367 
4368 }
4369 
4370 /*----------------------------------------------------------------------------
4371 | Returns 1 if the extended double-precision floating-point value `a' is
4372 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4374 | Arithmetic.
4375 *----------------------------------------------------------------------------*/
4376 
4377 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4378 {
4379 
4380     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4381               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4382          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4383               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4384        ) {
4385         if (    floatx80_is_signaling_nan( a )
4386              || floatx80_is_signaling_nan( b ) ) {
4387             float_raise( float_flag_invalid STATUS_VAR);
4388         }
4389         return 0;
4390     }
4391     return
4392            ( a.low == b.low )
4393         && (    ( a.high == b.high )
4394              || (    ( a.low == 0 )
4395                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4396            );
4397 
4398 }
4399 
4400 /*----------------------------------------------------------------------------
4401 | Returns 1 if the extended double-precision floating-point value `a' is
4402 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4403 | comparison is performed according to the IEC/IEEE Standard for Binary
4404 | Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4406 
4407 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4408 {
4409     flag aSign, bSign;
4410 
4411     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4412               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4413          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4414               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4415        ) {
4416         float_raise( float_flag_invalid STATUS_VAR);
4417         return 0;
4418     }
4419     aSign = extractFloatx80Sign( a );
4420     bSign = extractFloatx80Sign( b );
4421     if ( aSign != bSign ) {
4422         return
4423                aSign
4424             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4425                  == 0 );
4426     }
4427     return
4428           aSign ? le128( b.high, b.low, a.high, a.low )
4429         : le128( a.high, a.low, b.high, b.low );
4430 
4431 }
4432 
4433 /*----------------------------------------------------------------------------
4434 | Returns 1 if the extended double-precision floating-point value `a' is
4435 | less than the corresponding value `b', and 0 otherwise.  The comparison
4436 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4437 | Arithmetic.
4438 *----------------------------------------------------------------------------*/
4439 
4440 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4441 {
4442     flag aSign, bSign;
4443 
4444     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4445               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4446          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4447               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4448        ) {
4449         float_raise( float_flag_invalid STATUS_VAR);
4450         return 0;
4451     }
4452     aSign = extractFloatx80Sign( a );
4453     bSign = extractFloatx80Sign( b );
4454     if ( aSign != bSign ) {
4455         return
4456                aSign
4457             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4458                  != 0 );
4459     }
4460     return
4461           aSign ? lt128( b.high, b.low, a.high, a.low )
4462         : lt128( a.high, a.low, b.high, b.low );
4463 
4464 }
4465 
4466 /*----------------------------------------------------------------------------
4467 | Returns 1 if the extended double-precision floating-point value `a' is equal
4468 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4469 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4471 *----------------------------------------------------------------------------*/
4472 
4473 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4474 {
4475 
4476     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4477               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4478          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4479               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4480        ) {
4481         float_raise( float_flag_invalid STATUS_VAR);
4482         return 0;
4483     }
4484     return
4485            ( a.low == b.low )
4486         && (    ( a.high == b.high )
4487              || (    ( a.low == 0 )
4488                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4489            );
4490 
4491 }
4492 
4493 /*----------------------------------------------------------------------------
4494 | Returns 1 if the extended double-precision floating-point value `a' is less
4495 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4496 | do not cause an exception.  Otherwise, the comparison is performed according
4497 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4498 *----------------------------------------------------------------------------*/
4499 
4500 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4501 {
4502     flag aSign, bSign;
4503 
4504     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4505               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4506          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4507               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4508        ) {
4509         if (    floatx80_is_signaling_nan( a )
4510              || floatx80_is_signaling_nan( b ) ) {
4511             float_raise( float_flag_invalid STATUS_VAR);
4512         }
4513         return 0;
4514     }
4515     aSign = extractFloatx80Sign( a );
4516     bSign = extractFloatx80Sign( b );
4517     if ( aSign != bSign ) {
4518         return
4519                aSign
4520             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4521                  == 0 );
4522     }
4523     return
4524           aSign ? le128( b.high, b.low, a.high, a.low )
4525         : le128( a.high, a.low, b.high, b.low );
4526 
4527 }
4528 
4529 /*----------------------------------------------------------------------------
4530 | Returns 1 if the extended double-precision floating-point value `a' is less
4531 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4532 | an exception.  Otherwise, the comparison is performed according to the
4533 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4534 *----------------------------------------------------------------------------*/
4535 
4536 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4537 {
4538     flag aSign, bSign;
4539 
4540     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4541               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4542          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4543               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4544        ) {
4545         if (    floatx80_is_signaling_nan( a )
4546              || floatx80_is_signaling_nan( b ) ) {
4547             float_raise( float_flag_invalid STATUS_VAR);
4548         }
4549         return 0;
4550     }
4551     aSign = extractFloatx80Sign( a );
4552     bSign = extractFloatx80Sign( b );
4553     if ( aSign != bSign ) {
4554         return
4555                aSign
4556             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4557                  != 0 );
4558     }
4559     return
4560           aSign ? lt128( b.high, b.low, a.high, a.low )
4561         : lt128( a.high, a.low, b.high, b.low );
4562 
4563 }
4564 
4565 #endif
4566 
4567 #ifdef FLOAT128
4568 
4569 /*----------------------------------------------------------------------------
4570 | Returns the result of converting the quadruple-precision floating-point
4571 | value `a' to the 32-bit two's complement integer format.  The conversion
4572 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4573 | Arithmetic---which means in particular that the conversion is rounded
4574 | according to the current rounding mode.  If `a' is a NaN, the largest
4575 | positive integer is returned.  Otherwise, if the conversion overflows, the
4576 | largest integer with the same sign as `a' is returned.
4577 *----------------------------------------------------------------------------*/
4578 
4579 int32 float128_to_int32( float128 a STATUS_PARAM )
4580 {
4581     flag aSign;
4582     int32 aExp, shiftCount;
4583     bits64 aSig0, aSig1;
4584 
4585     aSig1 = extractFloat128Frac1( a );
4586     aSig0 = extractFloat128Frac0( a );
4587     aExp = extractFloat128Exp( a );
4588     aSign = extractFloat128Sign( a );
4589     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4590     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4591     aSig0 |= ( aSig1 != 0 );
4592     shiftCount = 0x4028 - aExp;
4593     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4594     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4595 
4596 }
4597 
4598 /*----------------------------------------------------------------------------
4599 | Returns the result of converting the quadruple-precision floating-point
4600 | value `a' to the 32-bit two's complement integer format.  The conversion
4601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4602 | Arithmetic, except that the conversion is always rounded toward zero.  If
4603 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4604 | conversion overflows, the largest integer with the same sign as `a' is
4605 | returned.
4606 *----------------------------------------------------------------------------*/
4607 
4608 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4609 {
4610     flag aSign;
4611     int32 aExp, shiftCount;
4612     bits64 aSig0, aSig1, savedASig;
4613     int32 z;
4614 
4615     aSig1 = extractFloat128Frac1( a );
4616     aSig0 = extractFloat128Frac0( a );
4617     aExp = extractFloat128Exp( a );
4618     aSign = extractFloat128Sign( a );
4619     aSig0 |= ( aSig1 != 0 );
4620     if ( 0x401E < aExp ) {
4621         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4622         goto invalid;
4623     }
4624     else if ( aExp < 0x3FFF ) {
4625         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4626         return 0;
4627     }
4628     aSig0 |= LIT64( 0x0001000000000000 );
4629     shiftCount = 0x402F - aExp;
4630     savedASig = aSig0;
4631     aSig0 >>= shiftCount;
4632     z = aSig0;
4633     if ( aSign ) z = - z;
4634     if ( ( z < 0 ) ^ aSign ) {
4635  invalid:
4636         float_raise( float_flag_invalid STATUS_VAR);
4637         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4638     }
4639     if ( ( aSig0<<shiftCount ) != savedASig ) {
4640         STATUS(float_exception_flags) |= float_flag_inexact;
4641     }
4642     return z;
4643 
4644 }
4645 
4646 /*----------------------------------------------------------------------------
4647 | Returns the result of converting the quadruple-precision floating-point
4648 | value `a' to the 64-bit two's complement integer format.  The conversion
4649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4650 | Arithmetic---which means in particular that the conversion is rounded
4651 | according to the current rounding mode.  If `a' is a NaN, the largest
4652 | positive integer is returned.  Otherwise, if the conversion overflows, the
4653 | largest integer with the same sign as `a' is returned.
4654 *----------------------------------------------------------------------------*/
4655 
4656 int64 float128_to_int64( float128 a STATUS_PARAM )
4657 {
4658     flag aSign;
4659     int32 aExp, shiftCount;
4660     bits64 aSig0, aSig1;
4661 
4662     aSig1 = extractFloat128Frac1( a );
4663     aSig0 = extractFloat128Frac0( a );
4664     aExp = extractFloat128Exp( a );
4665     aSign = extractFloat128Sign( a );
4666     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4667     shiftCount = 0x402F - aExp;
4668     if ( shiftCount <= 0 ) {
4669         if ( 0x403E < aExp ) {
4670             float_raise( float_flag_invalid STATUS_VAR);
4671             if (    ! aSign
4672                  || (    ( aExp == 0x7FFF )
4673                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4674                     )
4675                ) {
4676                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4677             }
4678             return (sbits64) LIT64( 0x8000000000000000 );
4679         }
4680         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4681     }
4682     else {
4683         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4684     }
4685     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4686 
4687 }
4688 
4689 /*----------------------------------------------------------------------------
4690 | Returns the result of converting the quadruple-precision floating-point
4691 | value `a' to the 64-bit two's complement integer format.  The conversion
4692 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4693 | Arithmetic, except that the conversion is always rounded toward zero.
4694 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4695 | the conversion overflows, the largest integer with the same sign as `a' is
4696 | returned.
4697 *----------------------------------------------------------------------------*/
4698 
4699 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4700 {
4701     flag aSign;
4702     int32 aExp, shiftCount;
4703     bits64 aSig0, aSig1;
4704     int64 z;
4705 
4706     aSig1 = extractFloat128Frac1( a );
4707     aSig0 = extractFloat128Frac0( a );
4708     aExp = extractFloat128Exp( a );
4709     aSign = extractFloat128Sign( a );
4710     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4711     shiftCount = aExp - 0x402F;
4712     if ( 0 < shiftCount ) {
4713         if ( 0x403E <= aExp ) {
4714             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4715             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4716                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4717                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4718             }
4719             else {
4720                 float_raise( float_flag_invalid STATUS_VAR);
4721                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4722                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4723                 }
4724             }
4725             return (sbits64) LIT64( 0x8000000000000000 );
4726         }
4727         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4728         if ( (bits64) ( aSig1<<shiftCount ) ) {
4729             STATUS(float_exception_flags) |= float_flag_inexact;
4730         }
4731     }
4732     else {
4733         if ( aExp < 0x3FFF ) {
4734             if ( aExp | aSig0 | aSig1 ) {
4735                 STATUS(float_exception_flags) |= float_flag_inexact;
4736             }
4737             return 0;
4738         }
4739         z = aSig0>>( - shiftCount );
4740         if (    aSig1
4741              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4742             STATUS(float_exception_flags) |= float_flag_inexact;
4743         }
4744     }
4745     if ( aSign ) z = - z;
4746     return z;
4747 
4748 }
4749 
4750 /*----------------------------------------------------------------------------
4751 | Returns the result of converting the quadruple-precision floating-point
4752 | value `a' to the single-precision floating-point format.  The conversion
4753 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4754 | Arithmetic.
4755 *----------------------------------------------------------------------------*/
4756 
4757 float32 float128_to_float32( float128 a STATUS_PARAM )
4758 {
4759     flag aSign;
4760     int32 aExp;
4761     bits64 aSig0, aSig1;
4762     bits32 zSig;
4763 
4764     aSig1 = extractFloat128Frac1( a );
4765     aSig0 = extractFloat128Frac0( a );
4766     aExp = extractFloat128Exp( a );
4767     aSign = extractFloat128Sign( a );
4768     if ( aExp == 0x7FFF ) {
4769         if ( aSig0 | aSig1 ) {
4770             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4771         }
4772         return packFloat32( aSign, 0xFF, 0 );
4773     }
4774     aSig0 |= ( aSig1 != 0 );
4775     shift64RightJamming( aSig0, 18, &aSig0 );
4776     zSig = aSig0;
4777     if ( aExp || zSig ) {
4778         zSig |= 0x40000000;
4779         aExp -= 0x3F81;
4780     }
4781     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4782 
4783 }
4784 
4785 /*----------------------------------------------------------------------------
4786 | Returns the result of converting the quadruple-precision floating-point
4787 | value `a' to the double-precision floating-point format.  The conversion
4788 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4789 | Arithmetic.
4790 *----------------------------------------------------------------------------*/
4791 
4792 float64 float128_to_float64( float128 a STATUS_PARAM )
4793 {
4794     flag aSign;
4795     int32 aExp;
4796     bits64 aSig0, aSig1;
4797 
4798     aSig1 = extractFloat128Frac1( a );
4799     aSig0 = extractFloat128Frac0( a );
4800     aExp = extractFloat128Exp( a );
4801     aSign = extractFloat128Sign( a );
4802     if ( aExp == 0x7FFF ) {
4803         if ( aSig0 | aSig1 ) {
4804             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4805         }
4806         return packFloat64( aSign, 0x7FF, 0 );
4807     }
4808     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4809     aSig0 |= ( aSig1 != 0 );
4810     if ( aExp || aSig0 ) {
4811         aSig0 |= LIT64( 0x4000000000000000 );
4812         aExp -= 0x3C01;
4813     }
4814     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4815 
4816 }
4817 
4818 #ifdef FLOATX80
4819 
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the quadruple-precision floating-point
4822 | value `a' to the extended double-precision floating-point format.  The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic.
4825 *----------------------------------------------------------------------------*/
4826 
4827 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4828 {
4829     flag aSign;
4830     int32 aExp;
4831     bits64 aSig0, aSig1;
4832 
4833     aSig1 = extractFloat128Frac1( a );
4834     aSig0 = extractFloat128Frac0( a );
4835     aExp = extractFloat128Exp( a );
4836     aSign = extractFloat128Sign( a );
4837     if ( aExp == 0x7FFF ) {
4838         if ( aSig0 | aSig1 ) {
4839             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4840         }
4841         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4842     }
4843     if ( aExp == 0 ) {
4844         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4845         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4846     }
4847     else {
4848         aSig0 |= LIT64( 0x0001000000000000 );
4849     }
4850     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4851     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4852 
4853 }
4854 
4855 #endif
4856 
4857 /*----------------------------------------------------------------------------
4858 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4859 | returns the result as a quadruple-precision floating-point value.  The
4860 | operation is performed according to the IEC/IEEE Standard for Binary
4861 | Floating-Point Arithmetic.
4862 *----------------------------------------------------------------------------*/
4863 
4864 float128 float128_round_to_int( float128 a STATUS_PARAM )
4865 {
4866     flag aSign;
4867     int32 aExp;
4868     bits64 lastBitMask, roundBitsMask;
4869     int8 roundingMode;
4870     float128 z;
4871 
4872     aExp = extractFloat128Exp( a );
4873     if ( 0x402F <= aExp ) {
4874         if ( 0x406F <= aExp ) {
4875             if (    ( aExp == 0x7FFF )
4876                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4877                ) {
4878                 return propagateFloat128NaN( a, a STATUS_VAR );
4879             }
4880             return a;
4881         }
4882         lastBitMask = 1;
4883         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4884         roundBitsMask = lastBitMask - 1;
4885         z = a;
4886         roundingMode = STATUS(float_rounding_mode);
4887         if ( roundingMode == float_round_nearest_even ) {
4888             if ( lastBitMask ) {
4889                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4890                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4891             }
4892             else {
4893                 if ( (sbits64) z.low < 0 ) {
4894                     ++z.high;
4895                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4896                 }
4897             }
4898         }
4899         else if ( roundingMode != float_round_to_zero ) {
4900             if (   extractFloat128Sign( z )
4901                  ^ ( roundingMode == float_round_up ) ) {
4902                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4903             }
4904         }
4905         z.low &= ~ roundBitsMask;
4906     }
4907     else {
4908         if ( aExp < 0x3FFF ) {
4909             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4910             STATUS(float_exception_flags) |= float_flag_inexact;
4911             aSign = extractFloat128Sign( a );
4912             switch ( STATUS(float_rounding_mode) ) {
4913              case float_round_nearest_even:
4914                 if (    ( aExp == 0x3FFE )
4915                      && (   extractFloat128Frac0( a )
4916                           | extractFloat128Frac1( a ) )
4917                    ) {
4918                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4919                 }
4920                 break;
4921              case float_round_down:
4922                 return
4923                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4924                     : packFloat128( 0, 0, 0, 0 );
4925              case float_round_up:
4926                 return
4927                       aSign ? packFloat128( 1, 0, 0, 0 )
4928                     : packFloat128( 0, 0x3FFF, 0, 0 );
4929             }
4930             return packFloat128( aSign, 0, 0, 0 );
4931         }
4932         lastBitMask = 1;
4933         lastBitMask <<= 0x402F - aExp;
4934         roundBitsMask = lastBitMask - 1;
4935         z.low = 0;
4936         z.high = a.high;
4937         roundingMode = STATUS(float_rounding_mode);
4938         if ( roundingMode == float_round_nearest_even ) {
4939             z.high += lastBitMask>>1;
4940             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4941                 z.high &= ~ lastBitMask;
4942             }
4943         }
4944         else if ( roundingMode != float_round_to_zero ) {
4945             if (   extractFloat128Sign( z )
4946                  ^ ( roundingMode == float_round_up ) ) {
4947                 z.high |= ( a.low != 0 );
4948                 z.high += roundBitsMask;
4949             }
4950         }
4951         z.high &= ~ roundBitsMask;
4952     }
4953     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4954         STATUS(float_exception_flags) |= float_flag_inexact;
4955     }
4956     return z;
4957 
4958 }
4959 
4960 /*----------------------------------------------------------------------------
4961 | Returns the result of adding the absolute values of the quadruple-precision
4962 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4963 | before being returned.  `zSign' is ignored if the result is a NaN.
4964 | The addition is performed according to the IEC/IEEE Standard for Binary
4965 | Floating-Point Arithmetic.
4966 *----------------------------------------------------------------------------*/
4967 
4968 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4969 {
4970     int32 aExp, bExp, zExp;
4971     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4972     int32 expDiff;
4973 
4974     aSig1 = extractFloat128Frac1( a );
4975     aSig0 = extractFloat128Frac0( a );
4976     aExp = extractFloat128Exp( a );
4977     bSig1 = extractFloat128Frac1( b );
4978     bSig0 = extractFloat128Frac0( b );
4979     bExp = extractFloat128Exp( b );
4980     expDiff = aExp - bExp;
4981     if ( 0 < expDiff ) {
4982         if ( aExp == 0x7FFF ) {
4983             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4984             return a;
4985         }
4986         if ( bExp == 0 ) {
4987             --expDiff;
4988         }
4989         else {
4990             bSig0 |= LIT64( 0x0001000000000000 );
4991         }
4992         shift128ExtraRightJamming(
4993             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4994         zExp = aExp;
4995     }
4996     else if ( expDiff < 0 ) {
4997         if ( bExp == 0x7FFF ) {
4998             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4999             return packFloat128( zSign, 0x7FFF, 0, 0 );
5000         }
5001         if ( aExp == 0 ) {
5002             ++expDiff;
5003         }
5004         else {
5005             aSig0 |= LIT64( 0x0001000000000000 );
5006         }
5007         shift128ExtraRightJamming(
5008             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5009         zExp = bExp;
5010     }
5011     else {
5012         if ( aExp == 0x7FFF ) {
5013             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5014                 return propagateFloat128NaN( a, b STATUS_VAR );
5015             }
5016             return a;
5017         }
5018         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5019         if ( aExp == 0 ) {
5020             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5021             return packFloat128( zSign, 0, zSig0, zSig1 );
5022         }
5023         zSig2 = 0;
5024         zSig0 |= LIT64( 0x0002000000000000 );
5025         zExp = aExp;
5026         goto shiftRight1;
5027     }
5028     aSig0 |= LIT64( 0x0001000000000000 );
5029     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5030     --zExp;
5031     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5032     ++zExp;
5033  shiftRight1:
5034     shift128ExtraRightJamming(
5035         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5036  roundAndPack:
5037     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5038 
5039 }
5040 
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of subtracting the absolute values of the quadruple-
5043 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5044 | difference is negated before being returned.  `zSign' is ignored if the
5045 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5046 | Standard for Binary Floating-Point Arithmetic.
5047 *----------------------------------------------------------------------------*/
5048 
5049 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5050 {
5051     int32 aExp, bExp, zExp;
5052     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5053     int32 expDiff;
5054     float128 z;
5055 
5056     aSig1 = extractFloat128Frac1( a );
5057     aSig0 = extractFloat128Frac0( a );
5058     aExp = extractFloat128Exp( a );
5059     bSig1 = extractFloat128Frac1( b );
5060     bSig0 = extractFloat128Frac0( b );
5061     bExp = extractFloat128Exp( b );
5062     expDiff = aExp - bExp;
5063     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5064     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5065     if ( 0 < expDiff ) goto aExpBigger;
5066     if ( expDiff < 0 ) goto bExpBigger;
5067     if ( aExp == 0x7FFF ) {
5068         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5069             return propagateFloat128NaN( a, b STATUS_VAR );
5070         }
5071         float_raise( float_flag_invalid STATUS_VAR);
5072         z.low = float128_default_nan_low;
5073         z.high = float128_default_nan_high;
5074         return z;
5075     }
5076     if ( aExp == 0 ) {
5077         aExp = 1;
5078         bExp = 1;
5079     }
5080     if ( bSig0 < aSig0 ) goto aBigger;
5081     if ( aSig0 < bSig0 ) goto bBigger;
5082     if ( bSig1 < aSig1 ) goto aBigger;
5083     if ( aSig1 < bSig1 ) goto bBigger;
5084     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5085  bExpBigger:
5086     if ( bExp == 0x7FFF ) {
5087         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5088         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5089     }
5090     if ( aExp == 0 ) {
5091         ++expDiff;
5092     }
5093     else {
5094         aSig0 |= LIT64( 0x4000000000000000 );
5095     }
5096     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5097     bSig0 |= LIT64( 0x4000000000000000 );
5098  bBigger:
5099     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5100     zExp = bExp;
5101     zSign ^= 1;
5102     goto normalizeRoundAndPack;
5103  aExpBigger:
5104     if ( aExp == 0x7FFF ) {
5105         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5106         return a;
5107     }
5108     if ( bExp == 0 ) {
5109         --expDiff;
5110     }
5111     else {
5112         bSig0 |= LIT64( 0x4000000000000000 );
5113     }
5114     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5115     aSig0 |= LIT64( 0x4000000000000000 );
5116  aBigger:
5117     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5118     zExp = aExp;
5119  normalizeRoundAndPack:
5120     --zExp;
5121     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5122 
5123 }
5124 
5125 /*----------------------------------------------------------------------------
5126 | Returns the result of adding the quadruple-precision floating-point values
5127 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5128 | for Binary Floating-Point Arithmetic.
5129 *----------------------------------------------------------------------------*/
5130 
5131 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5132 {
5133     flag aSign, bSign;
5134 
5135     aSign = extractFloat128Sign( a );
5136     bSign = extractFloat128Sign( b );
5137     if ( aSign == bSign ) {
5138         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5139     }
5140     else {
5141         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5142     }
5143 
5144 }
5145 
5146 /*----------------------------------------------------------------------------
5147 | Returns the result of subtracting the quadruple-precision floating-point
5148 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5149 | Standard for Binary Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151 
5152 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5153 {
5154     flag aSign, bSign;
5155 
5156     aSign = extractFloat128Sign( a );
5157     bSign = extractFloat128Sign( b );
5158     if ( aSign == bSign ) {
5159         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5160     }
5161     else {
5162         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5163     }
5164 
5165 }
5166 
5167 /*----------------------------------------------------------------------------
5168 | Returns the result of multiplying the quadruple-precision floating-point
5169 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5170 | Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172 
5173 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5174 {
5175     flag aSign, bSign, zSign;
5176     int32 aExp, bExp, zExp;
5177     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5178     float128 z;
5179 
5180     aSig1 = extractFloat128Frac1( a );
5181     aSig0 = extractFloat128Frac0( a );
5182     aExp = extractFloat128Exp( a );
5183     aSign = extractFloat128Sign( a );
5184     bSig1 = extractFloat128Frac1( b );
5185     bSig0 = extractFloat128Frac0( b );
5186     bExp = extractFloat128Exp( b );
5187     bSign = extractFloat128Sign( b );
5188     zSign = aSign ^ bSign;
5189     if ( aExp == 0x7FFF ) {
5190         if (    ( aSig0 | aSig1 )
5191              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5192             return propagateFloat128NaN( a, b STATUS_VAR );
5193         }
5194         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5195         return packFloat128( zSign, 0x7FFF, 0, 0 );
5196     }
5197     if ( bExp == 0x7FFF ) {
5198         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5199         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5200  invalid:
5201             float_raise( float_flag_invalid STATUS_VAR);
5202             z.low = float128_default_nan_low;
5203             z.high = float128_default_nan_high;
5204             return z;
5205         }
5206         return packFloat128( zSign, 0x7FFF, 0, 0 );
5207     }
5208     if ( aExp == 0 ) {
5209         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5210         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5211     }
5212     if ( bExp == 0 ) {
5213         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5214         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5215     }
5216     zExp = aExp + bExp - 0x4000;
5217     aSig0 |= LIT64( 0x0001000000000000 );
5218     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5219     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5220     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5221     zSig2 |= ( zSig3 != 0 );
5222     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5223         shift128ExtraRightJamming(
5224             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5225         ++zExp;
5226     }
5227     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5228 
5229 }
5230 
5231 /*----------------------------------------------------------------------------
5232 | Returns the result of dividing the quadruple-precision floating-point value
5233 | `a' by the corresponding value `b'.  The operation is performed according to
5234 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5235 *----------------------------------------------------------------------------*/
5236 
5237 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5238 {
5239     flag aSign, bSign, zSign;
5240     int32 aExp, bExp, zExp;
5241     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5242     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5243     float128 z;
5244 
5245     aSig1 = extractFloat128Frac1( a );
5246     aSig0 = extractFloat128Frac0( a );
5247     aExp = extractFloat128Exp( a );
5248     aSign = extractFloat128Sign( a );
5249     bSig1 = extractFloat128Frac1( b );
5250     bSig0 = extractFloat128Frac0( b );
5251     bExp = extractFloat128Exp( b );
5252     bSign = extractFloat128Sign( b );
5253     zSign = aSign ^ bSign;
5254     if ( aExp == 0x7FFF ) {
5255         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5256         if ( bExp == 0x7FFF ) {
5257             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5258             goto invalid;
5259         }
5260         return packFloat128( zSign, 0x7FFF, 0, 0 );
5261     }
5262     if ( bExp == 0x7FFF ) {
5263         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5264         return packFloat128( zSign, 0, 0, 0 );
5265     }
5266     if ( bExp == 0 ) {
5267         if ( ( bSig0 | bSig1 ) == 0 ) {
5268             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5269  invalid:
5270                 float_raise( float_flag_invalid STATUS_VAR);
5271                 z.low = float128_default_nan_low;
5272                 z.high = float128_default_nan_high;
5273                 return z;
5274             }
5275             float_raise( float_flag_divbyzero STATUS_VAR);
5276             return packFloat128( zSign, 0x7FFF, 0, 0 );
5277         }
5278         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5279     }
5280     if ( aExp == 0 ) {
5281         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5282         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5283     }
5284     zExp = aExp - bExp + 0x3FFD;
5285     shortShift128Left(
5286         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5287     shortShift128Left(
5288         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5289     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5290         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5291         ++zExp;
5292     }
5293     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5294     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5295     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5296     while ( (sbits64) rem0 < 0 ) {
5297         --zSig0;
5298         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5299     }
5300     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5301     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5302         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5303         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5304         while ( (sbits64) rem1 < 0 ) {
5305             --zSig1;
5306             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5307         }
5308         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5309     }
5310     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5311     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5312 
5313 }
5314 
5315 /*----------------------------------------------------------------------------
5316 | Returns the remainder of the quadruple-precision floating-point value `a'
5317 | with respect to the corresponding value `b'.  The operation is performed
5318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5319 *----------------------------------------------------------------------------*/
5320 
5321 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5322 {
5323     flag aSign, zSign;
5324     int32 aExp, bExp, expDiff;
5325     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5326     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5327     sbits64 sigMean0;
5328     float128 z;
5329 
5330     aSig1 = extractFloat128Frac1( a );
5331     aSig0 = extractFloat128Frac0( a );
5332     aExp = extractFloat128Exp( a );
5333     aSign = extractFloat128Sign( a );
5334     bSig1 = extractFloat128Frac1( b );
5335     bSig0 = extractFloat128Frac0( b );
5336     bExp = extractFloat128Exp( b );
5337     if ( aExp == 0x7FFF ) {
5338         if (    ( aSig0 | aSig1 )
5339              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5340             return propagateFloat128NaN( a, b STATUS_VAR );
5341         }
5342         goto invalid;
5343     }
5344     if ( bExp == 0x7FFF ) {
5345         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5346         return a;
5347     }
5348     if ( bExp == 0 ) {
5349         if ( ( bSig0 | bSig1 ) == 0 ) {
5350  invalid:
5351             float_raise( float_flag_invalid STATUS_VAR);
5352             z.low = float128_default_nan_low;
5353             z.high = float128_default_nan_high;
5354             return z;
5355         }
5356         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5357     }
5358     if ( aExp == 0 ) {
5359         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5360         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5361     }
5362     expDiff = aExp - bExp;
5363     if ( expDiff < -1 ) return a;
5364     shortShift128Left(
5365         aSig0 | LIT64( 0x0001000000000000 ),
5366         aSig1,
5367         15 - ( expDiff < 0 ),
5368         &aSig0,
5369         &aSig1
5370     );
5371     shortShift128Left(
5372         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5373     q = le128( bSig0, bSig1, aSig0, aSig1 );
5374     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5375     expDiff -= 64;
5376     while ( 0 < expDiff ) {
5377         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5378         q = ( 4 < q ) ? q - 4 : 0;
5379         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5380         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5381         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5382         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5383         expDiff -= 61;
5384     }
5385     if ( -64 < expDiff ) {
5386         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5387         q = ( 4 < q ) ? q - 4 : 0;
5388         q >>= - expDiff;
5389         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5390         expDiff += 52;
5391         if ( expDiff < 0 ) {
5392             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5393         }
5394         else {
5395             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5396         }
5397         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5398         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5399     }
5400     else {
5401         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5402         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5403     }
5404     do {
5405         alternateASig0 = aSig0;
5406         alternateASig1 = aSig1;
5407         ++q;
5408         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5409     } while ( 0 <= (sbits64) aSig0 );
5410     add128(
5411         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5412     if (    ( sigMean0 < 0 )
5413          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5414         aSig0 = alternateASig0;
5415         aSig1 = alternateASig1;
5416     }
5417     zSign = ( (sbits64) aSig0 < 0 );
5418     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5419     return
5420         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5421 
5422 }
5423 
5424 /*----------------------------------------------------------------------------
5425 | Returns the square root of the quadruple-precision floating-point value `a'.
5426 | The operation is performed according to the IEC/IEEE Standard for Binary
5427 | Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5429 
5430 float128 float128_sqrt( float128 a STATUS_PARAM )
5431 {
5432     flag aSign;
5433     int32 aExp, zExp;
5434     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5435     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5436     float128 z;
5437 
5438     aSig1 = extractFloat128Frac1( a );
5439     aSig0 = extractFloat128Frac0( a );
5440     aExp = extractFloat128Exp( a );
5441     aSign = extractFloat128Sign( a );
5442     if ( aExp == 0x7FFF ) {
5443         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5444         if ( ! aSign ) return a;
5445         goto invalid;
5446     }
5447     if ( aSign ) {
5448         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5449  invalid:
5450         float_raise( float_flag_invalid STATUS_VAR);
5451         z.low = float128_default_nan_low;
5452         z.high = float128_default_nan_high;
5453         return z;
5454     }
5455     if ( aExp == 0 ) {
5456         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5457         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5458     }
5459     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5460     aSig0 |= LIT64( 0x0001000000000000 );
5461     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5462     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5463     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5464     doubleZSig0 = zSig0<<1;
5465     mul64To128( zSig0, zSig0, &term0, &term1 );
5466     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5467     while ( (sbits64) rem0 < 0 ) {
5468         --zSig0;
5469         doubleZSig0 -= 2;
5470         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5471     }
5472     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5473     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5474         if ( zSig1 == 0 ) zSig1 = 1;
5475         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5476         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5477         mul64To128( zSig1, zSig1, &term2, &term3 );
5478         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5479         while ( (sbits64) rem1 < 0 ) {
5480             --zSig1;
5481             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5482             term3 |= 1;
5483             term2 |= doubleZSig0;
5484             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5485         }
5486         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5487     }
5488     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5489     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5490 
5491 }
5492 
5493 /*----------------------------------------------------------------------------
5494 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5495 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498 
5499 int float128_eq( float128 a, float128 b STATUS_PARAM )
5500 {
5501 
5502     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5503               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5504          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5505               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5506        ) {
5507         if (    float128_is_signaling_nan( a )
5508              || float128_is_signaling_nan( b ) ) {
5509             float_raise( float_flag_invalid STATUS_VAR);
5510         }
5511         return 0;
5512     }
5513     return
5514            ( a.low == b.low )
5515         && (    ( a.high == b.high )
5516              || (    ( a.low == 0 )
5517                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5518            );
5519 
5520 }
5521 
5522 /*----------------------------------------------------------------------------
5523 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5524 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5526 | Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528 
5529 int float128_le( float128 a, float128 b STATUS_PARAM )
5530 {
5531     flag aSign, bSign;
5532 
5533     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5534               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5535          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5536               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5537        ) {
5538         float_raise( float_flag_invalid STATUS_VAR);
5539         return 0;
5540     }
5541     aSign = extractFloat128Sign( a );
5542     bSign = extractFloat128Sign( b );
5543     if ( aSign != bSign ) {
5544         return
5545                aSign
5546             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5547                  == 0 );
5548     }
5549     return
5550           aSign ? le128( b.high, b.low, a.high, a.low )
5551         : le128( a.high, a.low, b.high, b.low );
5552 
5553 }
5554 
5555 /*----------------------------------------------------------------------------
5556 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5557 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5558 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5559 *----------------------------------------------------------------------------*/
5560 
5561 int float128_lt( float128 a, float128 b STATUS_PARAM )
5562 {
5563     flag aSign, bSign;
5564 
5565     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5566               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5567          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5568               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5569        ) {
5570         float_raise( float_flag_invalid STATUS_VAR);
5571         return 0;
5572     }
5573     aSign = extractFloat128Sign( a );
5574     bSign = extractFloat128Sign( b );
5575     if ( aSign != bSign ) {
5576         return
5577                aSign
5578             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5579                  != 0 );
5580     }
5581     return
5582           aSign ? lt128( b.high, b.low, a.high, a.low )
5583         : lt128( a.high, a.low, b.high, b.low );
5584 
5585 }
5586 
5587 /*----------------------------------------------------------------------------
5588 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5589 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5590 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5592 *----------------------------------------------------------------------------*/
5593 
5594 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5595 {
5596 
5597     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5598               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5599          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5600               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5601        ) {
5602         float_raise( float_flag_invalid STATUS_VAR);
5603         return 0;
5604     }
5605     return
5606            ( a.low == b.low )
5607         && (    ( a.high == b.high )
5608              || (    ( a.low == 0 )
5609                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5610            );
5611 
5612 }
5613 
5614 /*----------------------------------------------------------------------------
5615 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5616 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5617 | cause an exception.  Otherwise, the comparison is performed according to the
5618 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5619 *----------------------------------------------------------------------------*/
5620 
5621 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5622 {
5623     flag aSign, bSign;
5624 
5625     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5626               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5627          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5628               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5629        ) {
5630         if (    float128_is_signaling_nan( a )
5631              || float128_is_signaling_nan( b ) ) {
5632             float_raise( float_flag_invalid STATUS_VAR);
5633         }
5634         return 0;
5635     }
5636     aSign = extractFloat128Sign( a );
5637     bSign = extractFloat128Sign( b );
5638     if ( aSign != bSign ) {
5639         return
5640                aSign
5641             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5642                  == 0 );
5643     }
5644     return
5645           aSign ? le128( b.high, b.low, a.high, a.low )
5646         : le128( a.high, a.low, b.high, b.low );
5647 
5648 }
5649 
5650 /*----------------------------------------------------------------------------
5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5652 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5653 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5654 | Standard for Binary Floating-Point Arithmetic.
5655 *----------------------------------------------------------------------------*/
5656 
5657 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5658 {
5659     flag aSign, bSign;
5660 
5661     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5662               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5664               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5665        ) {
5666         if (    float128_is_signaling_nan( a )
5667              || float128_is_signaling_nan( b ) ) {
5668             float_raise( float_flag_invalid STATUS_VAR);
5669         }
5670         return 0;
5671     }
5672     aSign = extractFloat128Sign( a );
5673     bSign = extractFloat128Sign( b );
5674     if ( aSign != bSign ) {
5675         return
5676                aSign
5677             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5678                  != 0 );
5679     }
5680     return
5681           aSign ? lt128( b.high, b.low, a.high, a.low )
5682         : lt128( a.high, a.low, b.high, b.low );
5683 
5684 }
5685 
5686 #endif
5687 
5688 /* misc functions */
5689 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5690 {
5691     return int64_to_float32(a STATUS_VAR);
5692 }
5693 
5694 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5695 {
5696     return int64_to_float64(a STATUS_VAR);
5697 }
5698 
5699 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5700 {
5701     int64_t v;
5702     unsigned int res;
5703 
5704     v = float32_to_int64(a STATUS_VAR);
5705     if (v < 0) {
5706         res = 0;
5707         float_raise( float_flag_invalid STATUS_VAR);
5708     } else if (v > 0xffffffff) {
5709         res = 0xffffffff;
5710         float_raise( float_flag_invalid STATUS_VAR);
5711     } else {
5712         res = v;
5713     }
5714     return res;
5715 }
5716 
5717 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5718 {
5719     int64_t v;
5720     unsigned int res;
5721 
5722     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5723     if (v < 0) {
5724         res = 0;
5725         float_raise( float_flag_invalid STATUS_VAR);
5726     } else if (v > 0xffffffff) {
5727         res = 0xffffffff;
5728         float_raise( float_flag_invalid STATUS_VAR);
5729     } else {
5730         res = v;
5731     }
5732     return res;
5733 }
5734 
5735 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5736 {
5737     int64_t v;
5738     unsigned int res;
5739 
5740     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5741     if (v < 0) {
5742         res = 0;
5743         float_raise( float_flag_invalid STATUS_VAR);
5744     } else if (v > 0xffff) {
5745         res = 0xffff;
5746         float_raise( float_flag_invalid STATUS_VAR);
5747     } else {
5748         res = v;
5749     }
5750     return res;
5751 }
5752 
5753 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5754 {
5755     int64_t v;
5756     unsigned int res;
5757 
5758     v = float64_to_int64(a STATUS_VAR);
5759     if (v < 0) {
5760         res = 0;
5761         float_raise( float_flag_invalid STATUS_VAR);
5762     } else if (v > 0xffffffff) {
5763         res = 0xffffffff;
5764         float_raise( float_flag_invalid STATUS_VAR);
5765     } else {
5766         res = v;
5767     }
5768     return res;
5769 }
5770 
5771 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5772 {
5773     int64_t v;
5774     unsigned int res;
5775 
5776     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5777     if (v < 0) {
5778         res = 0;
5779         float_raise( float_flag_invalid STATUS_VAR);
5780     } else if (v > 0xffffffff) {
5781         res = 0xffffffff;
5782         float_raise( float_flag_invalid STATUS_VAR);
5783     } else {
5784         res = v;
5785     }
5786     return res;
5787 }
5788 
5789 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5790 {
5791     int64_t v;
5792     unsigned int res;
5793 
5794     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5795     if (v < 0) {
5796         res = 0;
5797         float_raise( float_flag_invalid STATUS_VAR);
5798     } else if (v > 0xffff) {
5799         res = 0xffff;
5800         float_raise( float_flag_invalid STATUS_VAR);
5801     } else {
5802         res = v;
5803     }
5804     return res;
5805 }
5806 
5807 /* FIXME: This looks broken.  */
5808 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5809 {
5810     int64_t v;
5811 
5812     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5813     v += float64_val(a);
5814     v = float64_to_int64(make_float64(v) STATUS_VAR);
5815 
5816     return v - INT64_MIN;
5817 }
5818 
5819 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5820 {
5821     int64_t v;
5822 
5823     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5824     v += float64_val(a);
5825     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5826 
5827     return v - INT64_MIN;
5828 }
5829 
5830 #define COMPARE(s, nan_exp)                                                  \
5831 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5832                                       int is_quiet STATUS_PARAM )            \
5833 {                                                                            \
5834     flag aSign, bSign;                                                       \
5835     bits ## s av, bv;                                                        \
5836                                                                              \
5837     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5838          extractFloat ## s ## Frac( a ) ) ||                                 \
5839         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5840           extractFloat ## s ## Frac( b ) )) {                                \
5841         if (!is_quiet ||                                                     \
5842             float ## s ## _is_signaling_nan( a ) ||                          \
5843             float ## s ## _is_signaling_nan( b ) ) {                         \
5844             float_raise( float_flag_invalid STATUS_VAR);                     \
5845         }                                                                    \
5846         return float_relation_unordered;                                     \
5847     }                                                                        \
5848     aSign = extractFloat ## s ## Sign( a );                                  \
5849     bSign = extractFloat ## s ## Sign( b );                                  \
5850     av = float ## s ## _val(a);                                              \
5851     bv = float ## s ## _val(b);                                              \
5852     if ( aSign != bSign ) {                                                  \
5853         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5854             /* zero case */                                                  \
5855             return float_relation_equal;                                     \
5856         } else {                                                             \
5857             return 1 - (2 * aSign);                                          \
5858         }                                                                    \
5859     } else {                                                                 \
5860         if (av == bv) {                                                      \
5861             return float_relation_equal;                                     \
5862         } else {                                                             \
5863             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5864         }                                                                    \
5865     }                                                                        \
5866 }                                                                            \
5867                                                                              \
5868 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5869 {                                                                            \
5870     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5871 }                                                                            \
5872                                                                              \
5873 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5874 {                                                                            \
5875     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5876 }
5877 
5878 COMPARE(32, 0xff)
5879 COMPARE(64, 0x7ff)
5880 
5881 INLINE int float128_compare_internal( float128 a, float128 b,
5882                                       int is_quiet STATUS_PARAM )
5883 {
5884     flag aSign, bSign;
5885 
5886     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5887           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5888         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5889           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5890         if (!is_quiet ||
5891             float128_is_signaling_nan( a ) ||
5892             float128_is_signaling_nan( b ) ) {
5893             float_raise( float_flag_invalid STATUS_VAR);
5894         }
5895         return float_relation_unordered;
5896     }
5897     aSign = extractFloat128Sign( a );
5898     bSign = extractFloat128Sign( b );
5899     if ( aSign != bSign ) {
5900         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5901             /* zero case */
5902             return float_relation_equal;
5903         } else {
5904             return 1 - (2 * aSign);
5905         }
5906     } else {
5907         if (a.low == b.low && a.high == b.high) {
5908             return float_relation_equal;
5909         } else {
5910             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5911         }
5912     }
5913 }
5914 
5915 int float128_compare( float128 a, float128 b STATUS_PARAM )
5916 {
5917     return float128_compare_internal(a, b, 0 STATUS_VAR);
5918 }
5919 
5920 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5921 {
5922     return float128_compare_internal(a, b, 1 STATUS_VAR);
5923 }
5924 
5925 /* Multiply A by 2 raised to the power N.  */
5926 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5927 {
5928     flag aSign;
5929     int16 aExp;
5930     bits32 aSig;
5931 
5932     aSig = extractFloat32Frac( a );
5933     aExp = extractFloat32Exp( a );
5934     aSign = extractFloat32Sign( a );
5935 
5936     if ( aExp == 0xFF ) {
5937         return a;
5938     }
5939     if ( aExp != 0 )
5940         aSig |= 0x00800000;
5941     else if ( aSig == 0 )
5942         return a;
5943 
5944     aExp += n - 1;
5945     aSig <<= 7;
5946     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5947 }
5948 
5949 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5950 {
5951     flag aSign;
5952     int16 aExp;
5953     bits64 aSig;
5954 
5955     aSig = extractFloat64Frac( a );
5956     aExp = extractFloat64Exp( a );
5957     aSign = extractFloat64Sign( a );
5958 
5959     if ( aExp == 0x7FF ) {
5960         return a;
5961     }
5962     if ( aExp != 0 )
5963         aSig |= LIT64( 0x0010000000000000 );
5964     else if ( aSig == 0 )
5965         return a;
5966 
5967     aExp += n - 1;
5968     aSig <<= 10;
5969     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5970 }
5971 
5972 #ifdef FLOATX80
5973 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5974 {
5975     flag aSign;
5976     int16 aExp;
5977     bits64 aSig;
5978 
5979     aSig = extractFloatx80Frac( a );
5980     aExp = extractFloatx80Exp( a );
5981     aSign = extractFloatx80Sign( a );
5982 
5983     if ( aExp == 0x7FF ) {
5984         return a;
5985     }
5986     if (aExp == 0 && aSig == 0)
5987         return a;
5988 
5989     aExp += n;
5990     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5991                                           aSign, aExp, aSig, 0 STATUS_VAR );
5992 }
5993 #endif
5994 
5995 #ifdef FLOAT128
5996 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5997 {
5998     flag aSign;
5999     int32 aExp;
6000     bits64 aSig0, aSig1;
6001 
6002     aSig1 = extractFloat128Frac1( a );
6003     aSig0 = extractFloat128Frac0( a );
6004     aExp = extractFloat128Exp( a );
6005     aSign = extractFloat128Sign( a );
6006     if ( aExp == 0x7FFF ) {
6007         return a;
6008     }
6009     if ( aExp != 0 )
6010         aSig0 |= LIT64( 0x0001000000000000 );
6011     else if ( aSig0 == 0 && aSig1 == 0 )
6012         return a;
6013 
6014     aExp += n - 1;
6015     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6016                                           STATUS_VAR );
6017 
6018 }
6019 #endif
6020