xref: /openbmc/linux/arch/arm/nwfpe/softfloat.c (revision 50a23e6e)
1 /*
2 ===============================================================================
3 
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
6 
7 Written by John R. Hauser.  This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704.  Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980.  The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14 is available through the web page
15 http://www.jhauser.us/arithmetic/SoftFloat-2b/SoftFloat-source.txt
16 
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22 
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
27 
28 ===============================================================================
29 */
30 
31 #include <asm/div64.h>
32 
33 #include "fpa11.h"
34 //#include "milieu.h"
35 //#include "softfloat.h"
36 
37 /*
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations.  (Can be specialized to target if
41 desired.)
42 -------------------------------------------------------------------------------
43 */
44 #include "softfloat-macros"
45 
46 /*
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine:  (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output.  These details are target-
53 specific.
54 -------------------------------------------------------------------------------
55 */
56 #include "softfloat-specialize"
57 
58 /*
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input.  If `zSign' is nonzero, the input is negated before being converted
63 to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer.  If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
69 */
roundAndPackInt32(struct roundingData * roundData,flag zSign,bits64 absZ)70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
71 {
72     int8 roundingMode;
73     flag roundNearestEven;
74     int8 roundIncrement, roundBits;
75     int32 z;
76 
77     roundingMode = roundData->mode;
78     roundNearestEven = ( roundingMode == float_round_nearest_even );
79     roundIncrement = 0x40;
80     if ( ! roundNearestEven ) {
81         if ( roundingMode == float_round_to_zero ) {
82             roundIncrement = 0;
83         }
84         else {
85             roundIncrement = 0x7F;
86             if ( zSign ) {
87                 if ( roundingMode == float_round_up ) roundIncrement = 0;
88             }
89             else {
90                 if ( roundingMode == float_round_down ) roundIncrement = 0;
91             }
92         }
93     }
94     roundBits = absZ & 0x7F;
95     absZ = ( absZ + roundIncrement )>>7;
96     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
97     z = absZ;
98     if ( zSign ) z = - z;
99     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100         roundData->exception |= float_flag_invalid;
101         return zSign ? 0x80000000 : 0x7FFFFFFF;
102     }
103     if ( roundBits ) roundData->exception |= float_flag_inexact;
104     return z;
105 
106 }
107 
108 /*
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
112 */
extractFloat32Frac(float32 a)113 INLINE bits32 extractFloat32Frac( float32 a )
114 {
115 
116     return a & 0x007FFFFF;
117 
118 }
119 
120 /*
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
124 */
extractFloat32Exp(float32 a)125 INLINE int16 extractFloat32Exp( float32 a )
126 {
127 
128     return ( a>>23 ) & 0xFF;
129 
130 }
131 
132 /*
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
136 */
137 #if 0	/* in softfloat.h */
138 INLINE flag extractFloat32Sign( float32 a )
139 {
140 
141     return a>>31;
142 
143 }
144 #endif
145 
146 /*
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
149 by the denormalized significand `aSig'.  The normalized exponent and
150 significand are stored at the locations pointed to by `zExpPtr' and
151 `zSigPtr', respectively.
152 -------------------------------------------------------------------------------
153 */
154 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)155  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
156 {
157     int8 shiftCount;
158 
159     shiftCount = countLeadingZeros32( aSig ) - 8;
160     *zSigPtr = aSig<<shiftCount;
161     *zExpPtr = 1 - shiftCount;
162 
163 }
164 
165 /*
166 -------------------------------------------------------------------------------
167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
168 single-precision floating-point value, returning the result.  After being
169 shifted into the proper positions, the three fields are simply added
170 together to form the result.  This means that any integer portion of `zSig'
171 will be added into the exponent.  Since a properly normalized significand
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
173 than the desired result exponent whenever `zSig' is a complete, normalized
174 significand.
175 -------------------------------------------------------------------------------
176 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)177 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
178 {
179 #if 0
180    float32 f;
181    __asm__("@ packFloat32				\n\
182    	    mov %0, %1, asl #31				\n\
183    	    orr %0, %2, asl #23				\n\
184    	    orr %0, %3"
185    	    : /* no outputs */
186    	    : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
187    	    : "cc");
188    return f;
189 #else
190     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
191 #endif
192 }
193 
194 /*
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input.  Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
201 exactly.  If the abstract value is too large, however, the overflow and
202 inexact exceptions are raised and an infinity or maximal finite value is
203 returned.  If the abstract value is too small, the input value is rounded to
204 a subnormal number, and the underflow and inexact exceptions are raised if
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207     The input significand `zSig' has its binary point between bits 30
208 and 29, which is 7 bits to the left of the usual location.  This shifted
209 significand must be normalized or smaller.  If `zSig' is not normalized,
210 `zExp' must be 0; in that case, the result returned is a subnormal number,
211 and it must not require rounding.  In the usual case that `zSig' is
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
213 The handling of underflow and overflow follows the IEC/IEEE Standard for
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
216 */
roundAndPackFloat32(struct roundingData * roundData,flag zSign,int16 zExp,bits32 zSig)217 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
218 {
219     int8 roundingMode;
220     flag roundNearestEven;
221     int8 roundIncrement, roundBits;
222     flag isTiny;
223 
224     roundingMode = roundData->mode;
225     roundNearestEven = ( roundingMode == float_round_nearest_even );
226     roundIncrement = 0x40;
227     if ( ! roundNearestEven ) {
228         if ( roundingMode == float_round_to_zero ) {
229             roundIncrement = 0;
230         }
231         else {
232             roundIncrement = 0x7F;
233             if ( zSign ) {
234                 if ( roundingMode == float_round_up ) roundIncrement = 0;
235             }
236             else {
237                 if ( roundingMode == float_round_down ) roundIncrement = 0;
238             }
239         }
240     }
241     roundBits = zSig & 0x7F;
242     if ( 0xFD <= (bits16) zExp ) {
243         if (    ( 0xFD < zExp )
244              || (    ( zExp == 0xFD )
245                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
246            ) {
247             roundData->exception |= float_flag_overflow | float_flag_inexact;
248             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
249         }
250         if ( zExp < 0 ) {
251             isTiny =
252                    ( float_detect_tininess == float_tininess_before_rounding )
253                 || ( zExp < -1 )
254                 || ( zSig + roundIncrement < 0x80000000 );
255             shift32RightJamming( zSig, - zExp, &zSig );
256             zExp = 0;
257             roundBits = zSig & 0x7F;
258             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
259         }
260     }
261     if ( roundBits ) roundData->exception |= float_flag_inexact;
262     zSig = ( zSig + roundIncrement )>>7;
263     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
264     if ( zSig == 0 ) zExp = 0;
265     return packFloat32( zSign, zExp, zSig );
266 
267 }
268 
269 /*
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input.  This routine is just like
274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
275 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
276 point exponent.
277 -------------------------------------------------------------------------------
278 */
279 static float32
normalizeRoundAndPackFloat32(struct roundingData * roundData,flag zSign,int16 zExp,bits32 zSig)280  normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
281 {
282     int8 shiftCount;
283 
284     shiftCount = countLeadingZeros32( zSig ) - 1;
285     return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
286 
287 }
288 
289 /*
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
293 */
extractFloat64Frac(float64 a)294 INLINE bits64 extractFloat64Frac( float64 a )
295 {
296 
297     return a & LIT64( 0x000FFFFFFFFFFFFF );
298 
299 }
300 
301 /*
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
305 */
extractFloat64Exp(float64 a)306 INLINE int16 extractFloat64Exp( float64 a )
307 {
308 
309     return ( a>>52 ) & 0x7FF;
310 
311 }
312 
313 /*
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
317 */
318 #if 0	/* in softfloat.h */
319 INLINE flag extractFloat64Sign( float64 a )
320 {
321 
322     return a>>63;
323 
324 }
325 #endif
326 
327 /*
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
330 by the denormalized significand `aSig'.  The normalized exponent and
331 significand are stored at the locations pointed to by `zExpPtr' and
332 `zSigPtr', respectively.
333 -------------------------------------------------------------------------------
334 */
335 static void
normalizeFloat64Subnormal(bits64 aSig,int16 * zExpPtr,bits64 * zSigPtr)336  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
337 {
338     int8 shiftCount;
339 
340     shiftCount = countLeadingZeros64( aSig ) - 11;
341     *zSigPtr = aSig<<shiftCount;
342     *zExpPtr = 1 - shiftCount;
343 
344 }
345 
346 /*
347 -------------------------------------------------------------------------------
348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
349 double-precision floating-point value, returning the result.  After being
350 shifted into the proper positions, the three fields are simply added
351 together to form the result.  This means that any integer portion of `zSig'
352 will be added into the exponent.  Since a properly normalized significand
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
354 than the desired result exponent whenever `zSig' is a complete, normalized
355 significand.
356 -------------------------------------------------------------------------------
357 */
packFloat64(flag zSign,int16 zExp,bits64 zSig)358 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
359 {
360 
361     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
362 
363 }
364 
365 /*
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input.  Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
372 exactly.  If the abstract value is too large, however, the overflow and
373 inexact exceptions are raised and an infinity or maximal finite value is
374 returned.  If the abstract value is too small, the input value is rounded to
375 a subnormal number, and the underflow and inexact exceptions are raised if
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378     The input significand `zSig' has its binary point between bits 62
379 and 61, which is 10 bits to the left of the usual location.  This shifted
380 significand must be normalized or smaller.  If `zSig' is not normalized,
381 `zExp' must be 0; in that case, the result returned is a subnormal number,
382 and it must not require rounding.  In the usual case that `zSig' is
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
384 The handling of underflow and overflow follows the IEC/IEEE Standard for
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
387 */
roundAndPackFloat64(struct roundingData * roundData,flag zSign,int16 zExp,bits64 zSig)388 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
389 {
390     int8 roundingMode;
391     flag roundNearestEven;
392     int16 roundIncrement, roundBits;
393     flag isTiny;
394 
395     roundingMode = roundData->mode;
396     roundNearestEven = ( roundingMode == float_round_nearest_even );
397     roundIncrement = 0x200;
398     if ( ! roundNearestEven ) {
399         if ( roundingMode == float_round_to_zero ) {
400             roundIncrement = 0;
401         }
402         else {
403             roundIncrement = 0x3FF;
404             if ( zSign ) {
405                 if ( roundingMode == float_round_up ) roundIncrement = 0;
406             }
407             else {
408                 if ( roundingMode == float_round_down ) roundIncrement = 0;
409             }
410         }
411     }
412     roundBits = zSig & 0x3FF;
413     if ( 0x7FD <= (bits16) zExp ) {
414         if (    ( 0x7FD < zExp )
415              || (    ( zExp == 0x7FD )
416                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
417            ) {
418             //register int lr = __builtin_return_address(0);
419             //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
420             roundData->exception |= float_flag_overflow | float_flag_inexact;
421             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
422         }
423         if ( zExp < 0 ) {
424             isTiny =
425                    ( float_detect_tininess == float_tininess_before_rounding )
426                 || ( zExp < -1 )
427                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
428             shift64RightJamming( zSig, - zExp, &zSig );
429             zExp = 0;
430             roundBits = zSig & 0x3FF;
431             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
432         }
433     }
434     if ( roundBits ) roundData->exception |= float_flag_inexact;
435     zSig = ( zSig + roundIncrement )>>10;
436     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
437     if ( zSig == 0 ) zExp = 0;
438     return packFloat64( zSign, zExp, zSig );
439 
440 }
441 
442 /*
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input.  This routine is just like
447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
448 any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
449 point exponent.
450 -------------------------------------------------------------------------------
451 */
452 static float64
normalizeRoundAndPackFloat64(struct roundingData * roundData,flag zSign,int16 zExp,bits64 zSig)453  normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
454 {
455     int8 shiftCount;
456 
457     shiftCount = countLeadingZeros64( zSig ) - 1;
458     return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
459 
460 }
461 
462 #ifdef FLOATX80
463 
464 /*
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
467 value `a'.
468 -------------------------------------------------------------------------------
469 */
extractFloatx80Frac(floatx80 a)470 INLINE bits64 extractFloatx80Frac( floatx80 a )
471 {
472 
473     return a.low;
474 
475 }
476 
477 /*
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
480 value `a'.
481 -------------------------------------------------------------------------------
482 */
extractFloatx80Exp(floatx80 a)483 INLINE int32 extractFloatx80Exp( floatx80 a )
484 {
485 
486     return a.high & 0x7FFF;
487 
488 }
489 
490 /*
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
493 `a'.
494 -------------------------------------------------------------------------------
495 */
extractFloatx80Sign(floatx80 a)496 INLINE flag extractFloatx80Sign( floatx80 a )
497 {
498 
499     return a.high>>15;
500 
501 }
502 
503 /*
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
506 represented by the denormalized significand `aSig'.  The normalized exponent
507 and significand are stored at the locations pointed to by `zExpPtr' and
508 `zSigPtr', respectively.
509 -------------------------------------------------------------------------------
510 */
511 static void
normalizeFloatx80Subnormal(bits64 aSig,int32 * zExpPtr,bits64 * zSigPtr)512  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
513 {
514     int8 shiftCount;
515 
516     shiftCount = countLeadingZeros64( aSig );
517     *zSigPtr = aSig<<shiftCount;
518     *zExpPtr = 1 - shiftCount;
519 
520 }
521 
522 /*
523 -------------------------------------------------------------------------------
524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
527 */
packFloatx80(flag zSign,int32 zExp,bits64 zSig)528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
529 {
530     floatx80 z;
531 
532     z.low = zSig;
533     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
534     z.__padding = 0;
535     return z;
536 
537 }
538 
539 /*
540 -------------------------------------------------------------------------------
541 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
542 and extended significand formed by the concatenation of `zSig0' and `zSig1',
543 and returns the proper extended double-precision floating-point value
544 corresponding to the abstract input.  Ordinarily, the abstract value is
545 rounded and packed into the extended double-precision format, with the
546 inexact exception raised if the abstract input cannot be represented
547 exactly.  If the abstract value is too large, however, the overflow and
548 inexact exceptions are raised and an infinity or maximal finite value is
549 returned.  If the abstract value is too small, the input value is rounded to
550 a subnormal number, and the underflow and inexact exceptions are raised if
551 the abstract input cannot be represented exactly as a subnormal extended
552 double-precision floating-point number.
553     If `roundingPrecision' is 32 or 64, the result is rounded to the same
554 number of bits as single or double precision, respectively.  Otherwise, the
555 result is rounded to the full precision of the extended double-precision
556 format.
557     The input significand must be normalized or smaller.  If the input
558 significand is not normalized, `zExp' must be 0; in that case, the result
559 returned is a subnormal number, and it must not require rounding.  The
560 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
561 Floating-point Arithmetic.
562 -------------------------------------------------------------------------------
563 */
564 static floatx80
roundAndPackFloatx80(struct roundingData * roundData,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)565  roundAndPackFloatx80(
566      struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
567  )
568 {
569     int8 roundingMode, roundingPrecision;
570     flag roundNearestEven, increment, isTiny;
571     int64 roundIncrement, roundMask, roundBits;
572 
573     roundingMode = roundData->mode;
574     roundingPrecision = roundData->precision;
575     roundNearestEven = ( roundingMode == float_round_nearest_even );
576     if ( roundingPrecision == 80 ) goto precision80;
577     if ( roundingPrecision == 64 ) {
578         roundIncrement = LIT64( 0x0000000000000400 );
579         roundMask = LIT64( 0x00000000000007FF );
580     }
581     else if ( roundingPrecision == 32 ) {
582         roundIncrement = LIT64( 0x0000008000000000 );
583         roundMask = LIT64( 0x000000FFFFFFFFFF );
584     }
585     else {
586         goto precision80;
587     }
588     zSig0 |= ( zSig1 != 0 );
589     if ( ! roundNearestEven ) {
590         if ( roundingMode == float_round_to_zero ) {
591             roundIncrement = 0;
592         }
593         else {
594             roundIncrement = roundMask;
595             if ( zSign ) {
596                 if ( roundingMode == float_round_up ) roundIncrement = 0;
597             }
598             else {
599                 if ( roundingMode == float_round_down ) roundIncrement = 0;
600             }
601         }
602     }
603     roundBits = zSig0 & roundMask;
604     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
605         if (    ( 0x7FFE < zExp )
606              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
607            ) {
608             goto overflow;
609         }
610         if ( zExp <= 0 ) {
611             isTiny =
612                    ( float_detect_tininess == float_tininess_before_rounding )
613                 || ( zExp < 0 )
614                 || ( zSig0 <= zSig0 + roundIncrement );
615             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
616             zExp = 0;
617             roundBits = zSig0 & roundMask;
618             if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
619             if ( roundBits ) roundData->exception |= float_flag_inexact;
620             zSig0 += roundIncrement;
621             if ( (sbits64) zSig0 < 0 ) zExp = 1;
622             roundIncrement = roundMask + 1;
623             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
624                 roundMask |= roundIncrement;
625             }
626             zSig0 &= ~ roundMask;
627             return packFloatx80( zSign, zExp, zSig0 );
628         }
629     }
630     if ( roundBits ) roundData->exception |= float_flag_inexact;
631     zSig0 += roundIncrement;
632     if ( zSig0 < roundIncrement ) {
633         ++zExp;
634         zSig0 = LIT64( 0x8000000000000000 );
635     }
636     roundIncrement = roundMask + 1;
637     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
638         roundMask |= roundIncrement;
639     }
640     zSig0 &= ~ roundMask;
641     if ( zSig0 == 0 ) zExp = 0;
642     return packFloatx80( zSign, zExp, zSig0 );
643  precision80:
644     increment = ( (sbits64) zSig1 < 0 );
645     if ( ! roundNearestEven ) {
646         if ( roundingMode == float_round_to_zero ) {
647             increment = 0;
648         }
649         else {
650             if ( zSign ) {
651                 increment = ( roundingMode == float_round_down ) && zSig1;
652             }
653             else {
654                 increment = ( roundingMode == float_round_up ) && zSig1;
655             }
656         }
657     }
658     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
659         if (    ( 0x7FFE < zExp )
660              || (    ( zExp == 0x7FFE )
661                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
662                   && increment
663                 )
664            ) {
665             roundMask = 0;
666  overflow:
667             roundData->exception |= float_flag_overflow | float_flag_inexact;
668             if (    ( roundingMode == float_round_to_zero )
669                  || ( zSign && ( roundingMode == float_round_up ) )
670                  || ( ! zSign && ( roundingMode == float_round_down ) )
671                ) {
672                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
673             }
674             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
675         }
676         if ( zExp <= 0 ) {
677             isTiny =
678                    ( float_detect_tininess == float_tininess_before_rounding )
679                 || ( zExp < 0 )
680                 || ! increment
681                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
682             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
683             zExp = 0;
684             if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
685             if ( zSig1 ) roundData->exception |= float_flag_inexact;
686             if ( roundNearestEven ) {
687                 increment = ( (sbits64) zSig1 < 0 );
688             }
689             else {
690                 if ( zSign ) {
691                     increment = ( roundingMode == float_round_down ) && zSig1;
692                 }
693                 else {
694                     increment = ( roundingMode == float_round_up ) && zSig1;
695                 }
696             }
697             if ( increment ) {
698                 ++zSig0;
699                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
700                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
701             }
702             return packFloatx80( zSign, zExp, zSig0 );
703         }
704     }
705     if ( zSig1 ) roundData->exception |= float_flag_inexact;
706     if ( increment ) {
707         ++zSig0;
708         if ( zSig0 == 0 ) {
709             ++zExp;
710             zSig0 = LIT64( 0x8000000000000000 );
711         }
712         else {
713             zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
714         }
715     }
716     else {
717         if ( zSig0 == 0 ) zExp = 0;
718     }
719 
720     return packFloatx80( zSign, zExp, zSig0 );
721 }
722 
723 /*
724 -------------------------------------------------------------------------------
725 Takes an abstract floating-point value having sign `zSign', exponent
726 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
727 and returns the proper extended double-precision floating-point value
728 corresponding to the abstract input.  This routine is just like
729 `roundAndPackFloatx80' except that the input significand does not have to be
730 normalized.
731 -------------------------------------------------------------------------------
732 */
733 static floatx80
normalizeRoundAndPackFloatx80(struct roundingData * roundData,flag zSign,int32 zExp,bits64 zSig0,bits64 zSig1)734  normalizeRoundAndPackFloatx80(
735      struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
736  )
737 {
738     int8 shiftCount;
739 
740     if ( zSig0 == 0 ) {
741         zSig0 = zSig1;
742         zSig1 = 0;
743         zExp -= 64;
744     }
745     shiftCount = countLeadingZeros64( zSig0 );
746     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
747     zExp -= shiftCount;
748     return
749         roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
750 
751 }
752 
753 #endif
754 
755 /*
756 -------------------------------------------------------------------------------
757 Returns the result of converting the 32-bit two's complement integer `a' to
758 the single-precision floating-point format.  The conversion is performed
759 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
760 -------------------------------------------------------------------------------
761 */
int32_to_float32(struct roundingData * roundData,int32 a)762 float32 int32_to_float32(struct roundingData *roundData, int32 a)
763 {
764     flag zSign;
765 
766     if ( a == 0 ) return 0;
767     if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
768     zSign = ( a < 0 );
769     return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
770 
771 }
772 
773 /*
774 -------------------------------------------------------------------------------
775 Returns the result of converting the 32-bit two's complement integer `a' to
776 the double-precision floating-point format.  The conversion is performed
777 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
778 -------------------------------------------------------------------------------
779 */
int32_to_float64(int32 a)780 float64 int32_to_float64( int32 a )
781 {
782     flag aSign;
783     uint32 absA;
784     int8 shiftCount;
785     bits64 zSig;
786 
787     if ( a == 0 ) return 0;
788     aSign = ( a < 0 );
789     absA = aSign ? - a : a;
790     shiftCount = countLeadingZeros32( absA ) + 21;
791     zSig = absA;
792     return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
793 
794 }
795 
796 #ifdef FLOATX80
797 
798 /*
799 -------------------------------------------------------------------------------
800 Returns the result of converting the 32-bit two's complement integer `a'
801 to the extended double-precision floating-point format.  The conversion
802 is performed according to the IEC/IEEE Standard for Binary Floating-point
803 Arithmetic.
804 -------------------------------------------------------------------------------
805 */
int32_to_floatx80(int32 a)806 floatx80 int32_to_floatx80( int32 a )
807 {
808     flag zSign;
809     uint32 absA;
810     int8 shiftCount;
811     bits64 zSig;
812 
813     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
814     zSign = ( a < 0 );
815     absA = zSign ? - a : a;
816     shiftCount = countLeadingZeros32( absA ) + 32;
817     zSig = absA;
818     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
819 
820 }
821 
822 #endif
823 
824 /*
825 -------------------------------------------------------------------------------
826 Returns the result of converting the single-precision floating-point value
827 `a' to the 32-bit two's complement integer format.  The conversion is
828 performed according to the IEC/IEEE Standard for Binary Floating-point
829 Arithmetic---which means in particular that the conversion is rounded
830 according to the current rounding mode.  If `a' is a NaN, the largest
831 positive integer is returned.  Otherwise, if the conversion overflows, the
832 largest integer with the same sign as `a' is returned.
833 -------------------------------------------------------------------------------
834 */
float32_to_int32(struct roundingData * roundData,float32 a)835 int32 float32_to_int32( struct roundingData *roundData, float32 a )
836 {
837     flag aSign;
838     int16 aExp, shiftCount;
839     bits32 aSig;
840     bits64 zSig;
841 
842     aSig = extractFloat32Frac( a );
843     aExp = extractFloat32Exp( a );
844     aSign = extractFloat32Sign( a );
845     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
846     if ( aExp ) aSig |= 0x00800000;
847     shiftCount = 0xAF - aExp;
848     zSig = aSig;
849     zSig <<= 32;
850     if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
851     return roundAndPackInt32( roundData, aSign, zSig );
852 
853 }
854 
855 /*
856 -------------------------------------------------------------------------------
857 Returns the result of converting the single-precision floating-point value
858 `a' to the 32-bit two's complement integer format.  The conversion is
859 performed according to the IEC/IEEE Standard for Binary Floating-point
860 Arithmetic, except that the conversion is always rounded toward zero.  If
861 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
862 conversion overflows, the largest integer with the same sign as `a' is
863 returned.
864 -------------------------------------------------------------------------------
865 */
float32_to_int32_round_to_zero(float32 a)866 int32 float32_to_int32_round_to_zero( float32 a )
867 {
868     flag aSign;
869     int16 aExp, shiftCount;
870     bits32 aSig;
871     int32 z;
872 
873     aSig = extractFloat32Frac( a );
874     aExp = extractFloat32Exp( a );
875     aSign = extractFloat32Sign( a );
876     shiftCount = aExp - 0x9E;
877     if ( 0 <= shiftCount ) {
878         if ( a == 0xCF000000 ) return 0x80000000;
879         float_raise( float_flag_invalid );
880         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
881         return 0x80000000;
882     }
883     else if ( aExp <= 0x7E ) {
884         if ( aExp | aSig ) float_raise( float_flag_inexact );
885         return 0;
886     }
887     aSig = ( aSig | 0x00800000 )<<8;
888     z = aSig>>( - shiftCount );
889     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
890         float_raise( float_flag_inexact );
891     }
892     return aSign ? - z : z;
893 
894 }
895 
896 /*
897 -------------------------------------------------------------------------------
898 Returns the result of converting the single-precision floating-point value
899 `a' to the double-precision floating-point format.  The conversion is
900 performed according to the IEC/IEEE Standard for Binary Floating-point
901 Arithmetic.
902 -------------------------------------------------------------------------------
903 */
float32_to_float64(float32 a)904 float64 float32_to_float64( float32 a )
905 {
906     flag aSign;
907     int16 aExp;
908     bits32 aSig;
909 
910     aSig = extractFloat32Frac( a );
911     aExp = extractFloat32Exp( a );
912     aSign = extractFloat32Sign( a );
913     if ( aExp == 0xFF ) {
914         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
915         return packFloat64( aSign, 0x7FF, 0 );
916     }
917     if ( aExp == 0 ) {
918         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
919         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
920         --aExp;
921     }
922     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
923 
924 }
925 
926 #ifdef FLOATX80
927 
928 /*
929 -------------------------------------------------------------------------------
930 Returns the result of converting the single-precision floating-point value
931 `a' to the extended double-precision floating-point format.  The conversion
932 is performed according to the IEC/IEEE Standard for Binary Floating-point
933 Arithmetic.
934 -------------------------------------------------------------------------------
935 */
float32_to_floatx80(float32 a)936 floatx80 float32_to_floatx80( float32 a )
937 {
938     flag aSign;
939     int16 aExp;
940     bits32 aSig;
941 
942     aSig = extractFloat32Frac( a );
943     aExp = extractFloat32Exp( a );
944     aSign = extractFloat32Sign( a );
945     if ( aExp == 0xFF ) {
946         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
947         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
948     }
949     if ( aExp == 0 ) {
950         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
951         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
952     }
953     aSig |= 0x00800000;
954     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
955 
956 }
957 
958 #endif
959 
960 /*
961 -------------------------------------------------------------------------------
962 Rounds the single-precision floating-point value `a' to an integer, and
963 returns the result as a single-precision floating-point value.  The
964 operation is performed according to the IEC/IEEE Standard for Binary
965 Floating-point Arithmetic.
966 -------------------------------------------------------------------------------
967 */
float32_round_to_int(struct roundingData * roundData,float32 a)968 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
969 {
970     flag aSign;
971     int16 aExp;
972     bits32 lastBitMask, roundBitsMask;
973     int8 roundingMode;
974     float32 z;
975 
976     aExp = extractFloat32Exp( a );
977     if ( 0x96 <= aExp ) {
978         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
979             return propagateFloat32NaN( a, a );
980         }
981         return a;
982     }
983     roundingMode = roundData->mode;
984     if ( aExp <= 0x7E ) {
985         if ( (bits32) ( a<<1 ) == 0 ) return a;
986         roundData->exception |= float_flag_inexact;
987         aSign = extractFloat32Sign( a );
988         switch ( roundingMode ) {
989          case float_round_nearest_even:
990             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
991                 return packFloat32( aSign, 0x7F, 0 );
992             }
993             break;
994          case float_round_down:
995             return aSign ? 0xBF800000 : 0;
996          case float_round_up:
997             return aSign ? 0x80000000 : 0x3F800000;
998         }
999         return packFloat32( aSign, 0, 0 );
1000     }
1001     lastBitMask = 1;
1002     lastBitMask <<= 0x96 - aExp;
1003     roundBitsMask = lastBitMask - 1;
1004     z = a;
1005     if ( roundingMode == float_round_nearest_even ) {
1006         z += lastBitMask>>1;
1007         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1008     }
1009     else if ( roundingMode != float_round_to_zero ) {
1010         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1011             z += roundBitsMask;
1012         }
1013     }
1014     z &= ~ roundBitsMask;
1015     if ( z != a ) roundData->exception |= float_flag_inexact;
1016     return z;
1017 
1018 }
1019 
1020 /*
1021 -------------------------------------------------------------------------------
1022 Returns the result of adding the absolute values of the single-precision
1023 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1024 before being returned.  `zSign' is ignored if the result is a NaN.  The
1025 addition is performed according to the IEC/IEEE Standard for Binary
1026 Floating-point Arithmetic.
1027 -------------------------------------------------------------------------------
1028 */
addFloat32Sigs(struct roundingData * roundData,float32 a,float32 b,flag zSign)1029 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1030 {
1031     int16 aExp, bExp, zExp;
1032     bits32 aSig, bSig, zSig;
1033     int16 expDiff;
1034 
1035     aSig = extractFloat32Frac( a );
1036     aExp = extractFloat32Exp( a );
1037     bSig = extractFloat32Frac( b );
1038     bExp = extractFloat32Exp( b );
1039     expDiff = aExp - bExp;
1040     aSig <<= 6;
1041     bSig <<= 6;
1042     if ( 0 < expDiff ) {
1043         if ( aExp == 0xFF ) {
1044             if ( aSig ) return propagateFloat32NaN( a, b );
1045             return a;
1046         }
1047         if ( bExp == 0 ) {
1048             --expDiff;
1049         }
1050         else {
1051             bSig |= 0x20000000;
1052         }
1053         shift32RightJamming( bSig, expDiff, &bSig );
1054         zExp = aExp;
1055     }
1056     else if ( expDiff < 0 ) {
1057         if ( bExp == 0xFF ) {
1058             if ( bSig ) return propagateFloat32NaN( a, b );
1059             return packFloat32( zSign, 0xFF, 0 );
1060         }
1061         if ( aExp == 0 ) {
1062             ++expDiff;
1063         }
1064         else {
1065             aSig |= 0x20000000;
1066         }
1067         shift32RightJamming( aSig, - expDiff, &aSig );
1068         zExp = bExp;
1069     }
1070     else {
1071         if ( aExp == 0xFF ) {
1072             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1073             return a;
1074         }
1075         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1076         zSig = 0x40000000 + aSig + bSig;
1077         zExp = aExp;
1078         goto roundAndPack;
1079     }
1080     aSig |= 0x20000000;
1081     zSig = ( aSig + bSig )<<1;
1082     --zExp;
1083     if ( (sbits32) zSig < 0 ) {
1084         zSig = aSig + bSig;
1085         ++zExp;
1086     }
1087  roundAndPack:
1088     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1089 
1090 }
1091 
1092 /*
1093 -------------------------------------------------------------------------------
1094 Returns the result of subtracting the absolute values of the single-
1095 precision floating-point values `a' and `b'.  If `zSign' is true, the
1096 difference is negated before being returned.  `zSign' is ignored if the
1097 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1098 Standard for Binary Floating-point Arithmetic.
1099 -------------------------------------------------------------------------------
1100 */
subFloat32Sigs(struct roundingData * roundData,float32 a,float32 b,flag zSign)1101 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1102 {
1103     int16 aExp, bExp, zExp;
1104     bits32 aSig, bSig, zSig;
1105     int16 expDiff;
1106 
1107     aSig = extractFloat32Frac( a );
1108     aExp = extractFloat32Exp( a );
1109     bSig = extractFloat32Frac( b );
1110     bExp = extractFloat32Exp( b );
1111     expDiff = aExp - bExp;
1112     aSig <<= 7;
1113     bSig <<= 7;
1114     if ( 0 < expDiff ) goto aExpBigger;
1115     if ( expDiff < 0 ) goto bExpBigger;
1116     if ( aExp == 0xFF ) {
1117         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1118         roundData->exception |= float_flag_invalid;
1119         return float32_default_nan;
1120     }
1121     if ( aExp == 0 ) {
1122         aExp = 1;
1123         bExp = 1;
1124     }
1125     if ( bSig < aSig ) goto aBigger;
1126     if ( aSig < bSig ) goto bBigger;
1127     return packFloat32( roundData->mode == float_round_down, 0, 0 );
1128  bExpBigger:
1129     if ( bExp == 0xFF ) {
1130         if ( bSig ) return propagateFloat32NaN( a, b );
1131         return packFloat32( zSign ^ 1, 0xFF, 0 );
1132     }
1133     if ( aExp == 0 ) {
1134         ++expDiff;
1135     }
1136     else {
1137         aSig |= 0x40000000;
1138     }
1139     shift32RightJamming( aSig, - expDiff, &aSig );
1140     bSig |= 0x40000000;
1141  bBigger:
1142     zSig = bSig - aSig;
1143     zExp = bExp;
1144     zSign ^= 1;
1145     goto normalizeRoundAndPack;
1146  aExpBigger:
1147     if ( aExp == 0xFF ) {
1148         if ( aSig ) return propagateFloat32NaN( a, b );
1149         return a;
1150     }
1151     if ( bExp == 0 ) {
1152         --expDiff;
1153     }
1154     else {
1155         bSig |= 0x40000000;
1156     }
1157     shift32RightJamming( bSig, expDiff, &bSig );
1158     aSig |= 0x40000000;
1159  aBigger:
1160     zSig = aSig - bSig;
1161     zExp = aExp;
1162  normalizeRoundAndPack:
1163     --zExp;
1164     return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1165 
1166 }
1167 
1168 /*
1169 -------------------------------------------------------------------------------
1170 Returns the result of adding the single-precision floating-point values `a'
1171 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1172 Binary Floating-point Arithmetic.
1173 -------------------------------------------------------------------------------
1174 */
float32_add(struct roundingData * roundData,float32 a,float32 b)1175 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1176 {
1177     flag aSign, bSign;
1178 
1179     aSign = extractFloat32Sign( a );
1180     bSign = extractFloat32Sign( b );
1181     if ( aSign == bSign ) {
1182         return addFloat32Sigs( roundData, a, b, aSign );
1183     }
1184     else {
1185         return subFloat32Sigs( roundData, a, b, aSign );
1186     }
1187 
1188 }
1189 
1190 /*
1191 -------------------------------------------------------------------------------
1192 Returns the result of subtracting the single-precision floating-point values
1193 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1194 for Binary Floating-point Arithmetic.
1195 -------------------------------------------------------------------------------
1196 */
float32_sub(struct roundingData * roundData,float32 a,float32 b)1197 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1198 {
1199     flag aSign, bSign;
1200 
1201     aSign = extractFloat32Sign( a );
1202     bSign = extractFloat32Sign( b );
1203     if ( aSign == bSign ) {
1204         return subFloat32Sigs( roundData, a, b, aSign );
1205     }
1206     else {
1207         return addFloat32Sigs( roundData, a, b, aSign );
1208     }
1209 
1210 }
1211 
1212 /*
1213 -------------------------------------------------------------------------------
1214 Returns the result of multiplying the single-precision floating-point values
1215 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1216 for Binary Floating-point Arithmetic.
1217 -------------------------------------------------------------------------------
1218 */
float32_mul(struct roundingData * roundData,float32 a,float32 b)1219 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1220 {
1221     flag aSign, bSign, zSign;
1222     int16 aExp, bExp, zExp;
1223     bits32 aSig, bSig;
1224     bits64 zSig64;
1225     bits32 zSig;
1226 
1227     aSig = extractFloat32Frac( a );
1228     aExp = extractFloat32Exp( a );
1229     aSign = extractFloat32Sign( a );
1230     bSig = extractFloat32Frac( b );
1231     bExp = extractFloat32Exp( b );
1232     bSign = extractFloat32Sign( b );
1233     zSign = aSign ^ bSign;
1234     if ( aExp == 0xFF ) {
1235         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1236             return propagateFloat32NaN( a, b );
1237         }
1238         if ( ( bExp | bSig ) == 0 ) {
1239             roundData->exception |= float_flag_invalid;
1240             return float32_default_nan;
1241         }
1242         return packFloat32( zSign, 0xFF, 0 );
1243     }
1244     if ( bExp == 0xFF ) {
1245         if ( bSig ) return propagateFloat32NaN( a, b );
1246         if ( ( aExp | aSig ) == 0 ) {
1247             roundData->exception |= float_flag_invalid;
1248             return float32_default_nan;
1249         }
1250         return packFloat32( zSign, 0xFF, 0 );
1251     }
1252     if ( aExp == 0 ) {
1253         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1254         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1255     }
1256     if ( bExp == 0 ) {
1257         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1258         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1259     }
1260     zExp = aExp + bExp - 0x7F;
1261     aSig = ( aSig | 0x00800000 )<<7;
1262     bSig = ( bSig | 0x00800000 )<<8;
1263     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1264     zSig = zSig64;
1265     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1266         zSig <<= 1;
1267         --zExp;
1268     }
1269     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1270 
1271 }
1272 
1273 /*
1274 -------------------------------------------------------------------------------
1275 Returns the result of dividing the single-precision floating-point value `a'
1276 by the corresponding value `b'.  The operation is performed according to the
1277 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1278 -------------------------------------------------------------------------------
1279 */
float32_div(struct roundingData * roundData,float32 a,float32 b)1280 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1281 {
1282     flag aSign, bSign, zSign;
1283     int16 aExp, bExp, zExp;
1284     bits32 aSig, bSig, zSig;
1285 
1286     aSig = extractFloat32Frac( a );
1287     aExp = extractFloat32Exp( a );
1288     aSign = extractFloat32Sign( a );
1289     bSig = extractFloat32Frac( b );
1290     bExp = extractFloat32Exp( b );
1291     bSign = extractFloat32Sign( b );
1292     zSign = aSign ^ bSign;
1293     if ( aExp == 0xFF ) {
1294         if ( aSig ) return propagateFloat32NaN( a, b );
1295         if ( bExp == 0xFF ) {
1296             if ( bSig ) return propagateFloat32NaN( a, b );
1297             roundData->exception |= float_flag_invalid;
1298             return float32_default_nan;
1299         }
1300         return packFloat32( zSign, 0xFF, 0 );
1301     }
1302     if ( bExp == 0xFF ) {
1303         if ( bSig ) return propagateFloat32NaN( a, b );
1304         return packFloat32( zSign, 0, 0 );
1305     }
1306     if ( bExp == 0 ) {
1307         if ( bSig == 0 ) {
1308             if ( ( aExp | aSig ) == 0 ) {
1309                 roundData->exception |= float_flag_invalid;
1310                 return float32_default_nan;
1311             }
1312             roundData->exception |= float_flag_divbyzero;
1313             return packFloat32( zSign, 0xFF, 0 );
1314         }
1315         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1316     }
1317     if ( aExp == 0 ) {
1318         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1319         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1320     }
1321     zExp = aExp - bExp + 0x7D;
1322     aSig = ( aSig | 0x00800000 )<<7;
1323     bSig = ( bSig | 0x00800000 )<<8;
1324     if ( bSig <= ( aSig + aSig ) ) {
1325         aSig >>= 1;
1326         ++zExp;
1327     }
1328     {
1329         bits64 tmp = ( (bits64) aSig )<<32;
1330         do_div( tmp, bSig );
1331         zSig = tmp;
1332     }
1333     if ( ( zSig & 0x3F ) == 0 ) {
1334         zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1335     }
1336     return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1337 
1338 }
1339 
1340 /*
1341 -------------------------------------------------------------------------------
1342 Returns the remainder of the single-precision floating-point value `a'
1343 with respect to the corresponding value `b'.  The operation is performed
1344 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1345 -------------------------------------------------------------------------------
1346 */
float32_rem(struct roundingData * roundData,float32 a,float32 b)1347 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1348 {
1349     flag aSign, bSign, zSign;
1350     int16 aExp, bExp, expDiff;
1351     bits32 aSig, bSig;
1352     bits32 q;
1353     bits64 aSig64, bSig64, q64;
1354     bits32 alternateASig;
1355     sbits32 sigMean;
1356 
1357     aSig = extractFloat32Frac( a );
1358     aExp = extractFloat32Exp( a );
1359     aSign = extractFloat32Sign( a );
1360     bSig = extractFloat32Frac( b );
1361     bExp = extractFloat32Exp( b );
1362     bSign = extractFloat32Sign( b );
1363     if ( aExp == 0xFF ) {
1364         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1365             return propagateFloat32NaN( a, b );
1366         }
1367         roundData->exception |= float_flag_invalid;
1368         return float32_default_nan;
1369     }
1370     if ( bExp == 0xFF ) {
1371         if ( bSig ) return propagateFloat32NaN( a, b );
1372         return a;
1373     }
1374     if ( bExp == 0 ) {
1375         if ( bSig == 0 ) {
1376             roundData->exception |= float_flag_invalid;
1377             return float32_default_nan;
1378         }
1379         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1380     }
1381     if ( aExp == 0 ) {
1382         if ( aSig == 0 ) return a;
1383         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1384     }
1385     expDiff = aExp - bExp;
1386     aSig |= 0x00800000;
1387     bSig |= 0x00800000;
1388     if ( expDiff < 32 ) {
1389         aSig <<= 8;
1390         bSig <<= 8;
1391         if ( expDiff < 0 ) {
1392             if ( expDiff < -1 ) return a;
1393             aSig >>= 1;
1394         }
1395         q = ( bSig <= aSig );
1396         if ( q ) aSig -= bSig;
1397         if ( 0 < expDiff ) {
1398             bits64 tmp = ( (bits64) aSig )<<32;
1399             do_div( tmp, bSig );
1400             q = tmp;
1401             q >>= 32 - expDiff;
1402             bSig >>= 2;
1403             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1404         }
1405         else {
1406             aSig >>= 2;
1407             bSig >>= 2;
1408         }
1409     }
1410     else {
1411         if ( bSig <= aSig ) aSig -= bSig;
1412         aSig64 = ( (bits64) aSig )<<40;
1413         bSig64 = ( (bits64) bSig )<<40;
1414         expDiff -= 64;
1415         while ( 0 < expDiff ) {
1416             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1417             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1418             aSig64 = - ( ( bSig * q64 )<<38 );
1419             expDiff -= 62;
1420         }
1421         expDiff += 64;
1422         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1423         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1424         q = q64>>( 64 - expDiff );
1425         bSig <<= 6;
1426         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1427     }
1428     do {
1429         alternateASig = aSig;
1430         ++q;
1431         aSig -= bSig;
1432     } while ( 0 <= (sbits32) aSig );
1433     sigMean = aSig + alternateASig;
1434     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1435         aSig = alternateASig;
1436     }
1437     zSign = ( (sbits32) aSig < 0 );
1438     if ( zSign ) aSig = - aSig;
1439     return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1440 
1441 }
1442 
1443 /*
1444 -------------------------------------------------------------------------------
1445 Returns the square root of the single-precision floating-point value `a'.
1446 The operation is performed according to the IEC/IEEE Standard for Binary
1447 Floating-point Arithmetic.
1448 -------------------------------------------------------------------------------
1449 */
float32_sqrt(struct roundingData * roundData,float32 a)1450 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1451 {
1452     flag aSign;
1453     int16 aExp, zExp;
1454     bits32 aSig, zSig;
1455     bits64 rem, term;
1456 
1457     aSig = extractFloat32Frac( a );
1458     aExp = extractFloat32Exp( a );
1459     aSign = extractFloat32Sign( a );
1460     if ( aExp == 0xFF ) {
1461         if ( aSig ) return propagateFloat32NaN( a, 0 );
1462         if ( ! aSign ) return a;
1463         roundData->exception |= float_flag_invalid;
1464         return float32_default_nan;
1465     }
1466     if ( aSign ) {
1467         if ( ( aExp | aSig ) == 0 ) return a;
1468         roundData->exception |= float_flag_invalid;
1469         return float32_default_nan;
1470     }
1471     if ( aExp == 0 ) {
1472         if ( aSig == 0 ) return 0;
1473         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1474     }
1475     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1476     aSig = ( aSig | 0x00800000 )<<8;
1477     zSig = estimateSqrt32( aExp, aSig ) + 2;
1478     if ( ( zSig & 0x7F ) <= 5 ) {
1479         if ( zSig < 2 ) {
1480             zSig = 0xFFFFFFFF;
1481         }
1482         else {
1483             aSig >>= aExp & 1;
1484             term = ( (bits64) zSig ) * zSig;
1485             rem = ( ( (bits64) aSig )<<32 ) - term;
1486             while ( (sbits64) rem < 0 ) {
1487                 --zSig;
1488                 rem += ( ( (bits64) zSig )<<1 ) | 1;
1489             }
1490             zSig |= ( rem != 0 );
1491         }
1492     }
1493     shift32RightJamming( zSig, 1, &zSig );
1494     return roundAndPackFloat32( roundData, 0, zExp, zSig );
1495 
1496 }
1497 
1498 /*
1499 -------------------------------------------------------------------------------
1500 Returns 1 if the single-precision floating-point value `a' is equal to the
1501 corresponding value `b', and 0 otherwise.  The comparison is performed
1502 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503 -------------------------------------------------------------------------------
1504 */
float32_eq(float32 a,float32 b)1505 flag float32_eq( float32 a, float32 b )
1506 {
1507 
1508     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510        ) {
1511         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1512             float_raise( float_flag_invalid );
1513         }
1514         return 0;
1515     }
1516     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517 
1518 }
1519 
1520 /*
1521 -------------------------------------------------------------------------------
1522 Returns 1 if the single-precision floating-point value `a' is less than or
1523 equal to the corresponding value `b', and 0 otherwise.  The comparison is
1524 performed according to the IEC/IEEE Standard for Binary Floating-point
1525 Arithmetic.
1526 -------------------------------------------------------------------------------
1527 */
float32_le(float32 a,float32 b)1528 flag float32_le( float32 a, float32 b )
1529 {
1530     flag aSign, bSign;
1531 
1532     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534        ) {
1535         float_raise( float_flag_invalid );
1536         return 0;
1537     }
1538     aSign = extractFloat32Sign( a );
1539     bSign = extractFloat32Sign( b );
1540     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541     return ( a == b ) || ( aSign ^ ( a < b ) );
1542 
1543 }
1544 
1545 /*
1546 -------------------------------------------------------------------------------
1547 Returns 1 if the single-precision floating-point value `a' is less than
1548 the corresponding value `b', and 0 otherwise.  The comparison is performed
1549 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1551 */
float32_lt(float32 a,float32 b)1552 flag float32_lt( float32 a, float32 b )
1553 {
1554     flag aSign, bSign;
1555 
1556     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558        ) {
1559         float_raise( float_flag_invalid );
1560         return 0;
1561     }
1562     aSign = extractFloat32Sign( a );
1563     bSign = extractFloat32Sign( b );
1564     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565     return ( a != b ) && ( aSign ^ ( a < b ) );
1566 
1567 }
1568 
1569 /*
1570 -------------------------------------------------------------------------------
1571 Returns 1 if the single-precision floating-point value `a' is equal to the
1572 corresponding value `b', and 0 otherwise.  The invalid exception is raised
1573 if either operand is a NaN.  Otherwise, the comparison is performed
1574 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575 -------------------------------------------------------------------------------
1576 */
float32_eq_signaling(float32 a,float32 b)1577 flag float32_eq_signaling( float32 a, float32 b )
1578 {
1579 
1580     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582        ) {
1583         float_raise( float_flag_invalid );
1584         return 0;
1585     }
1586     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587 
1588 }
1589 
1590 /*
1591 -------------------------------------------------------------------------------
1592 Returns 1 if the single-precision floating-point value `a' is less than or
1593 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1594 cause an exception.  Otherwise, the comparison is performed according to the
1595 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596 -------------------------------------------------------------------------------
1597 */
float32_le_quiet(float32 a,float32 b)1598 flag float32_le_quiet( float32 a, float32 b )
1599 {
1600     flag aSign, bSign;
1601     //int16 aExp, bExp;
1602 
1603     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605        ) {
1606         /* Do nothing, even if NaN as we're quiet */
1607         return 0;
1608     }
1609     aSign = extractFloat32Sign( a );
1610     bSign = extractFloat32Sign( b );
1611     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1612     return ( a == b ) || ( aSign ^ ( a < b ) );
1613 
1614 }
1615 
1616 /*
1617 -------------------------------------------------------------------------------
1618 Returns 1 if the single-precision floating-point value `a' is less than
1619 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1620 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1621 Standard for Binary Floating-point Arithmetic.
1622 -------------------------------------------------------------------------------
1623 */
float32_lt_quiet(float32 a,float32 b)1624 flag float32_lt_quiet( float32 a, float32 b )
1625 {
1626     flag aSign, bSign;
1627 
1628     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1629          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1630        ) {
1631         /* Do nothing, even if NaN as we're quiet */
1632         return 0;
1633     }
1634     aSign = extractFloat32Sign( a );
1635     bSign = extractFloat32Sign( b );
1636     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637     return ( a != b ) && ( aSign ^ ( a < b ) );
1638 
1639 }
1640 
1641 /*
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format.  The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1647 according to the current rounding mode.  If `a' is a NaN, the largest
1648 positive integer is returned.  Otherwise, if the conversion overflows, the
1649 largest integer with the same sign as `a' is returned.
1650 -------------------------------------------------------------------------------
1651 */
float64_to_int32(struct roundingData * roundData,float64 a)1652 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1653 {
1654     flag aSign;
1655     int16 aExp, shiftCount;
1656     bits64 aSig;
1657 
1658     aSig = extractFloat64Frac( a );
1659     aExp = extractFloat64Exp( a );
1660     aSign = extractFloat64Sign( a );
1661     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663     shiftCount = 0x42C - aExp;
1664     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665     return roundAndPackInt32( roundData, aSign, aSig );
1666 
1667 }
1668 
1669 /*
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format.  The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1674 Arithmetic, except that the conversion is always rounded toward zero.  If
1675 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1676 conversion overflows, the largest integer with the same sign as `a' is
1677 returned.
1678 -------------------------------------------------------------------------------
1679 */
float64_to_int32_round_to_zero(float64 a)1680 int32 float64_to_int32_round_to_zero( float64 a )
1681 {
1682     flag aSign;
1683     int16 aExp, shiftCount;
1684     bits64 aSig, savedASig;
1685     int32 z;
1686 
1687     aSig = extractFloat64Frac( a );
1688     aExp = extractFloat64Exp( a );
1689     aSign = extractFloat64Sign( a );
1690     shiftCount = 0x433 - aExp;
1691     if ( shiftCount < 21 ) {
1692         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1693         goto invalid;
1694     }
1695     else if ( 52 < shiftCount ) {
1696         if ( aExp || aSig ) float_raise( float_flag_inexact );
1697         return 0;
1698     }
1699     aSig |= LIT64( 0x0010000000000000 );
1700     savedASig = aSig;
1701     aSig >>= shiftCount;
1702     z = aSig;
1703     if ( aSign ) z = - z;
1704     if ( ( z < 0 ) ^ aSign ) {
1705  invalid:
1706         float_raise( float_flag_invalid );
1707         return aSign ? 0x80000000 : 0x7FFFFFFF;
1708     }
1709     if ( ( aSig<<shiftCount ) != savedASig ) {
1710         float_raise( float_flag_inexact );
1711     }
1712     return z;
1713 
1714 }
1715 
1716 /*
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit two's complement unsigned integer format.  The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1722 according to the current rounding mode.  If `a' is a NaN, the largest
1723 positive integer is returned.  Otherwise, if the conversion overflows, the
1724 largest positive integer is returned.
1725 -------------------------------------------------------------------------------
1726 */
float64_to_uint32(struct roundingData * roundData,float64 a)1727 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1728 {
1729     flag aSign;
1730     int16 aExp, shiftCount;
1731     bits64 aSig;
1732 
1733     aSig = extractFloat64Frac( a );
1734     aExp = extractFloat64Exp( a );
1735     aSign = 0; //extractFloat64Sign( a );
1736     //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738     shiftCount = 0x42C - aExp;
1739     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740     return roundAndPackInt32( roundData, aSign, aSig );
1741 }
1742 
1743 /*
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit two's complement integer format.  The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1748 Arithmetic, except that the conversion is always rounded toward zero.  If
1749 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1750 conversion overflows, the largest positive integer is returned.
1751 -------------------------------------------------------------------------------
1752 */
float64_to_uint32_round_to_zero(float64 a)1753 int32 float64_to_uint32_round_to_zero( float64 a )
1754 {
1755     flag aSign;
1756     int16 aExp, shiftCount;
1757     bits64 aSig, savedASig;
1758     int32 z;
1759 
1760     aSig = extractFloat64Frac( a );
1761     aExp = extractFloat64Exp( a );
1762     aSign = extractFloat64Sign( a );
1763     shiftCount = 0x433 - aExp;
1764     if ( shiftCount < 21 ) {
1765         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1766         goto invalid;
1767     }
1768     else if ( 52 < shiftCount ) {
1769         if ( aExp || aSig ) float_raise( float_flag_inexact );
1770         return 0;
1771     }
1772     aSig |= LIT64( 0x0010000000000000 );
1773     savedASig = aSig;
1774     aSig >>= shiftCount;
1775     z = aSig;
1776     if ( aSign ) z = - z;
1777     if ( ( z < 0 ) ^ aSign ) {
1778  invalid:
1779         float_raise( float_flag_invalid );
1780         return aSign ? 0x80000000 : 0x7FFFFFFF;
1781     }
1782     if ( ( aSig<<shiftCount ) != savedASig ) {
1783         float_raise( float_flag_inexact );
1784     }
1785     return z;
1786 }
1787 
1788 /*
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format.  The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1793 Arithmetic.
1794 -------------------------------------------------------------------------------
1795 */
float64_to_float32(struct roundingData * roundData,float64 a)1796 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1797 {
1798     flag aSign;
1799     int16 aExp;
1800     bits64 aSig;
1801     bits32 zSig;
1802 
1803     aSig = extractFloat64Frac( a );
1804     aExp = extractFloat64Exp( a );
1805     aSign = extractFloat64Sign( a );
1806     if ( aExp == 0x7FF ) {
1807         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808         return packFloat32( aSign, 0xFF, 0 );
1809     }
1810     shift64RightJamming( aSig, 22, &aSig );
1811     zSig = aSig;
1812     if ( aExp || zSig ) {
1813         zSig |= 0x40000000;
1814         aExp -= 0x381;
1815     }
1816     return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1817 
1818 }
1819 
1820 #ifdef FLOATX80
1821 
1822 /*
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format.  The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1827 Arithmetic.
1828 -------------------------------------------------------------------------------
1829 */
float64_to_floatx80(float64 a)1830 floatx80 float64_to_floatx80( float64 a )
1831 {
1832     flag aSign;
1833     int16 aExp;
1834     bits64 aSig;
1835 
1836     aSig = extractFloat64Frac( a );
1837     aExp = extractFloat64Exp( a );
1838     aSign = extractFloat64Sign( a );
1839     if ( aExp == 0x7FF ) {
1840         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1842     }
1843     if ( aExp == 0 ) {
1844         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1846     }
1847     return
1848         packFloatx80(
1849             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1850 
1851 }
1852 
1853 #endif
1854 
1855 /*
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value.  The
1859 operation is performed according to the IEC/IEEE Standard for Binary
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1862 */
float64_round_to_int(struct roundingData * roundData,float64 a)1863 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1864 {
1865     flag aSign;
1866     int16 aExp;
1867     bits64 lastBitMask, roundBitsMask;
1868     int8 roundingMode;
1869     float64 z;
1870 
1871     aExp = extractFloat64Exp( a );
1872     if ( 0x433 <= aExp ) {
1873         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874             return propagateFloat64NaN( a, a );
1875         }
1876         return a;
1877     }
1878     if ( aExp <= 0x3FE ) {
1879         if ( (bits64) ( a<<1 ) == 0 ) return a;
1880         roundData->exception |= float_flag_inexact;
1881         aSign = extractFloat64Sign( a );
1882         switch ( roundData->mode ) {
1883          case float_round_nearest_even:
1884             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885                 return packFloat64( aSign, 0x3FF, 0 );
1886             }
1887             break;
1888          case float_round_down:
1889             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890          case float_round_up:
1891             return
1892             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1893         }
1894         return packFloat64( aSign, 0, 0 );
1895     }
1896     lastBitMask = 1;
1897     lastBitMask <<= 0x433 - aExp;
1898     roundBitsMask = lastBitMask - 1;
1899     z = a;
1900     roundingMode = roundData->mode;
1901     if ( roundingMode == float_round_nearest_even ) {
1902         z += lastBitMask>>1;
1903         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1904     }
1905     else if ( roundingMode != float_round_to_zero ) {
1906         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1907             z += roundBitsMask;
1908         }
1909     }
1910     z &= ~ roundBitsMask;
1911     if ( z != a ) roundData->exception |= float_flag_inexact;
1912     return z;
1913 
1914 }
1915 
1916 /*
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1920 before being returned.  `zSign' is ignored if the result is a NaN.  The
1921 addition is performed according to the IEC/IEEE Standard for Binary
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1924 */
addFloat64Sigs(struct roundingData * roundData,float64 a,float64 b,flag zSign)1925 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1926 {
1927     int16 aExp, bExp, zExp;
1928     bits64 aSig, bSig, zSig;
1929     int16 expDiff;
1930 
1931     aSig = extractFloat64Frac( a );
1932     aExp = extractFloat64Exp( a );
1933     bSig = extractFloat64Frac( b );
1934     bExp = extractFloat64Exp( b );
1935     expDiff = aExp - bExp;
1936     aSig <<= 9;
1937     bSig <<= 9;
1938     if ( 0 < expDiff ) {
1939         if ( aExp == 0x7FF ) {
1940             if ( aSig ) return propagateFloat64NaN( a, b );
1941             return a;
1942         }
1943         if ( bExp == 0 ) {
1944             --expDiff;
1945         }
1946         else {
1947             bSig |= LIT64( 0x2000000000000000 );
1948         }
1949         shift64RightJamming( bSig, expDiff, &bSig );
1950         zExp = aExp;
1951     }
1952     else if ( expDiff < 0 ) {
1953         if ( bExp == 0x7FF ) {
1954             if ( bSig ) return propagateFloat64NaN( a, b );
1955             return packFloat64( zSign, 0x7FF, 0 );
1956         }
1957         if ( aExp == 0 ) {
1958             ++expDiff;
1959         }
1960         else {
1961             aSig |= LIT64( 0x2000000000000000 );
1962         }
1963         shift64RightJamming( aSig, - expDiff, &aSig );
1964         zExp = bExp;
1965     }
1966     else {
1967         if ( aExp == 0x7FF ) {
1968             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1969             return a;
1970         }
1971         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1973         zExp = aExp;
1974         goto roundAndPack;
1975     }
1976     aSig |= LIT64( 0x2000000000000000 );
1977     zSig = ( aSig + bSig )<<1;
1978     --zExp;
1979     if ( (sbits64) zSig < 0 ) {
1980         zSig = aSig + bSig;
1981         ++zExp;
1982     }
1983  roundAndPack:
1984     return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1985 
1986 }
1987 
1988 /*
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'.  If `zSign' is true, the
1992 difference is negated before being returned.  `zSign' is ignored if the
1993 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
1996 */
subFloat64Sigs(struct roundingData * roundData,float64 a,float64 b,flag zSign)1997 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1998 {
1999     int16 aExp, bExp, zExp;
2000     bits64 aSig, bSig, zSig;
2001     int16 expDiff;
2002 
2003     aSig = extractFloat64Frac( a );
2004     aExp = extractFloat64Exp( a );
2005     bSig = extractFloat64Frac( b );
2006     bExp = extractFloat64Exp( b );
2007     expDiff = aExp - bExp;
2008     aSig <<= 10;
2009     bSig <<= 10;
2010     if ( 0 < expDiff ) goto aExpBigger;
2011     if ( expDiff < 0 ) goto bExpBigger;
2012     if ( aExp == 0x7FF ) {
2013         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014         roundData->exception |= float_flag_invalid;
2015         return float64_default_nan;
2016     }
2017     if ( aExp == 0 ) {
2018         aExp = 1;
2019         bExp = 1;
2020     }
2021     if ( bSig < aSig ) goto aBigger;
2022     if ( aSig < bSig ) goto bBigger;
2023     return packFloat64( roundData->mode == float_round_down, 0, 0 );
2024  bExpBigger:
2025     if ( bExp == 0x7FF ) {
2026         if ( bSig ) return propagateFloat64NaN( a, b );
2027         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2028     }
2029     if ( aExp == 0 ) {
2030         ++expDiff;
2031     }
2032     else {
2033         aSig |= LIT64( 0x4000000000000000 );
2034     }
2035     shift64RightJamming( aSig, - expDiff, &aSig );
2036     bSig |= LIT64( 0x4000000000000000 );
2037  bBigger:
2038     zSig = bSig - aSig;
2039     zExp = bExp;
2040     zSign ^= 1;
2041     goto normalizeRoundAndPack;
2042  aExpBigger:
2043     if ( aExp == 0x7FF ) {
2044         if ( aSig ) return propagateFloat64NaN( a, b );
2045         return a;
2046     }
2047     if ( bExp == 0 ) {
2048         --expDiff;
2049     }
2050     else {
2051         bSig |= LIT64( 0x4000000000000000 );
2052     }
2053     shift64RightJamming( bSig, expDiff, &bSig );
2054     aSig |= LIT64( 0x4000000000000000 );
2055  aBigger:
2056     zSig = aSig - bSig;
2057     zExp = aExp;
2058  normalizeRoundAndPack:
2059     --zExp;
2060     return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2061 
2062 }
2063 
2064 /*
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2067 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2070 */
float64_add(struct roundingData * roundData,float64 a,float64 b)2071 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2072 {
2073     flag aSign, bSign;
2074 
2075     aSign = extractFloat64Sign( a );
2076     bSign = extractFloat64Sign( b );
2077     if ( aSign == bSign ) {
2078         return addFloat64Sigs( roundData, a, b, aSign );
2079     }
2080     else {
2081         return subFloat64Sigs( roundData, a, b, aSign );
2082     }
2083 
2084 }
2085 
2086 /*
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2089 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2092 */
float64_sub(struct roundingData * roundData,float64 a,float64 b)2093 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2094 {
2095     flag aSign, bSign;
2096 
2097     aSign = extractFloat64Sign( a );
2098     bSign = extractFloat64Sign( b );
2099     if ( aSign == bSign ) {
2100         return subFloat64Sigs( roundData, a, b, aSign );
2101     }
2102     else {
2103         return addFloat64Sigs( roundData, a, b, aSign );
2104     }
2105 
2106 }
2107 
2108 /*
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2111 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2114 */
float64_mul(struct roundingData * roundData,float64 a,float64 b)2115 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2116 {
2117     flag aSign, bSign, zSign;
2118     int16 aExp, bExp, zExp;
2119     bits64 aSig, bSig, zSig0, zSig1;
2120 
2121     aSig = extractFloat64Frac( a );
2122     aExp = extractFloat64Exp( a );
2123     aSign = extractFloat64Sign( a );
2124     bSig = extractFloat64Frac( b );
2125     bExp = extractFloat64Exp( b );
2126     bSign = extractFloat64Sign( b );
2127     zSign = aSign ^ bSign;
2128     if ( aExp == 0x7FF ) {
2129         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130             return propagateFloat64NaN( a, b );
2131         }
2132         if ( ( bExp | bSig ) == 0 ) {
2133             roundData->exception |= float_flag_invalid;
2134             return float64_default_nan;
2135         }
2136         return packFloat64( zSign, 0x7FF, 0 );
2137     }
2138     if ( bExp == 0x7FF ) {
2139         if ( bSig ) return propagateFloat64NaN( a, b );
2140         if ( ( aExp | aSig ) == 0 ) {
2141             roundData->exception |= float_flag_invalid;
2142             return float64_default_nan;
2143         }
2144         return packFloat64( zSign, 0x7FF, 0 );
2145     }
2146     if ( aExp == 0 ) {
2147         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2149     }
2150     if ( bExp == 0 ) {
2151         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2153     }
2154     zExp = aExp + bExp - 0x3FF;
2155     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158     zSig0 |= ( zSig1 != 0 );
2159     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2160         zSig0 <<= 1;
2161         --zExp;
2162     }
2163     return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2164 
2165 }
2166 
2167 /*
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2170 by the corresponding value `b'.  The operation is performed according to
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2173 */
float64_div(struct roundingData * roundData,float64 a,float64 b)2174 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2175 {
2176     flag aSign, bSign, zSign;
2177     int16 aExp, bExp, zExp;
2178     bits64 aSig, bSig, zSig;
2179     bits64 rem0, rem1;
2180     bits64 term0, term1;
2181 
2182     aSig = extractFloat64Frac( a );
2183     aExp = extractFloat64Exp( a );
2184     aSign = extractFloat64Sign( a );
2185     bSig = extractFloat64Frac( b );
2186     bExp = extractFloat64Exp( b );
2187     bSign = extractFloat64Sign( b );
2188     zSign = aSign ^ bSign;
2189     if ( aExp == 0x7FF ) {
2190         if ( aSig ) return propagateFloat64NaN( a, b );
2191         if ( bExp == 0x7FF ) {
2192             if ( bSig ) return propagateFloat64NaN( a, b );
2193             roundData->exception |= float_flag_invalid;
2194             return float64_default_nan;
2195         }
2196         return packFloat64( zSign, 0x7FF, 0 );
2197     }
2198     if ( bExp == 0x7FF ) {
2199         if ( bSig ) return propagateFloat64NaN( a, b );
2200         return packFloat64( zSign, 0, 0 );
2201     }
2202     if ( bExp == 0 ) {
2203         if ( bSig == 0 ) {
2204             if ( ( aExp | aSig ) == 0 ) {
2205                 roundData->exception |= float_flag_invalid;
2206                 return float64_default_nan;
2207             }
2208             roundData->exception |= float_flag_divbyzero;
2209             return packFloat64( zSign, 0x7FF, 0 );
2210         }
2211         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2212     }
2213     if ( aExp == 0 ) {
2214         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2216     }
2217     zExp = aExp - bExp + 0x3FD;
2218     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220     if ( bSig <= ( aSig + aSig ) ) {
2221         aSig >>= 1;
2222         ++zExp;
2223     }
2224     zSig = estimateDiv128To64( aSig, 0, bSig );
2225     if ( ( zSig & 0x1FF ) <= 2 ) {
2226         mul64To128( bSig, zSig, &term0, &term1 );
2227         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228         while ( (sbits64) rem0 < 0 ) {
2229             --zSig;
2230             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2231         }
2232         zSig |= ( rem1 != 0 );
2233     }
2234     return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2235 
2236 }
2237 
2238 /*
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2241 with respect to the corresponding value `b'.  The operation is performed
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2244 */
float64_rem(struct roundingData * roundData,float64 a,float64 b)2245 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2246 {
2247     flag aSign, bSign, zSign;
2248     int16 aExp, bExp, expDiff;
2249     bits64 aSig, bSig;
2250     bits64 q, alternateASig;
2251     sbits64 sigMean;
2252 
2253     aSig = extractFloat64Frac( a );
2254     aExp = extractFloat64Exp( a );
2255     aSign = extractFloat64Sign( a );
2256     bSig = extractFloat64Frac( b );
2257     bExp = extractFloat64Exp( b );
2258     bSign = extractFloat64Sign( b );
2259     if ( aExp == 0x7FF ) {
2260         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261             return propagateFloat64NaN( a, b );
2262         }
2263         roundData->exception |= float_flag_invalid;
2264         return float64_default_nan;
2265     }
2266     if ( bExp == 0x7FF ) {
2267         if ( bSig ) return propagateFloat64NaN( a, b );
2268         return a;
2269     }
2270     if ( bExp == 0 ) {
2271         if ( bSig == 0 ) {
2272             roundData->exception |= float_flag_invalid;
2273             return float64_default_nan;
2274         }
2275         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2276     }
2277     if ( aExp == 0 ) {
2278         if ( aSig == 0 ) return a;
2279         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2280     }
2281     expDiff = aExp - bExp;
2282     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284     if ( expDiff < 0 ) {
2285         if ( expDiff < -1 ) return a;
2286         aSig >>= 1;
2287     }
2288     q = ( bSig <= aSig );
2289     if ( q ) aSig -= bSig;
2290     expDiff -= 64;
2291     while ( 0 < expDiff ) {
2292         q = estimateDiv128To64( aSig, 0, bSig );
2293         q = ( 2 < q ) ? q - 2 : 0;
2294         aSig = - ( ( bSig>>2 ) * q );
2295         expDiff -= 62;
2296     }
2297     expDiff += 64;
2298     if ( 0 < expDiff ) {
2299         q = estimateDiv128To64( aSig, 0, bSig );
2300         q = ( 2 < q ) ? q - 2 : 0;
2301         q >>= 64 - expDiff;
2302         bSig >>= 2;
2303         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2304     }
2305     else {
2306         aSig >>= 2;
2307         bSig >>= 2;
2308     }
2309     do {
2310         alternateASig = aSig;
2311         ++q;
2312         aSig -= bSig;
2313     } while ( 0 <= (sbits64) aSig );
2314     sigMean = aSig + alternateASig;
2315     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316         aSig = alternateASig;
2317     }
2318     zSign = ( (sbits64) aSig < 0 );
2319     if ( zSign ) aSig = - aSig;
2320     return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2321 
2322 }
2323 
2324 /*
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2327 The operation is performed according to the IEC/IEEE Standard for Binary
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2330 */
float64_sqrt(struct roundingData * roundData,float64 a)2331 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2332 {
2333     flag aSign;
2334     int16 aExp, zExp;
2335     bits64 aSig, zSig;
2336     bits64 rem0, rem1, term0, term1; //, shiftedRem;
2337     //float64 z;
2338 
2339     aSig = extractFloat64Frac( a );
2340     aExp = extractFloat64Exp( a );
2341     aSign = extractFloat64Sign( a );
2342     if ( aExp == 0x7FF ) {
2343         if ( aSig ) return propagateFloat64NaN( a, a );
2344         if ( ! aSign ) return a;
2345         roundData->exception |= float_flag_invalid;
2346         return float64_default_nan;
2347     }
2348     if ( aSign ) {
2349         if ( ( aExp | aSig ) == 0 ) return a;
2350         roundData->exception |= float_flag_invalid;
2351         return float64_default_nan;
2352     }
2353     if ( aExp == 0 ) {
2354         if ( aSig == 0 ) return 0;
2355         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2356     }
2357     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358     aSig |= LIT64( 0x0010000000000000 );
2359     zSig = estimateSqrt32( aExp, aSig>>21 );
2360     zSig <<= 31;
2361     aSig <<= 9 - ( aExp & 1 );
2362     zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363     if ( ( zSig & 0x3FF ) <= 5 ) {
2364         if ( zSig < 2 ) {
2365             zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2366         }
2367         else {
2368             aSig <<= 2;
2369             mul64To128( zSig, zSig, &term0, &term1 );
2370             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371             while ( (sbits64) rem0 < 0 ) {
2372                 --zSig;
2373                 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2374                 term1 |= 1;
2375                 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2376             }
2377             zSig |= ( ( rem0 | rem1 ) != 0 );
2378         }
2379     }
2380     shift64RightJamming( zSig, 1, &zSig );
2381     return roundAndPackFloat64( roundData, 0, zExp, zSig );
2382 
2383 }
2384 
2385 /*
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2388 corresponding value `b', and 0 otherwise.  The comparison is performed
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2391 */
float64_eq(float64 a,float64 b)2392 flag float64_eq( float64 a, float64 b )
2393 {
2394 
2395     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2397        ) {
2398         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399             float_raise( float_flag_invalid );
2400         }
2401         return 0;
2402     }
2403     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2404 
2405 }
2406 
2407 /*
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2410 equal to the corresponding value `b', and 0 otherwise.  The comparison is
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2412 Arithmetic.
2413 -------------------------------------------------------------------------------
2414 */
float64_le(float64 a,float64 b)2415 flag float64_le( float64 a, float64 b )
2416 {
2417     flag aSign, bSign;
2418 
2419     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2421        ) {
2422         float_raise( float_flag_invalid );
2423         return 0;
2424     }
2425     aSign = extractFloat64Sign( a );
2426     bSign = extractFloat64Sign( b );
2427     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428     return ( a == b ) || ( aSign ^ ( a < b ) );
2429 
2430 }
2431 
2432 /*
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2435 the corresponding value `b', and 0 otherwise.  The comparison is performed
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2438 */
float64_lt(float64 a,float64 b)2439 flag float64_lt( float64 a, float64 b )
2440 {
2441     flag aSign, bSign;
2442 
2443     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2445        ) {
2446         float_raise( float_flag_invalid );
2447         return 0;
2448     }
2449     aSign = extractFloat64Sign( a );
2450     bSign = extractFloat64Sign( b );
2451     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452     return ( a != b ) && ( aSign ^ ( a < b ) );
2453 
2454 }
2455 
2456 /*
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2459 corresponding value `b', and 0 otherwise.  The invalid exception is raised
2460 if either operand is a NaN.  Otherwise, the comparison is performed
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2463 */
float64_eq_signaling(float64 a,float64 b)2464 flag float64_eq_signaling( float64 a, float64 b )
2465 {
2466 
2467     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2469        ) {
2470         float_raise( float_flag_invalid );
2471         return 0;
2472     }
2473     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2474 
2475 }
2476 
2477 /*
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2480 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2481 cause an exception.  Otherwise, the comparison is performed according to the
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2484 */
float64_le_quiet(float64 a,float64 b)2485 flag float64_le_quiet( float64 a, float64 b )
2486 {
2487     flag aSign, bSign;
2488     //int16 aExp, bExp;
2489 
2490     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2492        ) {
2493         /* Do nothing, even if NaN as we're quiet */
2494         return 0;
2495     }
2496     aSign = extractFloat64Sign( a );
2497     bSign = extractFloat64Sign( b );
2498     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2499     return ( a == b ) || ( aSign ^ ( a < b ) );
2500 
2501 }
2502 
2503 /*
2504 -------------------------------------------------------------------------------
2505 Returns 1 if the double-precision floating-point value `a' is less than
2506 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2507 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2508 Standard for Binary Floating-point Arithmetic.
2509 -------------------------------------------------------------------------------
2510 */
float64_lt_quiet(float64 a,float64 b)2511 flag float64_lt_quiet( float64 a, float64 b )
2512 {
2513     flag aSign, bSign;
2514 
2515     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2516          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2517        ) {
2518         /* Do nothing, even if NaN as we're quiet */
2519         return 0;
2520     }
2521     aSign = extractFloat64Sign( a );
2522     bSign = extractFloat64Sign( b );
2523     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2524     return ( a != b ) && ( aSign ^ ( a < b ) );
2525 
2526 }
2527 
2528 #ifdef FLOATX80
2529 
2530 /*
2531 -------------------------------------------------------------------------------
2532 Returns the result of converting the extended double-precision floating-
2533 point value `a' to the 32-bit two's complement integer format.  The
2534 conversion is performed according to the IEC/IEEE Standard for Binary
2535 Floating-point Arithmetic---which means in particular that the conversion
2536 is rounded according to the current rounding mode.  If `a' is a NaN, the
2537 largest positive integer is returned.  Otherwise, if the conversion
2538 overflows, the largest integer with the same sign as `a' is returned.
2539 -------------------------------------------------------------------------------
2540 */
floatx80_to_int32(struct roundingData * roundData,floatx80 a)2541 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2542 {
2543     flag aSign;
2544     int32 aExp, shiftCount;
2545     bits64 aSig;
2546 
2547     aSig = extractFloatx80Frac( a );
2548     aExp = extractFloatx80Exp( a );
2549     aSign = extractFloatx80Sign( a );
2550     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2551     shiftCount = 0x4037 - aExp;
2552     if ( shiftCount <= 0 ) shiftCount = 1;
2553     shift64RightJamming( aSig, shiftCount, &aSig );
2554     return roundAndPackInt32( roundData, aSign, aSig );
2555 
2556 }
2557 
2558 /*
2559 -------------------------------------------------------------------------------
2560 Returns the result of converting the extended double-precision floating-
2561 point value `a' to the 32-bit two's complement integer format.  The
2562 conversion is performed according to the IEC/IEEE Standard for Binary
2563 Floating-point Arithmetic, except that the conversion is always rounded
2564 toward zero.  If `a' is a NaN, the largest positive integer is returned.
2565 Otherwise, if the conversion overflows, the largest integer with the same
2566 sign as `a' is returned.
2567 -------------------------------------------------------------------------------
2568 */
floatx80_to_int32_round_to_zero(floatx80 a)2569 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2570 {
2571     flag aSign;
2572     int32 aExp, shiftCount;
2573     bits64 aSig, savedASig;
2574     int32 z;
2575 
2576     aSig = extractFloatx80Frac( a );
2577     aExp = extractFloatx80Exp( a );
2578     aSign = extractFloatx80Sign( a );
2579     shiftCount = 0x403E - aExp;
2580     if ( shiftCount < 32 ) {
2581         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2582         goto invalid;
2583     }
2584     else if ( 63 < shiftCount ) {
2585         if ( aExp || aSig ) float_raise( float_flag_inexact );
2586         return 0;
2587     }
2588     savedASig = aSig;
2589     aSig >>= shiftCount;
2590     z = aSig;
2591     if ( aSign ) z = - z;
2592     if ( ( z < 0 ) ^ aSign ) {
2593  invalid:
2594         float_raise( float_flag_invalid );
2595         return aSign ? 0x80000000 : 0x7FFFFFFF;
2596     }
2597     if ( ( aSig<<shiftCount ) != savedASig ) {
2598         float_raise( float_flag_inexact );
2599     }
2600     return z;
2601 
2602 }
2603 
2604 /*
2605 -------------------------------------------------------------------------------
2606 Returns the result of converting the extended double-precision floating-
2607 point value `a' to the single-precision floating-point format.  The
2608 conversion is performed according to the IEC/IEEE Standard for Binary
2609 Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2611 */
floatx80_to_float32(struct roundingData * roundData,floatx80 a)2612 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2613 {
2614     flag aSign;
2615     int32 aExp;
2616     bits64 aSig;
2617 
2618     aSig = extractFloatx80Frac( a );
2619     aExp = extractFloatx80Exp( a );
2620     aSign = extractFloatx80Sign( a );
2621     if ( aExp == 0x7FFF ) {
2622         if ( (bits64) ( aSig<<1 ) ) {
2623             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2624         }
2625         return packFloat32( aSign, 0xFF, 0 );
2626     }
2627     shift64RightJamming( aSig, 33, &aSig );
2628     if ( aExp || aSig ) aExp -= 0x3F81;
2629     return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2630 
2631 }
2632 
2633 /*
2634 -------------------------------------------------------------------------------
2635 Returns the result of converting the extended double-precision floating-
2636 point value `a' to the double-precision floating-point format.  The
2637 conversion is performed according to the IEC/IEEE Standard for Binary
2638 Floating-point Arithmetic.
2639 -------------------------------------------------------------------------------
2640 */
floatx80_to_float64(struct roundingData * roundData,floatx80 a)2641 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2642 {
2643     flag aSign;
2644     int32 aExp;
2645     bits64 aSig, zSig;
2646 
2647     aSig = extractFloatx80Frac( a );
2648     aExp = extractFloatx80Exp( a );
2649     aSign = extractFloatx80Sign( a );
2650     if ( aExp == 0x7FFF ) {
2651         if ( (bits64) ( aSig<<1 ) ) {
2652             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2653         }
2654         return packFloat64( aSign, 0x7FF, 0 );
2655     }
2656     shift64RightJamming( aSig, 1, &zSig );
2657     if ( aExp || aSig ) aExp -= 0x3C01;
2658     return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2659 
2660 }
2661 
2662 /*
2663 -------------------------------------------------------------------------------
2664 Rounds the extended double-precision floating-point value `a' to an integer,
2665 and returns the result as an extended quadruple-precision floating-point
2666 value.  The operation is performed according to the IEC/IEEE Standard for
2667 Binary Floating-point Arithmetic.
2668 -------------------------------------------------------------------------------
2669 */
floatx80_round_to_int(struct roundingData * roundData,floatx80 a)2670 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2671 {
2672     flag aSign;
2673     int32 aExp;
2674     bits64 lastBitMask, roundBitsMask;
2675     int8 roundingMode;
2676     floatx80 z;
2677 
2678     aExp = extractFloatx80Exp( a );
2679     if ( 0x403E <= aExp ) {
2680         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2681             return propagateFloatx80NaN( a, a );
2682         }
2683         return a;
2684     }
2685     if ( aExp <= 0x3FFE ) {
2686         if (    ( aExp == 0 )
2687              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2688             return a;
2689         }
2690         roundData->exception |= float_flag_inexact;
2691         aSign = extractFloatx80Sign( a );
2692         switch ( roundData->mode ) {
2693          case float_round_nearest_even:
2694             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2695                ) {
2696                 return
2697                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2698             }
2699             break;
2700          case float_round_down:
2701             return
2702                   aSign ?
2703                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2704                 : packFloatx80( 0, 0, 0 );
2705          case float_round_up:
2706             return
2707                   aSign ? packFloatx80( 1, 0, 0 )
2708                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2709         }
2710         return packFloatx80( aSign, 0, 0 );
2711     }
2712     lastBitMask = 1;
2713     lastBitMask <<= 0x403E - aExp;
2714     roundBitsMask = lastBitMask - 1;
2715     z = a;
2716     roundingMode = roundData->mode;
2717     if ( roundingMode == float_round_nearest_even ) {
2718         z.low += lastBitMask>>1;
2719         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2720     }
2721     else if ( roundingMode != float_round_to_zero ) {
2722         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2723             z.low += roundBitsMask;
2724         }
2725     }
2726     z.low &= ~ roundBitsMask;
2727     if ( z.low == 0 ) {
2728         ++z.high;
2729         z.low = LIT64( 0x8000000000000000 );
2730     }
2731     if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2732     return z;
2733 
2734 }
2735 
2736 /*
2737 -------------------------------------------------------------------------------
2738 Returns the result of adding the absolute values of the extended double-
2739 precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2740 negated before being returned.  `zSign' is ignored if the result is a NaN.
2741 The addition is performed according to the IEC/IEEE Standard for Binary
2742 Floating-point Arithmetic.
2743 -------------------------------------------------------------------------------
2744 */
addFloatx80Sigs(struct roundingData * roundData,floatx80 a,floatx80 b,flag zSign)2745 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2746 {
2747     int32 aExp, bExp, zExp;
2748     bits64 aSig, bSig, zSig0, zSig1;
2749     int32 expDiff;
2750 
2751     aSig = extractFloatx80Frac( a );
2752     aExp = extractFloatx80Exp( a );
2753     bSig = extractFloatx80Frac( b );
2754     bExp = extractFloatx80Exp( b );
2755     expDiff = aExp - bExp;
2756     if ( 0 < expDiff ) {
2757         if ( aExp == 0x7FFF ) {
2758             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2759             return a;
2760         }
2761         if ( bExp == 0 ) --expDiff;
2762         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2763         zExp = aExp;
2764     }
2765     else if ( expDiff < 0 ) {
2766         if ( bExp == 0x7FFF ) {
2767             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2768             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2769         }
2770         if ( aExp == 0 ) ++expDiff;
2771         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2772         zExp = bExp;
2773     }
2774     else {
2775         if ( aExp == 0x7FFF ) {
2776             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2777                 return propagateFloatx80NaN( a, b );
2778             }
2779             return a;
2780         }
2781         zSig1 = 0;
2782         zSig0 = aSig + bSig;
2783         if ( aExp == 0 ) {
2784             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2785             goto roundAndPack;
2786         }
2787         zExp = aExp;
2788         goto shiftRight1;
2789     }
2790 
2791     zSig0 = aSig + bSig;
2792 
2793     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2794  shiftRight1:
2795     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2796     zSig0 |= LIT64( 0x8000000000000000 );
2797     ++zExp;
2798  roundAndPack:
2799     return
2800         roundAndPackFloatx80(
2801             roundData, zSign, zExp, zSig0, zSig1 );
2802 
2803 }
2804 
2805 /*
2806 -------------------------------------------------------------------------------
2807 Returns the result of subtracting the absolute values of the extended
2808 double-precision floating-point values `a' and `b'.  If `zSign' is true,
2809 the difference is negated before being returned.  `zSign' is ignored if the
2810 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2811 Standard for Binary Floating-point Arithmetic.
2812 -------------------------------------------------------------------------------
2813 */
subFloatx80Sigs(struct roundingData * roundData,floatx80 a,floatx80 b,flag zSign)2814 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2815 {
2816     int32 aExp, bExp, zExp;
2817     bits64 aSig, bSig, zSig0, zSig1;
2818     int32 expDiff;
2819     floatx80 z;
2820 
2821     aSig = extractFloatx80Frac( a );
2822     aExp = extractFloatx80Exp( a );
2823     bSig = extractFloatx80Frac( b );
2824     bExp = extractFloatx80Exp( b );
2825     expDiff = aExp - bExp;
2826     if ( 0 < expDiff ) goto aExpBigger;
2827     if ( expDiff < 0 ) goto bExpBigger;
2828     if ( aExp == 0x7FFF ) {
2829         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2830             return propagateFloatx80NaN( a, b );
2831         }
2832         roundData->exception |= float_flag_invalid;
2833         z.low = floatx80_default_nan_low;
2834         z.high = floatx80_default_nan_high;
2835         z.__padding = 0;
2836         return z;
2837     }
2838     if ( aExp == 0 ) {
2839         aExp = 1;
2840         bExp = 1;
2841     }
2842     zSig1 = 0;
2843     if ( bSig < aSig ) goto aBigger;
2844     if ( aSig < bSig ) goto bBigger;
2845     return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2846  bExpBigger:
2847     if ( bExp == 0x7FFF ) {
2848         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2849         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2850     }
2851     if ( aExp == 0 ) ++expDiff;
2852     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2853  bBigger:
2854     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2855     zExp = bExp;
2856     zSign ^= 1;
2857     goto normalizeRoundAndPack;
2858  aExpBigger:
2859     if ( aExp == 0x7FFF ) {
2860         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2861         return a;
2862     }
2863     if ( bExp == 0 ) --expDiff;
2864     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2865  aBigger:
2866     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2867     zExp = aExp;
2868  normalizeRoundAndPack:
2869     return
2870         normalizeRoundAndPackFloatx80(
2871             roundData, zSign, zExp, zSig0, zSig1 );
2872 
2873 }
2874 
2875 /*
2876 -------------------------------------------------------------------------------
2877 Returns the result of adding the extended double-precision floating-point
2878 values `a' and `b'.  The operation is performed according to the IEC/IEEE
2879 Standard for Binary Floating-point Arithmetic.
2880 -------------------------------------------------------------------------------
2881 */
floatx80_add(struct roundingData * roundData,floatx80 a,floatx80 b)2882 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2883 {
2884     flag aSign, bSign;
2885 
2886     aSign = extractFloatx80Sign( a );
2887     bSign = extractFloatx80Sign( b );
2888     if ( aSign == bSign ) {
2889         return addFloatx80Sigs( roundData, a, b, aSign );
2890     }
2891     else {
2892         return subFloatx80Sigs( roundData, a, b, aSign );
2893     }
2894 
2895 }
2896 
2897 /*
2898 -------------------------------------------------------------------------------
2899 Returns the result of subtracting the extended double-precision floating-
2900 point values `a' and `b'.  The operation is performed according to the
2901 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2902 -------------------------------------------------------------------------------
2903 */
floatx80_sub(struct roundingData * roundData,floatx80 a,floatx80 b)2904 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2905 {
2906     flag aSign, bSign;
2907 
2908     aSign = extractFloatx80Sign( a );
2909     bSign = extractFloatx80Sign( b );
2910     if ( aSign == bSign ) {
2911         return subFloatx80Sigs( roundData, a, b, aSign );
2912     }
2913     else {
2914         return addFloatx80Sigs( roundData, a, b, aSign );
2915     }
2916 
2917 }
2918 
2919 /*
2920 -------------------------------------------------------------------------------
2921 Returns the result of multiplying the extended double-precision floating-
2922 point values `a' and `b'.  The operation is performed according to the
2923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2924 -------------------------------------------------------------------------------
2925 */
floatx80_mul(struct roundingData * roundData,floatx80 a,floatx80 b)2926 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2927 {
2928     flag aSign, bSign, zSign;
2929     int32 aExp, bExp, zExp;
2930     bits64 aSig, bSig, zSig0, zSig1;
2931     floatx80 z;
2932 
2933     aSig = extractFloatx80Frac( a );
2934     aExp = extractFloatx80Exp( a );
2935     aSign = extractFloatx80Sign( a );
2936     bSig = extractFloatx80Frac( b );
2937     bExp = extractFloatx80Exp( b );
2938     bSign = extractFloatx80Sign( b );
2939     zSign = aSign ^ bSign;
2940     if ( aExp == 0x7FFF ) {
2941         if (    (bits64) ( aSig<<1 )
2942              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2943             return propagateFloatx80NaN( a, b );
2944         }
2945         if ( ( bExp | bSig ) == 0 ) goto invalid;
2946         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2947     }
2948     if ( bExp == 0x7FFF ) {
2949         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2950         if ( ( aExp | aSig ) == 0 ) {
2951  invalid:
2952             roundData->exception |= float_flag_invalid;
2953             z.low = floatx80_default_nan_low;
2954             z.high = floatx80_default_nan_high;
2955             z.__padding = 0;
2956             return z;
2957         }
2958         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2959     }
2960     if ( aExp == 0 ) {
2961         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2962         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2963     }
2964     if ( bExp == 0 ) {
2965         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2966         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2967     }
2968     zExp = aExp + bExp - 0x3FFE;
2969     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2970     if ( 0 < (sbits64) zSig0 ) {
2971         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2972         --zExp;
2973     }
2974     return
2975         roundAndPackFloatx80(
2976             roundData, zSign, zExp, zSig0, zSig1 );
2977 
2978 }
2979 
2980 /*
2981 -------------------------------------------------------------------------------
2982 Returns the result of dividing the extended double-precision floating-point
2983 value `a' by the corresponding value `b'.  The operation is performed
2984 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2985 -------------------------------------------------------------------------------
2986 */
floatx80_div(struct roundingData * roundData,floatx80 a,floatx80 b)2987 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2988 {
2989     flag aSign, bSign, zSign;
2990     int32 aExp, bExp, zExp;
2991     bits64 aSig, bSig, zSig0, zSig1;
2992     bits64 rem0, rem1, rem2, term0, term1, term2;
2993     floatx80 z;
2994 
2995     aSig = extractFloatx80Frac( a );
2996     aExp = extractFloatx80Exp( a );
2997     aSign = extractFloatx80Sign( a );
2998     bSig = extractFloatx80Frac( b );
2999     bExp = extractFloatx80Exp( b );
3000     bSign = extractFloatx80Sign( b );
3001     zSign = aSign ^ bSign;
3002     if ( aExp == 0x7FFF ) {
3003         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3004         if ( bExp == 0x7FFF ) {
3005             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3006             goto invalid;
3007         }
3008         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3009     }
3010     if ( bExp == 0x7FFF ) {
3011         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3012         return packFloatx80( zSign, 0, 0 );
3013     }
3014     if ( bExp == 0 ) {
3015         if ( bSig == 0 ) {
3016             if ( ( aExp | aSig ) == 0 ) {
3017  invalid:
3018                 roundData->exception |= float_flag_invalid;
3019                 z.low = floatx80_default_nan_low;
3020                 z.high = floatx80_default_nan_high;
3021                 z.__padding = 0;
3022                 return z;
3023             }
3024             roundData->exception |= float_flag_divbyzero;
3025             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3026         }
3027         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3028     }
3029     if ( aExp == 0 ) {
3030         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3031         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3032     }
3033     zExp = aExp - bExp + 0x3FFE;
3034     rem1 = 0;
3035     if ( bSig <= aSig ) {
3036         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3037         ++zExp;
3038     }
3039     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3040     mul64To128( bSig, zSig0, &term0, &term1 );
3041     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3042     while ( (sbits64) rem0 < 0 ) {
3043         --zSig0;
3044         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3045     }
3046     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3047     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3048         mul64To128( bSig, zSig1, &term1, &term2 );
3049         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3050         while ( (sbits64) rem1 < 0 ) {
3051             --zSig1;
3052             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3053         }
3054         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3055     }
3056     return
3057         roundAndPackFloatx80(
3058             roundData, zSign, zExp, zSig0, zSig1 );
3059 
3060 }
3061 
3062 /*
3063 -------------------------------------------------------------------------------
3064 Returns the remainder of the extended double-precision floating-point value
3065 `a' with respect to the corresponding value `b'.  The operation is performed
3066 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3067 -------------------------------------------------------------------------------
3068 */
floatx80_rem(struct roundingData * roundData,floatx80 a,floatx80 b)3069 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3070 {
3071     flag aSign, bSign, zSign;
3072     int32 aExp, bExp, expDiff;
3073     bits64 aSig0, aSig1, bSig;
3074     bits64 q, term0, term1, alternateASig0, alternateASig1;
3075     floatx80 z;
3076 
3077     aSig0 = extractFloatx80Frac( a );
3078     aExp = extractFloatx80Exp( a );
3079     aSign = extractFloatx80Sign( a );
3080     bSig = extractFloatx80Frac( b );
3081     bExp = extractFloatx80Exp( b );
3082     bSign = extractFloatx80Sign( b );
3083     if ( aExp == 0x7FFF ) {
3084         if (    (bits64) ( aSig0<<1 )
3085              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3086             return propagateFloatx80NaN( a, b );
3087         }
3088         goto invalid;
3089     }
3090     if ( bExp == 0x7FFF ) {
3091         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3092         return a;
3093     }
3094     if ( bExp == 0 ) {
3095         if ( bSig == 0 ) {
3096  invalid:
3097             roundData->exception |= float_flag_invalid;
3098             z.low = floatx80_default_nan_low;
3099             z.high = floatx80_default_nan_high;
3100             z.__padding = 0;
3101             return z;
3102         }
3103         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3104     }
3105     if ( aExp == 0 ) {
3106         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3108     }
3109     bSig |= LIT64( 0x8000000000000000 );
3110     zSign = aSign;
3111     expDiff = aExp - bExp;
3112     aSig1 = 0;
3113     if ( expDiff < 0 ) {
3114         if ( expDiff < -1 ) return a;
3115         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3116         expDiff = 0;
3117     }
3118     q = ( bSig <= aSig0 );
3119     if ( q ) aSig0 -= bSig;
3120     expDiff -= 64;
3121     while ( 0 < expDiff ) {
3122         q = estimateDiv128To64( aSig0, aSig1, bSig );
3123         q = ( 2 < q ) ? q - 2 : 0;
3124         mul64To128( bSig, q, &term0, &term1 );
3125         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3127         expDiff -= 62;
3128     }
3129     expDiff += 64;
3130     if ( 0 < expDiff ) {
3131         q = estimateDiv128To64( aSig0, aSig1, bSig );
3132         q = ( 2 < q ) ? q - 2 : 0;
3133         q >>= 64 - expDiff;
3134         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3138             ++q;
3139             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3140         }
3141     }
3142     else {
3143         term1 = 0;
3144         term0 = bSig;
3145     }
3146     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3149               && ( q & 1 ) )
3150        ) {
3151         aSig0 = alternateASig0;
3152         aSig1 = alternateASig1;
3153         zSign = ! zSign;
3154     }
3155 
3156     return
3157         normalizeRoundAndPackFloatx80(
3158             roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3159 
3160 }
3161 
3162 /*
3163 -------------------------------------------------------------------------------
3164 Returns the square root of the extended double-precision floating-point
3165 value `a'.  The operation is performed according to the IEC/IEEE Standard
3166 for Binary Floating-point Arithmetic.
3167 -------------------------------------------------------------------------------
3168 */
floatx80_sqrt(struct roundingData * roundData,floatx80 a)3169 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3170 {
3171     flag aSign;
3172     int32 aExp, zExp;
3173     bits64 aSig0, aSig1, zSig0, zSig1;
3174     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3175     bits64 shiftedRem0, shiftedRem1;
3176     floatx80 z;
3177 
3178     aSig0 = extractFloatx80Frac( a );
3179     aExp = extractFloatx80Exp( a );
3180     aSign = extractFloatx80Sign( a );
3181     if ( aExp == 0x7FFF ) {
3182         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3183         if ( ! aSign ) return a;
3184         goto invalid;
3185     }
3186     if ( aSign ) {
3187         if ( ( aExp | aSig0 ) == 0 ) return a;
3188  invalid:
3189         roundData->exception |= float_flag_invalid;
3190         z.low = floatx80_default_nan_low;
3191         z.high = floatx80_default_nan_high;
3192         z.__padding = 0;
3193         return z;
3194     }
3195     if ( aExp == 0 ) {
3196         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3197         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3198     }
3199     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3200     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3201     zSig0 <<= 31;
3202     aSig1 = 0;
3203     shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3204     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3205     if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3206     shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3207     mul64To128( zSig0, zSig0, &term0, &term1 );
3208     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3209     while ( (sbits64) rem0 < 0 ) {
3210         --zSig0;
3211         shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3212         term1 |= 1;
3213         add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3214     }
3215     shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3216     zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3217     if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3218         if ( zSig1 == 0 ) zSig1 = 1;
3219         mul64To128( zSig0, zSig1, &term1, &term2 );
3220         shortShift128Left( term1, term2, 1, &term1, &term2 );
3221         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3222         mul64To128( zSig1, zSig1, &term2, &term3 );
3223         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3224         while ( (sbits64) rem1 < 0 ) {
3225             --zSig1;
3226             shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3227             term3 |= 1;
3228             add192(
3229                 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3230         }
3231         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3232     }
3233     return
3234         roundAndPackFloatx80(
3235             roundData, 0, zExp, zSig0, zSig1 );
3236 
3237 }
3238 
3239 /*
3240 -------------------------------------------------------------------------------
3241 Returns 1 if the extended double-precision floating-point value `a' is
3242 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3243 performed according to the IEC/IEEE Standard for Binary Floating-point
3244 Arithmetic.
3245 -------------------------------------------------------------------------------
3246 */
floatx80_eq(floatx80 a,floatx80 b)3247 flag floatx80_eq( floatx80 a, floatx80 b )
3248 {
3249 
3250     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3251               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3252          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3253               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3254        ) {
3255         if (    floatx80_is_signaling_nan( a )
3256              || floatx80_is_signaling_nan( b ) ) {
3257             float_raise( float_flag_invalid );
3258         }
3259         return 0;
3260     }
3261     return
3262            ( a.low == b.low )
3263         && (    ( a.high == b.high )
3264              || (    ( a.low == 0 )
3265                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3266            );
3267 
3268 }
3269 
3270 /*
3271 -------------------------------------------------------------------------------
3272 Returns 1 if the extended double-precision floating-point value `a' is
3273 less than or equal to the corresponding value `b', and 0 otherwise.  The
3274 comparison is performed according to the IEC/IEEE Standard for Binary
3275 Floating-point Arithmetic.
3276 -------------------------------------------------------------------------------
3277 */
floatx80_le(floatx80 a,floatx80 b)3278 flag floatx80_le( floatx80 a, floatx80 b )
3279 {
3280     flag aSign, bSign;
3281 
3282     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3283               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3284          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3285               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3286        ) {
3287         float_raise( float_flag_invalid );
3288         return 0;
3289     }
3290     aSign = extractFloatx80Sign( a );
3291     bSign = extractFloatx80Sign( b );
3292     if ( aSign != bSign ) {
3293         return
3294                aSign
3295             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3296                  == 0 );
3297     }
3298     return
3299           aSign ? le128( b.high, b.low, a.high, a.low )
3300         : le128( a.high, a.low, b.high, b.low );
3301 
3302 }
3303 
3304 /*
3305 -------------------------------------------------------------------------------
3306 Returns 1 if the extended double-precision floating-point value `a' is
3307 less than the corresponding value `b', and 0 otherwise.  The comparison
3308 is performed according to the IEC/IEEE Standard for Binary Floating-point
3309 Arithmetic.
3310 -------------------------------------------------------------------------------
3311 */
floatx80_lt(floatx80 a,floatx80 b)3312 flag floatx80_lt( floatx80 a, floatx80 b )
3313 {
3314     flag aSign, bSign;
3315 
3316     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3317               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3318          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3319               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3320        ) {
3321         float_raise( float_flag_invalid );
3322         return 0;
3323     }
3324     aSign = extractFloatx80Sign( a );
3325     bSign = extractFloatx80Sign( b );
3326     if ( aSign != bSign ) {
3327         return
3328                aSign
3329             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3330                  != 0 );
3331     }
3332     return
3333           aSign ? lt128( b.high, b.low, a.high, a.low )
3334         : lt128( a.high, a.low, b.high, b.low );
3335 
3336 }
3337 
3338 /*
3339 -------------------------------------------------------------------------------
3340 Returns 1 if the extended double-precision floating-point value `a' is equal
3341 to the corresponding value `b', and 0 otherwise.  The invalid exception is
3342 raised if either operand is a NaN.  Otherwise, the comparison is performed
3343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3344 -------------------------------------------------------------------------------
3345 */
floatx80_eq_signaling(floatx80 a,floatx80 b)3346 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3347 {
3348 
3349     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3350               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3351          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3352               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3353        ) {
3354         float_raise( float_flag_invalid );
3355         return 0;
3356     }
3357     return
3358            ( a.low == b.low )
3359         && (    ( a.high == b.high )
3360              || (    ( a.low == 0 )
3361                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3362            );
3363 
3364 }
3365 
3366 /*
3367 -------------------------------------------------------------------------------
3368 Returns 1 if the extended double-precision floating-point value `a' is less
3369 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3370 do not cause an exception.  Otherwise, the comparison is performed according
3371 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3372 -------------------------------------------------------------------------------
3373 */
floatx80_le_quiet(floatx80 a,floatx80 b)3374 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3375 {
3376     flag aSign, bSign;
3377 
3378     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3379               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3380          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3381               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3382        ) {
3383         /* Do nothing, even if NaN as we're quiet */
3384         return 0;
3385     }
3386     aSign = extractFloatx80Sign( a );
3387     bSign = extractFloatx80Sign( b );
3388     if ( aSign != bSign ) {
3389         return
3390                aSign
3391             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3392                  == 0 );
3393     }
3394     return
3395           aSign ? le128( b.high, b.low, a.high, a.low )
3396         : le128( a.high, a.low, b.high, b.low );
3397 
3398 }
3399 
3400 /*
3401 -------------------------------------------------------------------------------
3402 Returns 1 if the extended double-precision floating-point value `a' is less
3403 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3404 an exception.  Otherwise, the comparison is performed according to the
3405 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3406 -------------------------------------------------------------------------------
3407 */
floatx80_lt_quiet(floatx80 a,floatx80 b)3408 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3409 {
3410     flag aSign, bSign;
3411 
3412     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3413               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3414          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3415               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3416        ) {
3417         /* Do nothing, even if NaN as we're quiet */
3418         return 0;
3419     }
3420     aSign = extractFloatx80Sign( a );
3421     bSign = extractFloatx80Sign( b );
3422     if ( aSign != bSign ) {
3423         return
3424                aSign
3425             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3426                  != 0 );
3427     }
3428     return
3429           aSign ? lt128( b.high, b.low, a.high, a.low )
3430         : lt128( a.high, a.low, b.high, b.low );
3431 
3432 }
3433 
3434 #endif
3435 
3436