1 2 /*============================================================================ 3 4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 5 Package, Release 2b. 6 7 Written by John R. Hauser. This work was made possible in part by the 8 International Computer Science Institute, located at Suite 600, 1947 Center 9 Street, Berkeley, California 94704. Funding was partially provided by the 10 National Science Foundation under grant MIP-9311980. The original version 11 of this code was written as part of a project to build a fixed-point vector 12 processor in collaboration with the University of California at Berkeley, 13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 15 arithmetic/SoftFloat.html'. 16 17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 25 26 Derivative works are acceptable, even for commercial purposes, so long as 27 (1) the source code for the derivative work includes prominent notice that 28 the work is derivative, and (2) the source code includes prominent notice with 29 these four paragraphs for those parts of this code that are retained. 30 31 =============================================================================*/ 32 33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also 34 be flushed to zero. */ 35 #include "softfloat.h" 36 37 /*---------------------------------------------------------------------------- 38 | Primitive arithmetic functions, including multi-word arithmetic, and 39 | division and square root approximations. (Can be specialized to target if 40 | desired.) 41 *----------------------------------------------------------------------------*/ 42 #include "softfloat-macros.h" 43 44 /*---------------------------------------------------------------------------- 45 | Functions and definitions to determine: (1) whether tininess for underflow 46 | is detected before or after rounding by default, (2) what (if anything) 47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 49 | are propagated from function inputs to output. These details are target- 50 | specific. 51 *----------------------------------------------------------------------------*/ 52 #include "softfloat-specialize.h" 53 54 void set_float_rounding_mode(int val STATUS_PARAM) 55 { 56 STATUS(float_rounding_mode) = val; 57 } 58 59 void set_float_exception_flags(int val STATUS_PARAM) 60 { 61 STATUS(float_exception_flags) = val; 62 } 63 64 #ifdef FLOATX80 65 void set_floatx80_rounding_precision(int val STATUS_PARAM) 66 { 67 STATUS(floatx80_rounding_precision) = val; 68 } 69 #endif 70 71 /*---------------------------------------------------------------------------- 72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 73 | and 7, and returns the properly rounded 32-bit integer corresponding to the 74 | input. If `zSign' is 1, the input is negated before being converted to an 75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 76 | is simply rounded to an integer, with the inexact exception raised if the 77 | input cannot be represented exactly as an integer. However, if the fixed- 78 | point input is too large, the invalid exception is raised and the largest 79 | positive or negative integer is returned. 80 *----------------------------------------------------------------------------*/ 81 82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM) 83 { 84 int8 roundingMode; 85 flag roundNearestEven; 86 int8 roundIncrement, roundBits; 87 int32 z; 88 89 roundingMode = STATUS(float_rounding_mode); 90 roundNearestEven = ( roundingMode == float_round_nearest_even ); 91 roundIncrement = 0x40; 92 if ( ! roundNearestEven ) { 93 if ( roundingMode == float_round_to_zero ) { 94 roundIncrement = 0; 95 } 96 else { 97 roundIncrement = 0x7F; 98 if ( zSign ) { 99 if ( roundingMode == float_round_up ) roundIncrement = 0; 100 } 101 else { 102 if ( roundingMode == float_round_down ) roundIncrement = 0; 103 } 104 } 105 } 106 roundBits = absZ & 0x7F; 107 absZ = ( absZ + roundIncrement )>>7; 108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 109 z = absZ; 110 if ( zSign ) z = - z; 111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 112 float_raise( float_flag_invalid STATUS_VAR); 113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 114 } 115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 116 return z; 117 118 } 119 120 /*---------------------------------------------------------------------------- 121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 122 | `absZ1', with binary point between bits 63 and 64 (between the input words), 123 | and returns the properly rounded 64-bit integer corresponding to the input. 124 | If `zSign' is 1, the input is negated before being converted to an integer. 125 | Ordinarily, the fixed-point input is simply rounded to an integer, with 126 | the inexact exception raised if the input cannot be represented exactly as 127 | an integer. However, if the fixed-point input is too large, the invalid 128 | exception is raised and the largest positive or negative integer is 129 | returned. 130 *----------------------------------------------------------------------------*/ 131 132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM) 133 { 134 int8 roundingMode; 135 flag roundNearestEven, increment; 136 int64 z; 137 138 roundingMode = STATUS(float_rounding_mode); 139 roundNearestEven = ( roundingMode == float_round_nearest_even ); 140 increment = ( (sbits64) absZ1 < 0 ); 141 if ( ! roundNearestEven ) { 142 if ( roundingMode == float_round_to_zero ) { 143 increment = 0; 144 } 145 else { 146 if ( zSign ) { 147 increment = ( roundingMode == float_round_down ) && absZ1; 148 } 149 else { 150 increment = ( roundingMode == float_round_up ) && absZ1; 151 } 152 } 153 } 154 if ( increment ) { 155 ++absZ0; 156 if ( absZ0 == 0 ) goto overflow; 157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 158 } 159 z = absZ0; 160 if ( zSign ) z = - z; 161 if ( z && ( ( z < 0 ) ^ zSign ) ) { 162 overflow: 163 float_raise( float_flag_invalid STATUS_VAR); 164 return 165 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 166 : LIT64( 0x7FFFFFFFFFFFFFFF ); 167 } 168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 169 return z; 170 171 } 172 173 /*---------------------------------------------------------------------------- 174 | Returns the fraction bits of the single-precision floating-point value `a'. 175 *----------------------------------------------------------------------------*/ 176 177 INLINE bits32 extractFloat32Frac( float32 a ) 178 { 179 180 return float32_val(a) & 0x007FFFFF; 181 182 } 183 184 /*---------------------------------------------------------------------------- 185 | Returns the exponent bits of the single-precision floating-point value `a'. 186 *----------------------------------------------------------------------------*/ 187 188 INLINE int16 extractFloat32Exp( float32 a ) 189 { 190 191 return ( float32_val(a)>>23 ) & 0xFF; 192 193 } 194 195 /*---------------------------------------------------------------------------- 196 | Returns the sign bit of the single-precision floating-point value `a'. 197 *----------------------------------------------------------------------------*/ 198 199 INLINE flag extractFloat32Sign( float32 a ) 200 { 201 202 return float32_val(a)>>31; 203 204 } 205 206 /*---------------------------------------------------------------------------- 207 | Normalizes the subnormal single-precision floating-point value represented 208 | by the denormalized significand `aSig'. The normalized exponent and 209 | significand are stored at the locations pointed to by `zExpPtr' and 210 | `zSigPtr', respectively. 211 *----------------------------------------------------------------------------*/ 212 213 static void 214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 215 { 216 int8 shiftCount; 217 218 shiftCount = countLeadingZeros32( aSig ) - 8; 219 *zSigPtr = aSig<<shiftCount; 220 *zExpPtr = 1 - shiftCount; 221 222 } 223 224 /*---------------------------------------------------------------------------- 225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 226 | single-precision floating-point value, returning the result. After being 227 | shifted into the proper positions, the three fields are simply added 228 | together to form the result. This means that any integer portion of `zSig' 229 | will be added into the exponent. Since a properly normalized significand 230 | will have an integer portion equal to 1, the `zExp' input should be 1 less 231 | than the desired result exponent whenever `zSig' is a complete, normalized 232 | significand. 233 *----------------------------------------------------------------------------*/ 234 235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 236 { 237 238 return make_float32( 239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig); 240 241 } 242 243 /*---------------------------------------------------------------------------- 244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 245 | and significand `zSig', and returns the proper single-precision floating- 246 | point value corresponding to the abstract input. Ordinarily, the abstract 247 | value is simply rounded and packed into the single-precision format, with 248 | the inexact exception raised if the abstract input cannot be represented 249 | exactly. However, if the abstract value is too large, the overflow and 250 | inexact exceptions are raised and an infinity or maximal finite value is 251 | returned. If the abstract value is too small, the input value is rounded to 252 | a subnormal number, and the underflow and inexact exceptions are raised if 253 | the abstract input cannot be represented exactly as a subnormal single- 254 | precision floating-point number. 255 | The input significand `zSig' has its binary point between bits 30 256 | and 29, which is 7 bits to the left of the usual location. This shifted 257 | significand must be normalized or smaller. If `zSig' is not normalized, 258 | `zExp' must be 0; in that case, the result returned is a subnormal number, 259 | and it must not require rounding. In the usual case that `zSig' is 260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 261 | The handling of underflow and overflow follows the IEC/IEEE Standard for 262 | Binary Floating-Point Arithmetic. 263 *----------------------------------------------------------------------------*/ 264 265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 266 { 267 int8 roundingMode; 268 flag roundNearestEven; 269 int8 roundIncrement, roundBits; 270 flag isTiny; 271 272 roundingMode = STATUS(float_rounding_mode); 273 roundNearestEven = ( roundingMode == float_round_nearest_even ); 274 roundIncrement = 0x40; 275 if ( ! roundNearestEven ) { 276 if ( roundingMode == float_round_to_zero ) { 277 roundIncrement = 0; 278 } 279 else { 280 roundIncrement = 0x7F; 281 if ( zSign ) { 282 if ( roundingMode == float_round_up ) roundIncrement = 0; 283 } 284 else { 285 if ( roundingMode == float_round_down ) roundIncrement = 0; 286 } 287 } 288 } 289 roundBits = zSig & 0x7F; 290 if ( 0xFD <= (bits16) zExp ) { 291 if ( ( 0xFD < zExp ) 292 || ( ( zExp == 0xFD ) 293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 294 ) { 295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 297 } 298 if ( zExp < 0 ) { 299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); 300 isTiny = 301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 302 || ( zExp < -1 ) 303 || ( zSig + roundIncrement < 0x80000000 ); 304 shift32RightJamming( zSig, - zExp, &zSig ); 305 zExp = 0; 306 roundBits = zSig & 0x7F; 307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 308 } 309 } 310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 311 zSig = ( zSig + roundIncrement )>>7; 312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 313 if ( zSig == 0 ) zExp = 0; 314 return packFloat32( zSign, zExp, zSig ); 315 316 } 317 318 /*---------------------------------------------------------------------------- 319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 320 | and significand `zSig', and returns the proper single-precision floating- 321 | point value corresponding to the abstract input. This routine is just like 322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 324 | floating-point exponent. 325 *----------------------------------------------------------------------------*/ 326 327 static float32 328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 329 { 330 int8 shiftCount; 331 332 shiftCount = countLeadingZeros32( zSig ) - 1; 333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 334 335 } 336 337 /*---------------------------------------------------------------------------- 338 | Returns the fraction bits of the double-precision floating-point value `a'. 339 *----------------------------------------------------------------------------*/ 340 341 INLINE bits64 extractFloat64Frac( float64 a ) 342 { 343 344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 345 346 } 347 348 /*---------------------------------------------------------------------------- 349 | Returns the exponent bits of the double-precision floating-point value `a'. 350 *----------------------------------------------------------------------------*/ 351 352 INLINE int16 extractFloat64Exp( float64 a ) 353 { 354 355 return ( float64_val(a)>>52 ) & 0x7FF; 356 357 } 358 359 /*---------------------------------------------------------------------------- 360 | Returns the sign bit of the double-precision floating-point value `a'. 361 *----------------------------------------------------------------------------*/ 362 363 INLINE flag extractFloat64Sign( float64 a ) 364 { 365 366 return float64_val(a)>>63; 367 368 } 369 370 /*---------------------------------------------------------------------------- 371 | Normalizes the subnormal double-precision floating-point value represented 372 | by the denormalized significand `aSig'. The normalized exponent and 373 | significand are stored at the locations pointed to by `zExpPtr' and 374 | `zSigPtr', respectively. 375 *----------------------------------------------------------------------------*/ 376 377 static void 378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 379 { 380 int8 shiftCount; 381 382 shiftCount = countLeadingZeros64( aSig ) - 11; 383 *zSigPtr = aSig<<shiftCount; 384 *zExpPtr = 1 - shiftCount; 385 386 } 387 388 /*---------------------------------------------------------------------------- 389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 390 | double-precision floating-point value, returning the result. After being 391 | shifted into the proper positions, the three fields are simply added 392 | together to form the result. This means that any integer portion of `zSig' 393 | will be added into the exponent. Since a properly normalized significand 394 | will have an integer portion equal to 1, the `zExp' input should be 1 less 395 | than the desired result exponent whenever `zSig' is a complete, normalized 396 | significand. 397 *----------------------------------------------------------------------------*/ 398 399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 400 { 401 402 return make_float64( 403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig); 404 405 } 406 407 /*---------------------------------------------------------------------------- 408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 409 | and significand `zSig', and returns the proper double-precision floating- 410 | point value corresponding to the abstract input. Ordinarily, the abstract 411 | value is simply rounded and packed into the double-precision format, with 412 | the inexact exception raised if the abstract input cannot be represented 413 | exactly. However, if the abstract value is too large, the overflow and 414 | inexact exceptions are raised and an infinity or maximal finite value is 415 | returned. If the abstract value is too small, the input value is rounded 416 | to a subnormal number, and the underflow and inexact exceptions are raised 417 | if the abstract input cannot be represented exactly as a subnormal double- 418 | precision floating-point number. 419 | The input significand `zSig' has its binary point between bits 62 420 | and 61, which is 10 bits to the left of the usual location. This shifted 421 | significand must be normalized or smaller. If `zSig' is not normalized, 422 | `zExp' must be 0; in that case, the result returned is a subnormal number, 423 | and it must not require rounding. In the usual case that `zSig' is 424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 425 | The handling of underflow and overflow follows the IEC/IEEE Standard for 426 | Binary Floating-Point Arithmetic. 427 *----------------------------------------------------------------------------*/ 428 429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 430 { 431 int8 roundingMode; 432 flag roundNearestEven; 433 int16 roundIncrement, roundBits; 434 flag isTiny; 435 436 roundingMode = STATUS(float_rounding_mode); 437 roundNearestEven = ( roundingMode == float_round_nearest_even ); 438 roundIncrement = 0x200; 439 if ( ! roundNearestEven ) { 440 if ( roundingMode == float_round_to_zero ) { 441 roundIncrement = 0; 442 } 443 else { 444 roundIncrement = 0x3FF; 445 if ( zSign ) { 446 if ( roundingMode == float_round_up ) roundIncrement = 0; 447 } 448 else { 449 if ( roundingMode == float_round_down ) roundIncrement = 0; 450 } 451 } 452 } 453 roundBits = zSig & 0x3FF; 454 if ( 0x7FD <= (bits16) zExp ) { 455 if ( ( 0x7FD < zExp ) 456 || ( ( zExp == 0x7FD ) 457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 458 ) { 459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 461 } 462 if ( zExp < 0 ) { 463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); 464 isTiny = 465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 466 || ( zExp < -1 ) 467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 468 shift64RightJamming( zSig, - zExp, &zSig ); 469 zExp = 0; 470 roundBits = zSig & 0x3FF; 471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 472 } 473 } 474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 475 zSig = ( zSig + roundIncrement )>>10; 476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 477 if ( zSig == 0 ) zExp = 0; 478 return packFloat64( zSign, zExp, zSig ); 479 480 } 481 482 /*---------------------------------------------------------------------------- 483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 484 | and significand `zSig', and returns the proper double-precision floating- 485 | point value corresponding to the abstract input. This routine is just like 486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 488 | floating-point exponent. 489 *----------------------------------------------------------------------------*/ 490 491 static float64 492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 493 { 494 int8 shiftCount; 495 496 shiftCount = countLeadingZeros64( zSig ) - 1; 497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 498 499 } 500 501 #ifdef FLOATX80 502 503 /*---------------------------------------------------------------------------- 504 | Returns the fraction bits of the extended double-precision floating-point 505 | value `a'. 506 *----------------------------------------------------------------------------*/ 507 508 INLINE bits64 extractFloatx80Frac( floatx80 a ) 509 { 510 511 return a.low; 512 513 } 514 515 /*---------------------------------------------------------------------------- 516 | Returns the exponent bits of the extended double-precision floating-point 517 | value `a'. 518 *----------------------------------------------------------------------------*/ 519 520 INLINE int32 extractFloatx80Exp( floatx80 a ) 521 { 522 523 return a.high & 0x7FFF; 524 525 } 526 527 /*---------------------------------------------------------------------------- 528 | Returns the sign bit of the extended double-precision floating-point value 529 | `a'. 530 *----------------------------------------------------------------------------*/ 531 532 INLINE flag extractFloatx80Sign( floatx80 a ) 533 { 534 535 return a.high>>15; 536 537 } 538 539 /*---------------------------------------------------------------------------- 540 | Normalizes the subnormal extended double-precision floating-point value 541 | represented by the denormalized significand `aSig'. The normalized exponent 542 | and significand are stored at the locations pointed to by `zExpPtr' and 543 | `zSigPtr', respectively. 544 *----------------------------------------------------------------------------*/ 545 546 static void 547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 548 { 549 int8 shiftCount; 550 551 shiftCount = countLeadingZeros64( aSig ); 552 *zSigPtr = aSig<<shiftCount; 553 *zExpPtr = 1 - shiftCount; 554 555 } 556 557 /*---------------------------------------------------------------------------- 558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 559 | extended double-precision floating-point value, returning the result. 560 *----------------------------------------------------------------------------*/ 561 562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 563 { 564 floatx80 z; 565 566 z.low = zSig; 567 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 568 return z; 569 570 } 571 572 /*---------------------------------------------------------------------------- 573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 574 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 575 | and returns the proper extended double-precision floating-point value 576 | corresponding to the abstract input. Ordinarily, the abstract value is 577 | rounded and packed into the extended double-precision format, with the 578 | inexact exception raised if the abstract input cannot be represented 579 | exactly. However, if the abstract value is too large, the overflow and 580 | inexact exceptions are raised and an infinity or maximal finite value is 581 | returned. If the abstract value is too small, the input value is rounded to 582 | a subnormal number, and the underflow and inexact exceptions are raised if 583 | the abstract input cannot be represented exactly as a subnormal extended 584 | double-precision floating-point number. 585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 586 | number of bits as single or double precision, respectively. Otherwise, the 587 | result is rounded to the full precision of the extended double-precision 588 | format. 589 | The input significand must be normalized or smaller. If the input 590 | significand is not normalized, `zExp' must be 0; in that case, the result 591 | returned is a subnormal number, and it must not require rounding. The 592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 593 | Floating-Point Arithmetic. 594 *----------------------------------------------------------------------------*/ 595 596 static floatx80 597 roundAndPackFloatx80( 598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 599 STATUS_PARAM) 600 { 601 int8 roundingMode; 602 flag roundNearestEven, increment, isTiny; 603 int64 roundIncrement, roundMask, roundBits; 604 605 roundingMode = STATUS(float_rounding_mode); 606 roundNearestEven = ( roundingMode == float_round_nearest_even ); 607 if ( roundingPrecision == 80 ) goto precision80; 608 if ( roundingPrecision == 64 ) { 609 roundIncrement = LIT64( 0x0000000000000400 ); 610 roundMask = LIT64( 0x00000000000007FF ); 611 } 612 else if ( roundingPrecision == 32 ) { 613 roundIncrement = LIT64( 0x0000008000000000 ); 614 roundMask = LIT64( 0x000000FFFFFFFFFF ); 615 } 616 else { 617 goto precision80; 618 } 619 zSig0 |= ( zSig1 != 0 ); 620 if ( ! roundNearestEven ) { 621 if ( roundingMode == float_round_to_zero ) { 622 roundIncrement = 0; 623 } 624 else { 625 roundIncrement = roundMask; 626 if ( zSign ) { 627 if ( roundingMode == float_round_up ) roundIncrement = 0; 628 } 629 else { 630 if ( roundingMode == float_round_down ) roundIncrement = 0; 631 } 632 } 633 } 634 roundBits = zSig0 & roundMask; 635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 636 if ( ( 0x7FFE < zExp ) 637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 638 ) { 639 goto overflow; 640 } 641 if ( zExp <= 0 ) { 642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 ); 643 isTiny = 644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 645 || ( zExp < 0 ) 646 || ( zSig0 <= zSig0 + roundIncrement ); 647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 648 zExp = 0; 649 roundBits = zSig0 & roundMask; 650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 652 zSig0 += roundIncrement; 653 if ( (sbits64) zSig0 < 0 ) zExp = 1; 654 roundIncrement = roundMask + 1; 655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 656 roundMask |= roundIncrement; 657 } 658 zSig0 &= ~ roundMask; 659 return packFloatx80( zSign, zExp, zSig0 ); 660 } 661 } 662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 663 zSig0 += roundIncrement; 664 if ( zSig0 < roundIncrement ) { 665 ++zExp; 666 zSig0 = LIT64( 0x8000000000000000 ); 667 } 668 roundIncrement = roundMask + 1; 669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 670 roundMask |= roundIncrement; 671 } 672 zSig0 &= ~ roundMask; 673 if ( zSig0 == 0 ) zExp = 0; 674 return packFloatx80( zSign, zExp, zSig0 ); 675 precision80: 676 increment = ( (sbits64) zSig1 < 0 ); 677 if ( ! roundNearestEven ) { 678 if ( roundingMode == float_round_to_zero ) { 679 increment = 0; 680 } 681 else { 682 if ( zSign ) { 683 increment = ( roundingMode == float_round_down ) && zSig1; 684 } 685 else { 686 increment = ( roundingMode == float_round_up ) && zSig1; 687 } 688 } 689 } 690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 691 if ( ( 0x7FFE < zExp ) 692 || ( ( zExp == 0x7FFE ) 693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 694 && increment 695 ) 696 ) { 697 roundMask = 0; 698 overflow: 699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 700 if ( ( roundingMode == float_round_to_zero ) 701 || ( zSign && ( roundingMode == float_round_up ) ) 702 || ( ! zSign && ( roundingMode == float_round_down ) ) 703 ) { 704 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 705 } 706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 707 } 708 if ( zExp <= 0 ) { 709 isTiny = 710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 711 || ( zExp < 0 ) 712 || ! increment 713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 715 zExp = 0; 716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 718 if ( roundNearestEven ) { 719 increment = ( (sbits64) zSig1 < 0 ); 720 } 721 else { 722 if ( zSign ) { 723 increment = ( roundingMode == float_round_down ) && zSig1; 724 } 725 else { 726 increment = ( roundingMode == float_round_up ) && zSig1; 727 } 728 } 729 if ( increment ) { 730 ++zSig0; 731 zSig0 &= 732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 733 if ( (sbits64) zSig0 < 0 ) zExp = 1; 734 } 735 return packFloatx80( zSign, zExp, zSig0 ); 736 } 737 } 738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 739 if ( increment ) { 740 ++zSig0; 741 if ( zSig0 == 0 ) { 742 ++zExp; 743 zSig0 = LIT64( 0x8000000000000000 ); 744 } 745 else { 746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 747 } 748 } 749 else { 750 if ( zSig0 == 0 ) zExp = 0; 751 } 752 return packFloatx80( zSign, zExp, zSig0 ); 753 754 } 755 756 /*---------------------------------------------------------------------------- 757 | Takes an abstract floating-point value having sign `zSign', exponent 758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 759 | and returns the proper extended double-precision floating-point value 760 | corresponding to the abstract input. This routine is just like 761 | `roundAndPackFloatx80' except that the input significand does not have to be 762 | normalized. 763 *----------------------------------------------------------------------------*/ 764 765 static floatx80 766 normalizeRoundAndPackFloatx80( 767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 768 STATUS_PARAM) 769 { 770 int8 shiftCount; 771 772 if ( zSig0 == 0 ) { 773 zSig0 = zSig1; 774 zSig1 = 0; 775 zExp -= 64; 776 } 777 shiftCount = countLeadingZeros64( zSig0 ); 778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 779 zExp -= shiftCount; 780 return 781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 782 783 } 784 785 #endif 786 787 #ifdef FLOAT128 788 789 /*---------------------------------------------------------------------------- 790 | Returns the least-significant 64 fraction bits of the quadruple-precision 791 | floating-point value `a'. 792 *----------------------------------------------------------------------------*/ 793 794 INLINE bits64 extractFloat128Frac1( float128 a ) 795 { 796 797 return a.low; 798 799 } 800 801 /*---------------------------------------------------------------------------- 802 | Returns the most-significant 48 fraction bits of the quadruple-precision 803 | floating-point value `a'. 804 *----------------------------------------------------------------------------*/ 805 806 INLINE bits64 extractFloat128Frac0( float128 a ) 807 { 808 809 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 810 811 } 812 813 /*---------------------------------------------------------------------------- 814 | Returns the exponent bits of the quadruple-precision floating-point value 815 | `a'. 816 *----------------------------------------------------------------------------*/ 817 818 INLINE int32 extractFloat128Exp( float128 a ) 819 { 820 821 return ( a.high>>48 ) & 0x7FFF; 822 823 } 824 825 /*---------------------------------------------------------------------------- 826 | Returns the sign bit of the quadruple-precision floating-point value `a'. 827 *----------------------------------------------------------------------------*/ 828 829 INLINE flag extractFloat128Sign( float128 a ) 830 { 831 832 return a.high>>63; 833 834 } 835 836 /*---------------------------------------------------------------------------- 837 | Normalizes the subnormal quadruple-precision floating-point value 838 | represented by the denormalized significand formed by the concatenation of 839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 841 | significand are stored at the location pointed to by `zSig0Ptr', and the 842 | least significant 64 bits of the normalized significand are stored at the 843 | location pointed to by `zSig1Ptr'. 844 *----------------------------------------------------------------------------*/ 845 846 static void 847 normalizeFloat128Subnormal( 848 bits64 aSig0, 849 bits64 aSig1, 850 int32 *zExpPtr, 851 bits64 *zSig0Ptr, 852 bits64 *zSig1Ptr 853 ) 854 { 855 int8 shiftCount; 856 857 if ( aSig0 == 0 ) { 858 shiftCount = countLeadingZeros64( aSig1 ) - 15; 859 if ( shiftCount < 0 ) { 860 *zSig0Ptr = aSig1>>( - shiftCount ); 861 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 862 } 863 else { 864 *zSig0Ptr = aSig1<<shiftCount; 865 *zSig1Ptr = 0; 866 } 867 *zExpPtr = - shiftCount - 63; 868 } 869 else { 870 shiftCount = countLeadingZeros64( aSig0 ) - 15; 871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 872 *zExpPtr = 1 - shiftCount; 873 } 874 875 } 876 877 /*---------------------------------------------------------------------------- 878 | Packs the sign `zSign', the exponent `zExp', and the significand formed 879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 880 | floating-point value, returning the result. After being shifted into the 881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 882 | added together to form the most significant 32 bits of the result. This 883 | means that any integer portion of `zSig0' will be added into the exponent. 884 | Since a properly normalized significand will have an integer portion equal 885 | to 1, the `zExp' input should be 1 less than the desired result exponent 886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 887 | significand. 888 *----------------------------------------------------------------------------*/ 889 890 INLINE float128 891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 892 { 893 float128 z; 894 895 z.low = zSig1; 896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 897 return z; 898 899 } 900 901 /*---------------------------------------------------------------------------- 902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 903 | and extended significand formed by the concatenation of `zSig0', `zSig1', 904 | and `zSig2', and returns the proper quadruple-precision floating-point value 905 | corresponding to the abstract input. Ordinarily, the abstract value is 906 | simply rounded and packed into the quadruple-precision format, with the 907 | inexact exception raised if the abstract input cannot be represented 908 | exactly. However, if the abstract value is too large, the overflow and 909 | inexact exceptions are raised and an infinity or maximal finite value is 910 | returned. If the abstract value is too small, the input value is rounded to 911 | a subnormal number, and the underflow and inexact exceptions are raised if 912 | the abstract input cannot be represented exactly as a subnormal quadruple- 913 | precision floating-point number. 914 | The input significand must be normalized or smaller. If the input 915 | significand is not normalized, `zExp' must be 0; in that case, the result 916 | returned is a subnormal number, and it must not require rounding. In the 917 | usual case that the input significand is normalized, `zExp' must be 1 less 918 | than the ``true'' floating-point exponent. The handling of underflow and 919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 920 *----------------------------------------------------------------------------*/ 921 922 static float128 923 roundAndPackFloat128( 924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM) 925 { 926 int8 roundingMode; 927 flag roundNearestEven, increment, isTiny; 928 929 roundingMode = STATUS(float_rounding_mode); 930 roundNearestEven = ( roundingMode == float_round_nearest_even ); 931 increment = ( (sbits64) zSig2 < 0 ); 932 if ( ! roundNearestEven ) { 933 if ( roundingMode == float_round_to_zero ) { 934 increment = 0; 935 } 936 else { 937 if ( zSign ) { 938 increment = ( roundingMode == float_round_down ) && zSig2; 939 } 940 else { 941 increment = ( roundingMode == float_round_up ) && zSig2; 942 } 943 } 944 } 945 if ( 0x7FFD <= (bits32) zExp ) { 946 if ( ( 0x7FFD < zExp ) 947 || ( ( zExp == 0x7FFD ) 948 && eq128( 949 LIT64( 0x0001FFFFFFFFFFFF ), 950 LIT64( 0xFFFFFFFFFFFFFFFF ), 951 zSig0, 952 zSig1 953 ) 954 && increment 955 ) 956 ) { 957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 958 if ( ( roundingMode == float_round_to_zero ) 959 || ( zSign && ( roundingMode == float_round_up ) ) 960 || ( ! zSign && ( roundingMode == float_round_down ) ) 961 ) { 962 return 963 packFloat128( 964 zSign, 965 0x7FFE, 966 LIT64( 0x0000FFFFFFFFFFFF ), 967 LIT64( 0xFFFFFFFFFFFFFFFF ) 968 ); 969 } 970 return packFloat128( zSign, 0x7FFF, 0, 0 ); 971 } 972 if ( zExp < 0 ) { 973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); 974 isTiny = 975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 976 || ( zExp < -1 ) 977 || ! increment 978 || lt128( 979 zSig0, 980 zSig1, 981 LIT64( 0x0001FFFFFFFFFFFF ), 982 LIT64( 0xFFFFFFFFFFFFFFFF ) 983 ); 984 shift128ExtraRightJamming( 985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 986 zExp = 0; 987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 988 if ( roundNearestEven ) { 989 increment = ( (sbits64) zSig2 < 0 ); 990 } 991 else { 992 if ( zSign ) { 993 increment = ( roundingMode == float_round_down ) && zSig2; 994 } 995 else { 996 increment = ( roundingMode == float_round_up ) && zSig2; 997 } 998 } 999 } 1000 } 1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 1002 if ( increment ) { 1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1005 } 1006 else { 1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1008 } 1009 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1010 1011 } 1012 1013 /*---------------------------------------------------------------------------- 1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1016 | returns the proper quadruple-precision floating-point value corresponding 1017 | to the abstract input. This routine is just like `roundAndPackFloat128' 1018 | except that the input significand has fewer bits and does not have to be 1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1020 | point exponent. 1021 *----------------------------------------------------------------------------*/ 1022 1023 static float128 1024 normalizeRoundAndPackFloat128( 1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM) 1026 { 1027 int8 shiftCount; 1028 bits64 zSig2; 1029 1030 if ( zSig0 == 0 ) { 1031 zSig0 = zSig1; 1032 zSig1 = 0; 1033 zExp -= 64; 1034 } 1035 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1036 if ( 0 <= shiftCount ) { 1037 zSig2 = 0; 1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1039 } 1040 else { 1041 shift128ExtraRightJamming( 1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1043 } 1044 zExp -= shiftCount; 1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1046 1047 } 1048 1049 #endif 1050 1051 /*---------------------------------------------------------------------------- 1052 | Returns the result of converting the 32-bit two's complement integer `a' 1053 | to the single-precision floating-point format. The conversion is performed 1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1055 *----------------------------------------------------------------------------*/ 1056 1057 float32 int32_to_float32( int32 a STATUS_PARAM ) 1058 { 1059 flag zSign; 1060 1061 if ( a == 0 ) return float32_zero; 1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1063 zSign = ( a < 0 ); 1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1065 1066 } 1067 1068 /*---------------------------------------------------------------------------- 1069 | Returns the result of converting the 32-bit two's complement integer `a' 1070 | to the double-precision floating-point format. The conversion is performed 1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1072 *----------------------------------------------------------------------------*/ 1073 1074 float64 int32_to_float64( int32 a STATUS_PARAM ) 1075 { 1076 flag zSign; 1077 uint32 absA; 1078 int8 shiftCount; 1079 bits64 zSig; 1080 1081 if ( a == 0 ) return float64_zero; 1082 zSign = ( a < 0 ); 1083 absA = zSign ? - a : a; 1084 shiftCount = countLeadingZeros32( absA ) + 21; 1085 zSig = absA; 1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1087 1088 } 1089 1090 #ifdef FLOATX80 1091 1092 /*---------------------------------------------------------------------------- 1093 | Returns the result of converting the 32-bit two's complement integer `a' 1094 | to the extended double-precision floating-point format. The conversion 1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1096 | Arithmetic. 1097 *----------------------------------------------------------------------------*/ 1098 1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) 1100 { 1101 flag zSign; 1102 uint32 absA; 1103 int8 shiftCount; 1104 bits64 zSig; 1105 1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1107 zSign = ( a < 0 ); 1108 absA = zSign ? - a : a; 1109 shiftCount = countLeadingZeros32( absA ) + 32; 1110 zSig = absA; 1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1112 1113 } 1114 1115 #endif 1116 1117 #ifdef FLOAT128 1118 1119 /*---------------------------------------------------------------------------- 1120 | Returns the result of converting the 32-bit two's complement integer `a' to 1121 | the quadruple-precision floating-point format. The conversion is performed 1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1123 *----------------------------------------------------------------------------*/ 1124 1125 float128 int32_to_float128( int32 a STATUS_PARAM ) 1126 { 1127 flag zSign; 1128 uint32 absA; 1129 int8 shiftCount; 1130 bits64 zSig0; 1131 1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1133 zSign = ( a < 0 ); 1134 absA = zSign ? - a : a; 1135 shiftCount = countLeadingZeros32( absA ) + 17; 1136 zSig0 = absA; 1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1138 1139 } 1140 1141 #endif 1142 1143 /*---------------------------------------------------------------------------- 1144 | Returns the result of converting the 64-bit two's complement integer `a' 1145 | to the single-precision floating-point format. The conversion is performed 1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1147 *----------------------------------------------------------------------------*/ 1148 1149 float32 int64_to_float32( int64 a STATUS_PARAM ) 1150 { 1151 flag zSign; 1152 uint64 absA; 1153 int8 shiftCount; 1154 1155 if ( a == 0 ) return float32_zero; 1156 zSign = ( a < 0 ); 1157 absA = zSign ? - a : a; 1158 shiftCount = countLeadingZeros64( absA ) - 40; 1159 if ( 0 <= shiftCount ) { 1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1161 } 1162 else { 1163 shiftCount += 7; 1164 if ( shiftCount < 0 ) { 1165 shift64RightJamming( absA, - shiftCount, &absA ); 1166 } 1167 else { 1168 absA <<= shiftCount; 1169 } 1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1171 } 1172 1173 } 1174 1175 float32 uint64_to_float32( uint64 a STATUS_PARAM ) 1176 { 1177 int8 shiftCount; 1178 1179 if ( a == 0 ) return float32_zero; 1180 shiftCount = countLeadingZeros64( a ) - 40; 1181 if ( 0 <= shiftCount ) { 1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); 1183 } 1184 else { 1185 shiftCount += 7; 1186 if ( shiftCount < 0 ) { 1187 shift64RightJamming( a, - shiftCount, &a ); 1188 } 1189 else { 1190 a <<= shiftCount; 1191 } 1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); 1193 } 1194 } 1195 1196 /*---------------------------------------------------------------------------- 1197 | Returns the result of converting the 64-bit two's complement integer `a' 1198 | to the double-precision floating-point format. The conversion is performed 1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1200 *----------------------------------------------------------------------------*/ 1201 1202 float64 int64_to_float64( int64 a STATUS_PARAM ) 1203 { 1204 flag zSign; 1205 1206 if ( a == 0 ) return float64_zero; 1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1208 return packFloat64( 1, 0x43E, 0 ); 1209 } 1210 zSign = ( a < 0 ); 1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1212 1213 } 1214 1215 float64 uint64_to_float64( uint64 a STATUS_PARAM ) 1216 { 1217 if ( a == 0 ) return float64_zero; 1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); 1219 1220 } 1221 1222 #ifdef FLOATX80 1223 1224 /*---------------------------------------------------------------------------- 1225 | Returns the result of converting the 64-bit two's complement integer `a' 1226 | to the extended double-precision floating-point format. The conversion 1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1228 | Arithmetic. 1229 *----------------------------------------------------------------------------*/ 1230 1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) 1232 { 1233 flag zSign; 1234 uint64 absA; 1235 int8 shiftCount; 1236 1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1238 zSign = ( a < 0 ); 1239 absA = zSign ? - a : a; 1240 shiftCount = countLeadingZeros64( absA ); 1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1242 1243 } 1244 1245 #endif 1246 1247 #ifdef FLOAT128 1248 1249 /*---------------------------------------------------------------------------- 1250 | Returns the result of converting the 64-bit two's complement integer `a' to 1251 | the quadruple-precision floating-point format. The conversion is performed 1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1253 *----------------------------------------------------------------------------*/ 1254 1255 float128 int64_to_float128( int64 a STATUS_PARAM ) 1256 { 1257 flag zSign; 1258 uint64 absA; 1259 int8 shiftCount; 1260 int32 zExp; 1261 bits64 zSig0, zSig1; 1262 1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1264 zSign = ( a < 0 ); 1265 absA = zSign ? - a : a; 1266 shiftCount = countLeadingZeros64( absA ) + 49; 1267 zExp = 0x406E - shiftCount; 1268 if ( 64 <= shiftCount ) { 1269 zSig1 = 0; 1270 zSig0 = absA; 1271 shiftCount -= 64; 1272 } 1273 else { 1274 zSig1 = absA; 1275 zSig0 = 0; 1276 } 1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1278 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1279 1280 } 1281 1282 #endif 1283 1284 /*---------------------------------------------------------------------------- 1285 | Returns the result of converting the single-precision floating-point value 1286 | `a' to the 32-bit two's complement integer format. The conversion is 1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1288 | Arithmetic---which means in particular that the conversion is rounded 1289 | according to the current rounding mode. If `a' is a NaN, the largest 1290 | positive integer is returned. Otherwise, if the conversion overflows, the 1291 | largest integer with the same sign as `a' is returned. 1292 *----------------------------------------------------------------------------*/ 1293 1294 int32 float32_to_int32( float32 a STATUS_PARAM ) 1295 { 1296 flag aSign; 1297 int16 aExp, shiftCount; 1298 bits32 aSig; 1299 bits64 aSig64; 1300 1301 aSig = extractFloat32Frac( a ); 1302 aExp = extractFloat32Exp( a ); 1303 aSign = extractFloat32Sign( a ); 1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1305 if ( aExp ) aSig |= 0x00800000; 1306 shiftCount = 0xAF - aExp; 1307 aSig64 = aSig; 1308 aSig64 <<= 32; 1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1311 1312 } 1313 1314 /*---------------------------------------------------------------------------- 1315 | Returns the result of converting the single-precision floating-point value 1316 | `a' to the 32-bit two's complement integer format. The conversion is 1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1318 | Arithmetic, except that the conversion is always rounded toward zero. 1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1320 | the conversion overflows, the largest integer with the same sign as `a' is 1321 | returned. 1322 *----------------------------------------------------------------------------*/ 1323 1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1325 { 1326 flag aSign; 1327 int16 aExp, shiftCount; 1328 bits32 aSig; 1329 int32 z; 1330 1331 aSig = extractFloat32Frac( a ); 1332 aExp = extractFloat32Exp( a ); 1333 aSign = extractFloat32Sign( a ); 1334 shiftCount = aExp - 0x9E; 1335 if ( 0 <= shiftCount ) { 1336 if ( float32_val(a) != 0xCF000000 ) { 1337 float_raise( float_flag_invalid STATUS_VAR); 1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1339 } 1340 return (sbits32) 0x80000000; 1341 } 1342 else if ( aExp <= 0x7E ) { 1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1344 return 0; 1345 } 1346 aSig = ( aSig | 0x00800000 )<<8; 1347 z = aSig>>( - shiftCount ); 1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1349 STATUS(float_exception_flags) |= float_flag_inexact; 1350 } 1351 if ( aSign ) z = - z; 1352 return z; 1353 1354 } 1355 1356 /*---------------------------------------------------------------------------- 1357 | Returns the result of converting the single-precision floating-point value 1358 | `a' to the 16-bit two's complement integer format. The conversion is 1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1360 | Arithmetic, except that the conversion is always rounded toward zero. 1361 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1362 | the conversion overflows, the largest integer with the same sign as `a' is 1363 | returned. 1364 *----------------------------------------------------------------------------*/ 1365 1366 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM ) 1367 { 1368 flag aSign; 1369 int16 aExp, shiftCount; 1370 bits32 aSig; 1371 int32 z; 1372 1373 aSig = extractFloat32Frac( a ); 1374 aExp = extractFloat32Exp( a ); 1375 aSign = extractFloat32Sign( a ); 1376 shiftCount = aExp - 0x8E; 1377 if ( 0 <= shiftCount ) { 1378 if ( float32_val(a) != 0xC7000000 ) { 1379 float_raise( float_flag_invalid STATUS_VAR); 1380 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1381 return 0x7FFF; 1382 } 1383 } 1384 return (sbits32) 0xffff8000; 1385 } 1386 else if ( aExp <= 0x7E ) { 1387 if ( aExp | aSig ) { 1388 STATUS(float_exception_flags) |= float_flag_inexact; 1389 } 1390 return 0; 1391 } 1392 shiftCount -= 0x10; 1393 aSig = ( aSig | 0x00800000 )<<8; 1394 z = aSig>>( - shiftCount ); 1395 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1396 STATUS(float_exception_flags) |= float_flag_inexact; 1397 } 1398 if ( aSign ) { 1399 z = - z; 1400 } 1401 return z; 1402 1403 } 1404 1405 /*---------------------------------------------------------------------------- 1406 | Returns the result of converting the single-precision floating-point value 1407 | `a' to the 64-bit two's complement integer format. The conversion is 1408 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1409 | Arithmetic---which means in particular that the conversion is rounded 1410 | according to the current rounding mode. If `a' is a NaN, the largest 1411 | positive integer is returned. Otherwise, if the conversion overflows, the 1412 | largest integer with the same sign as `a' is returned. 1413 *----------------------------------------------------------------------------*/ 1414 1415 int64 float32_to_int64( float32 a STATUS_PARAM ) 1416 { 1417 flag aSign; 1418 int16 aExp, shiftCount; 1419 bits32 aSig; 1420 bits64 aSig64, aSigExtra; 1421 1422 aSig = extractFloat32Frac( a ); 1423 aExp = extractFloat32Exp( a ); 1424 aSign = extractFloat32Sign( a ); 1425 shiftCount = 0xBE - aExp; 1426 if ( shiftCount < 0 ) { 1427 float_raise( float_flag_invalid STATUS_VAR); 1428 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1429 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1430 } 1431 return (sbits64) LIT64( 0x8000000000000000 ); 1432 } 1433 if ( aExp ) aSig |= 0x00800000; 1434 aSig64 = aSig; 1435 aSig64 <<= 40; 1436 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1437 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1438 1439 } 1440 1441 /*---------------------------------------------------------------------------- 1442 | Returns the result of converting the single-precision floating-point value 1443 | `a' to the 64-bit two's complement integer format. The conversion is 1444 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1445 | Arithmetic, except that the conversion is always rounded toward zero. If 1446 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1447 | conversion overflows, the largest integer with the same sign as `a' is 1448 | returned. 1449 *----------------------------------------------------------------------------*/ 1450 1451 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1452 { 1453 flag aSign; 1454 int16 aExp, shiftCount; 1455 bits32 aSig; 1456 bits64 aSig64; 1457 int64 z; 1458 1459 aSig = extractFloat32Frac( a ); 1460 aExp = extractFloat32Exp( a ); 1461 aSign = extractFloat32Sign( a ); 1462 shiftCount = aExp - 0xBE; 1463 if ( 0 <= shiftCount ) { 1464 if ( float32_val(a) != 0xDF000000 ) { 1465 float_raise( float_flag_invalid STATUS_VAR); 1466 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1467 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1468 } 1469 } 1470 return (sbits64) LIT64( 0x8000000000000000 ); 1471 } 1472 else if ( aExp <= 0x7E ) { 1473 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1474 return 0; 1475 } 1476 aSig64 = aSig | 0x00800000; 1477 aSig64 <<= 40; 1478 z = aSig64>>( - shiftCount ); 1479 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1480 STATUS(float_exception_flags) |= float_flag_inexact; 1481 } 1482 if ( aSign ) z = - z; 1483 return z; 1484 1485 } 1486 1487 /*---------------------------------------------------------------------------- 1488 | Returns the result of converting the single-precision floating-point value 1489 | `a' to the double-precision floating-point format. The conversion is 1490 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1491 | Arithmetic. 1492 *----------------------------------------------------------------------------*/ 1493 1494 float64 float32_to_float64( float32 a STATUS_PARAM ) 1495 { 1496 flag aSign; 1497 int16 aExp; 1498 bits32 aSig; 1499 1500 aSig = extractFloat32Frac( a ); 1501 aExp = extractFloat32Exp( a ); 1502 aSign = extractFloat32Sign( a ); 1503 if ( aExp == 0xFF ) { 1504 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR )); 1505 return packFloat64( aSign, 0x7FF, 0 ); 1506 } 1507 if ( aExp == 0 ) { 1508 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1509 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1510 --aExp; 1511 } 1512 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1513 1514 } 1515 1516 #ifdef FLOATX80 1517 1518 /*---------------------------------------------------------------------------- 1519 | Returns the result of converting the single-precision floating-point value 1520 | `a' to the extended double-precision floating-point format. The conversion 1521 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1522 | Arithmetic. 1523 *----------------------------------------------------------------------------*/ 1524 1525 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1526 { 1527 flag aSign; 1528 int16 aExp; 1529 bits32 aSig; 1530 1531 aSig = extractFloat32Frac( a ); 1532 aExp = extractFloat32Exp( a ); 1533 aSign = extractFloat32Sign( a ); 1534 if ( aExp == 0xFF ) { 1535 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) ); 1536 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1537 } 1538 if ( aExp == 0 ) { 1539 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1540 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1541 } 1542 aSig |= 0x00800000; 1543 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1544 1545 } 1546 1547 #endif 1548 1549 #ifdef FLOAT128 1550 1551 /*---------------------------------------------------------------------------- 1552 | Returns the result of converting the single-precision floating-point value 1553 | `a' to the double-precision floating-point format. The conversion is 1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1555 | Arithmetic. 1556 *----------------------------------------------------------------------------*/ 1557 1558 float128 float32_to_float128( float32 a STATUS_PARAM ) 1559 { 1560 flag aSign; 1561 int16 aExp; 1562 bits32 aSig; 1563 1564 aSig = extractFloat32Frac( a ); 1565 aExp = extractFloat32Exp( a ); 1566 aSign = extractFloat32Sign( a ); 1567 if ( aExp == 0xFF ) { 1568 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) ); 1569 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1570 } 1571 if ( aExp == 0 ) { 1572 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1573 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1574 --aExp; 1575 } 1576 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1577 1578 } 1579 1580 #endif 1581 1582 /*---------------------------------------------------------------------------- 1583 | Rounds the single-precision floating-point value `a' to an integer, and 1584 | returns the result as a single-precision floating-point value. The 1585 | operation is performed according to the IEC/IEEE Standard for Binary 1586 | Floating-Point Arithmetic. 1587 *----------------------------------------------------------------------------*/ 1588 1589 float32 float32_round_to_int( float32 a STATUS_PARAM) 1590 { 1591 flag aSign; 1592 int16 aExp; 1593 bits32 lastBitMask, roundBitsMask; 1594 int8 roundingMode; 1595 bits32 z; 1596 1597 aExp = extractFloat32Exp( a ); 1598 if ( 0x96 <= aExp ) { 1599 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1600 return propagateFloat32NaN( a, a STATUS_VAR ); 1601 } 1602 return a; 1603 } 1604 if ( aExp <= 0x7E ) { 1605 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a; 1606 STATUS(float_exception_flags) |= float_flag_inexact; 1607 aSign = extractFloat32Sign( a ); 1608 switch ( STATUS(float_rounding_mode) ) { 1609 case float_round_nearest_even: 1610 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1611 return packFloat32( aSign, 0x7F, 0 ); 1612 } 1613 break; 1614 case float_round_down: 1615 return make_float32(aSign ? 0xBF800000 : 0); 1616 case float_round_up: 1617 return make_float32(aSign ? 0x80000000 : 0x3F800000); 1618 } 1619 return packFloat32( aSign, 0, 0 ); 1620 } 1621 lastBitMask = 1; 1622 lastBitMask <<= 0x96 - aExp; 1623 roundBitsMask = lastBitMask - 1; 1624 z = float32_val(a); 1625 roundingMode = STATUS(float_rounding_mode); 1626 if ( roundingMode == float_round_nearest_even ) { 1627 z += lastBitMask>>1; 1628 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1629 } 1630 else if ( roundingMode != float_round_to_zero ) { 1631 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { 1632 z += roundBitsMask; 1633 } 1634 } 1635 z &= ~ roundBitsMask; 1636 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1637 return make_float32(z); 1638 1639 } 1640 1641 /*---------------------------------------------------------------------------- 1642 | Returns the result of adding the absolute values of the single-precision 1643 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1644 | before being returned. `zSign' is ignored if the result is a NaN. 1645 | The addition is performed according to the IEC/IEEE Standard for Binary 1646 | Floating-Point Arithmetic. 1647 *----------------------------------------------------------------------------*/ 1648 1649 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1650 { 1651 int16 aExp, bExp, zExp; 1652 bits32 aSig, bSig, zSig; 1653 int16 expDiff; 1654 1655 aSig = extractFloat32Frac( a ); 1656 aExp = extractFloat32Exp( a ); 1657 bSig = extractFloat32Frac( b ); 1658 bExp = extractFloat32Exp( b ); 1659 expDiff = aExp - bExp; 1660 aSig <<= 6; 1661 bSig <<= 6; 1662 if ( 0 < expDiff ) { 1663 if ( aExp == 0xFF ) { 1664 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1665 return a; 1666 } 1667 if ( bExp == 0 ) { 1668 --expDiff; 1669 } 1670 else { 1671 bSig |= 0x20000000; 1672 } 1673 shift32RightJamming( bSig, expDiff, &bSig ); 1674 zExp = aExp; 1675 } 1676 else if ( expDiff < 0 ) { 1677 if ( bExp == 0xFF ) { 1678 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1679 return packFloat32( zSign, 0xFF, 0 ); 1680 } 1681 if ( aExp == 0 ) { 1682 ++expDiff; 1683 } 1684 else { 1685 aSig |= 0x20000000; 1686 } 1687 shift32RightJamming( aSig, - expDiff, &aSig ); 1688 zExp = bExp; 1689 } 1690 else { 1691 if ( aExp == 0xFF ) { 1692 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1693 return a; 1694 } 1695 if ( aExp == 0 ) { 1696 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); 1697 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1698 } 1699 zSig = 0x40000000 + aSig + bSig; 1700 zExp = aExp; 1701 goto roundAndPack; 1702 } 1703 aSig |= 0x20000000; 1704 zSig = ( aSig + bSig )<<1; 1705 --zExp; 1706 if ( (sbits32) zSig < 0 ) { 1707 zSig = aSig + bSig; 1708 ++zExp; 1709 } 1710 roundAndPack: 1711 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1712 1713 } 1714 1715 /*---------------------------------------------------------------------------- 1716 | Returns the result of subtracting the absolute values of the single- 1717 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1718 | difference is negated before being returned. `zSign' is ignored if the 1719 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1720 | Standard for Binary Floating-Point Arithmetic. 1721 *----------------------------------------------------------------------------*/ 1722 1723 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1724 { 1725 int16 aExp, bExp, zExp; 1726 bits32 aSig, bSig, zSig; 1727 int16 expDiff; 1728 1729 aSig = extractFloat32Frac( a ); 1730 aExp = extractFloat32Exp( a ); 1731 bSig = extractFloat32Frac( b ); 1732 bExp = extractFloat32Exp( b ); 1733 expDiff = aExp - bExp; 1734 aSig <<= 7; 1735 bSig <<= 7; 1736 if ( 0 < expDiff ) goto aExpBigger; 1737 if ( expDiff < 0 ) goto bExpBigger; 1738 if ( aExp == 0xFF ) { 1739 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1740 float_raise( float_flag_invalid STATUS_VAR); 1741 return float32_default_nan; 1742 } 1743 if ( aExp == 0 ) { 1744 aExp = 1; 1745 bExp = 1; 1746 } 1747 if ( bSig < aSig ) goto aBigger; 1748 if ( aSig < bSig ) goto bBigger; 1749 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1750 bExpBigger: 1751 if ( bExp == 0xFF ) { 1752 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1753 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1754 } 1755 if ( aExp == 0 ) { 1756 ++expDiff; 1757 } 1758 else { 1759 aSig |= 0x40000000; 1760 } 1761 shift32RightJamming( aSig, - expDiff, &aSig ); 1762 bSig |= 0x40000000; 1763 bBigger: 1764 zSig = bSig - aSig; 1765 zExp = bExp; 1766 zSign ^= 1; 1767 goto normalizeRoundAndPack; 1768 aExpBigger: 1769 if ( aExp == 0xFF ) { 1770 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1771 return a; 1772 } 1773 if ( bExp == 0 ) { 1774 --expDiff; 1775 } 1776 else { 1777 bSig |= 0x40000000; 1778 } 1779 shift32RightJamming( bSig, expDiff, &bSig ); 1780 aSig |= 0x40000000; 1781 aBigger: 1782 zSig = aSig - bSig; 1783 zExp = aExp; 1784 normalizeRoundAndPack: 1785 --zExp; 1786 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1787 1788 } 1789 1790 /*---------------------------------------------------------------------------- 1791 | Returns the result of adding the single-precision floating-point values `a' 1792 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1793 | Binary Floating-Point Arithmetic. 1794 *----------------------------------------------------------------------------*/ 1795 1796 float32 float32_add( float32 a, float32 b STATUS_PARAM ) 1797 { 1798 flag aSign, bSign; 1799 1800 aSign = extractFloat32Sign( a ); 1801 bSign = extractFloat32Sign( b ); 1802 if ( aSign == bSign ) { 1803 return addFloat32Sigs( a, b, aSign STATUS_VAR); 1804 } 1805 else { 1806 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1807 } 1808 1809 } 1810 1811 /*---------------------------------------------------------------------------- 1812 | Returns the result of subtracting the single-precision floating-point values 1813 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1814 | for Binary Floating-Point Arithmetic. 1815 *----------------------------------------------------------------------------*/ 1816 1817 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 1818 { 1819 flag aSign, bSign; 1820 1821 aSign = extractFloat32Sign( a ); 1822 bSign = extractFloat32Sign( b ); 1823 if ( aSign == bSign ) { 1824 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1825 } 1826 else { 1827 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 1828 } 1829 1830 } 1831 1832 /*---------------------------------------------------------------------------- 1833 | Returns the result of multiplying the single-precision floating-point values 1834 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1835 | for Binary Floating-Point Arithmetic. 1836 *----------------------------------------------------------------------------*/ 1837 1838 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 1839 { 1840 flag aSign, bSign, zSign; 1841 int16 aExp, bExp, zExp; 1842 bits32 aSig, bSig; 1843 bits64 zSig64; 1844 bits32 zSig; 1845 1846 aSig = extractFloat32Frac( a ); 1847 aExp = extractFloat32Exp( a ); 1848 aSign = extractFloat32Sign( a ); 1849 bSig = extractFloat32Frac( b ); 1850 bExp = extractFloat32Exp( b ); 1851 bSign = extractFloat32Sign( b ); 1852 zSign = aSign ^ bSign; 1853 if ( aExp == 0xFF ) { 1854 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1855 return propagateFloat32NaN( a, b STATUS_VAR ); 1856 } 1857 if ( ( bExp | bSig ) == 0 ) { 1858 float_raise( float_flag_invalid STATUS_VAR); 1859 return float32_default_nan; 1860 } 1861 return packFloat32( zSign, 0xFF, 0 ); 1862 } 1863 if ( bExp == 0xFF ) { 1864 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1865 if ( ( aExp | aSig ) == 0 ) { 1866 float_raise( float_flag_invalid STATUS_VAR); 1867 return float32_default_nan; 1868 } 1869 return packFloat32( zSign, 0xFF, 0 ); 1870 } 1871 if ( aExp == 0 ) { 1872 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1873 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1874 } 1875 if ( bExp == 0 ) { 1876 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1877 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1878 } 1879 zExp = aExp + bExp - 0x7F; 1880 aSig = ( aSig | 0x00800000 )<<7; 1881 bSig = ( bSig | 0x00800000 )<<8; 1882 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1883 zSig = zSig64; 1884 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1885 zSig <<= 1; 1886 --zExp; 1887 } 1888 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1889 1890 } 1891 1892 /*---------------------------------------------------------------------------- 1893 | Returns the result of dividing the single-precision floating-point value `a' 1894 | by the corresponding value `b'. The operation is performed according to the 1895 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1896 *----------------------------------------------------------------------------*/ 1897 1898 float32 float32_div( float32 a, float32 b STATUS_PARAM ) 1899 { 1900 flag aSign, bSign, zSign; 1901 int16 aExp, bExp, zExp; 1902 bits32 aSig, bSig, zSig; 1903 1904 aSig = extractFloat32Frac( a ); 1905 aExp = extractFloat32Exp( a ); 1906 aSign = extractFloat32Sign( a ); 1907 bSig = extractFloat32Frac( b ); 1908 bExp = extractFloat32Exp( b ); 1909 bSign = extractFloat32Sign( b ); 1910 zSign = aSign ^ bSign; 1911 if ( aExp == 0xFF ) { 1912 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1913 if ( bExp == 0xFF ) { 1914 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1915 float_raise( float_flag_invalid STATUS_VAR); 1916 return float32_default_nan; 1917 } 1918 return packFloat32( zSign, 0xFF, 0 ); 1919 } 1920 if ( bExp == 0xFF ) { 1921 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1922 return packFloat32( zSign, 0, 0 ); 1923 } 1924 if ( bExp == 0 ) { 1925 if ( bSig == 0 ) { 1926 if ( ( aExp | aSig ) == 0 ) { 1927 float_raise( float_flag_invalid STATUS_VAR); 1928 return float32_default_nan; 1929 } 1930 float_raise( float_flag_divbyzero STATUS_VAR); 1931 return packFloat32( zSign, 0xFF, 0 ); 1932 } 1933 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1934 } 1935 if ( aExp == 0 ) { 1936 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1937 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1938 } 1939 zExp = aExp - bExp + 0x7D; 1940 aSig = ( aSig | 0x00800000 )<<7; 1941 bSig = ( bSig | 0x00800000 )<<8; 1942 if ( bSig <= ( aSig + aSig ) ) { 1943 aSig >>= 1; 1944 ++zExp; 1945 } 1946 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 1947 if ( ( zSig & 0x3F ) == 0 ) { 1948 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 1949 } 1950 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1951 1952 } 1953 1954 /*---------------------------------------------------------------------------- 1955 | Returns the remainder of the single-precision floating-point value `a' 1956 | with respect to the corresponding value `b'. The operation is performed 1957 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1958 *----------------------------------------------------------------------------*/ 1959 1960 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 1961 { 1962 flag aSign, zSign; 1963 int16 aExp, bExp, expDiff; 1964 bits32 aSig, bSig; 1965 bits32 q; 1966 bits64 aSig64, bSig64, q64; 1967 bits32 alternateASig; 1968 sbits32 sigMean; 1969 1970 aSig = extractFloat32Frac( a ); 1971 aExp = extractFloat32Exp( a ); 1972 aSign = extractFloat32Sign( a ); 1973 bSig = extractFloat32Frac( b ); 1974 bExp = extractFloat32Exp( b ); 1975 if ( aExp == 0xFF ) { 1976 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1977 return propagateFloat32NaN( a, b STATUS_VAR ); 1978 } 1979 float_raise( float_flag_invalid STATUS_VAR); 1980 return float32_default_nan; 1981 } 1982 if ( bExp == 0xFF ) { 1983 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1984 return a; 1985 } 1986 if ( bExp == 0 ) { 1987 if ( bSig == 0 ) { 1988 float_raise( float_flag_invalid STATUS_VAR); 1989 return float32_default_nan; 1990 } 1991 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1992 } 1993 if ( aExp == 0 ) { 1994 if ( aSig == 0 ) return a; 1995 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1996 } 1997 expDiff = aExp - bExp; 1998 aSig |= 0x00800000; 1999 bSig |= 0x00800000; 2000 if ( expDiff < 32 ) { 2001 aSig <<= 8; 2002 bSig <<= 8; 2003 if ( expDiff < 0 ) { 2004 if ( expDiff < -1 ) return a; 2005 aSig >>= 1; 2006 } 2007 q = ( bSig <= aSig ); 2008 if ( q ) aSig -= bSig; 2009 if ( 0 < expDiff ) { 2010 q = ( ( (bits64) aSig )<<32 ) / bSig; 2011 q >>= 32 - expDiff; 2012 bSig >>= 2; 2013 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2014 } 2015 else { 2016 aSig >>= 2; 2017 bSig >>= 2; 2018 } 2019 } 2020 else { 2021 if ( bSig <= aSig ) aSig -= bSig; 2022 aSig64 = ( (bits64) aSig )<<40; 2023 bSig64 = ( (bits64) bSig )<<40; 2024 expDiff -= 64; 2025 while ( 0 < expDiff ) { 2026 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2027 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2028 aSig64 = - ( ( bSig * q64 )<<38 ); 2029 expDiff -= 62; 2030 } 2031 expDiff += 64; 2032 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2033 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2034 q = q64>>( 64 - expDiff ); 2035 bSig <<= 6; 2036 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2037 } 2038 do { 2039 alternateASig = aSig; 2040 ++q; 2041 aSig -= bSig; 2042 } while ( 0 <= (sbits32) aSig ); 2043 sigMean = aSig + alternateASig; 2044 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2045 aSig = alternateASig; 2046 } 2047 zSign = ( (sbits32) aSig < 0 ); 2048 if ( zSign ) aSig = - aSig; 2049 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2050 2051 } 2052 2053 /*---------------------------------------------------------------------------- 2054 | Returns the square root of the single-precision floating-point value `a'. 2055 | The operation is performed according to the IEC/IEEE Standard for Binary 2056 | Floating-Point Arithmetic. 2057 *----------------------------------------------------------------------------*/ 2058 2059 float32 float32_sqrt( float32 a STATUS_PARAM ) 2060 { 2061 flag aSign; 2062 int16 aExp, zExp; 2063 bits32 aSig, zSig; 2064 bits64 rem, term; 2065 2066 aSig = extractFloat32Frac( a ); 2067 aExp = extractFloat32Exp( a ); 2068 aSign = extractFloat32Sign( a ); 2069 if ( aExp == 0xFF ) { 2070 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2071 if ( ! aSign ) return a; 2072 float_raise( float_flag_invalid STATUS_VAR); 2073 return float32_default_nan; 2074 } 2075 if ( aSign ) { 2076 if ( ( aExp | aSig ) == 0 ) return a; 2077 float_raise( float_flag_invalid STATUS_VAR); 2078 return float32_default_nan; 2079 } 2080 if ( aExp == 0 ) { 2081 if ( aSig == 0 ) return float32_zero; 2082 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2083 } 2084 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2085 aSig = ( aSig | 0x00800000 )<<8; 2086 zSig = estimateSqrt32( aExp, aSig ) + 2; 2087 if ( ( zSig & 0x7F ) <= 5 ) { 2088 if ( zSig < 2 ) { 2089 zSig = 0x7FFFFFFF; 2090 goto roundAndPack; 2091 } 2092 aSig >>= aExp & 1; 2093 term = ( (bits64) zSig ) * zSig; 2094 rem = ( ( (bits64) aSig )<<32 ) - term; 2095 while ( (sbits64) rem < 0 ) { 2096 --zSig; 2097 rem += ( ( (bits64) zSig )<<1 ) | 1; 2098 } 2099 zSig |= ( rem != 0 ); 2100 } 2101 shift32RightJamming( zSig, 1, &zSig ); 2102 roundAndPack: 2103 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2104 2105 } 2106 2107 /*---------------------------------------------------------------------------- 2108 | Returns the binary exponential of the single-precision floating-point value 2109 | `a'. The operation is performed according to the IEC/IEEE Standard for 2110 | Binary Floating-Point Arithmetic. 2111 | 2112 | Uses the following identities: 2113 | 2114 | 1. ------------------------------------------------------------------------- 2115 | x x*ln(2) 2116 | 2 = e 2117 | 2118 | 2. ------------------------------------------------------------------------- 2119 | 2 3 4 5 n 2120 | x x x x x x x 2121 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2122 | 1! 2! 3! 4! 5! n! 2123 *----------------------------------------------------------------------------*/ 2124 2125 static const float64 float32_exp2_coefficients[15] = 2126 { 2127 make_float64( 0x3ff0000000000000ll ), /* 1 */ 2128 make_float64( 0x3fe0000000000000ll ), /* 2 */ 2129 make_float64( 0x3fc5555555555555ll ), /* 3 */ 2130 make_float64( 0x3fa5555555555555ll ), /* 4 */ 2131 make_float64( 0x3f81111111111111ll ), /* 5 */ 2132 make_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2133 make_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2134 make_float64( 0x3efa01a01a01a01all ), /* 8 */ 2135 make_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2136 make_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2137 make_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2138 make_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2139 make_float64( 0x3de6124613a86d09ll ), /* 13 */ 2140 make_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2141 make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2142 }; 2143 2144 float32 float32_exp2( float32 a STATUS_PARAM ) 2145 { 2146 flag aSign; 2147 int16 aExp; 2148 bits32 aSig; 2149 float64 r, x, xn; 2150 int i; 2151 2152 aSig = extractFloat32Frac( a ); 2153 aExp = extractFloat32Exp( a ); 2154 aSign = extractFloat32Sign( a ); 2155 2156 if ( aExp == 0xFF) { 2157 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2158 return (aSign) ? float32_zero : a; 2159 } 2160 if (aExp == 0) { 2161 if (aSig == 0) return float32_one; 2162 } 2163 2164 float_raise( float_flag_inexact STATUS_VAR); 2165 2166 /* ******************************* */ 2167 /* using float64 for approximation */ 2168 /* ******************************* */ 2169 x = float32_to_float64(a STATUS_VAR); 2170 x = float64_mul(x, float64_ln2 STATUS_VAR); 2171 2172 xn = x; 2173 r = float64_one; 2174 for (i = 0 ; i < 15 ; i++) { 2175 float64 f; 2176 2177 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); 2178 r = float64_add(r, f STATUS_VAR); 2179 2180 xn = float64_mul(xn, x STATUS_VAR); 2181 } 2182 2183 return float64_to_float32(r, status); 2184 } 2185 2186 /*---------------------------------------------------------------------------- 2187 | Returns the binary log of the single-precision floating-point value `a'. 2188 | The operation is performed according to the IEC/IEEE Standard for Binary 2189 | Floating-Point Arithmetic. 2190 *----------------------------------------------------------------------------*/ 2191 float32 float32_log2( float32 a STATUS_PARAM ) 2192 { 2193 flag aSign, zSign; 2194 int16 aExp; 2195 bits32 aSig, zSig, i; 2196 2197 aSig = extractFloat32Frac( a ); 2198 aExp = extractFloat32Exp( a ); 2199 aSign = extractFloat32Sign( a ); 2200 2201 if ( aExp == 0 ) { 2202 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2203 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2204 } 2205 if ( aSign ) { 2206 float_raise( float_flag_invalid STATUS_VAR); 2207 return float32_default_nan; 2208 } 2209 if ( aExp == 0xFF ) { 2210 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2211 return a; 2212 } 2213 2214 aExp -= 0x7F; 2215 aSig |= 0x00800000; 2216 zSign = aExp < 0; 2217 zSig = aExp << 23; 2218 2219 for (i = 1 << 22; i > 0; i >>= 1) { 2220 aSig = ( (bits64)aSig * aSig ) >> 23; 2221 if ( aSig & 0x01000000 ) { 2222 aSig >>= 1; 2223 zSig |= i; 2224 } 2225 } 2226 2227 if ( zSign ) 2228 zSig = -zSig; 2229 2230 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); 2231 } 2232 2233 /*---------------------------------------------------------------------------- 2234 | Returns 1 if the single-precision floating-point value `a' is equal to 2235 | the corresponding value `b', and 0 otherwise. The comparison is performed 2236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2237 *----------------------------------------------------------------------------*/ 2238 2239 int float32_eq( float32 a, float32 b STATUS_PARAM ) 2240 { 2241 2242 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2243 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2244 ) { 2245 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2246 float_raise( float_flag_invalid STATUS_VAR); 2247 } 2248 return 0; 2249 } 2250 return ( float32_val(a) == float32_val(b) ) || 2251 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2252 2253 } 2254 2255 /*---------------------------------------------------------------------------- 2256 | Returns 1 if the single-precision floating-point value `a' is less than 2257 | or equal to the corresponding value `b', and 0 otherwise. The comparison 2258 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2259 | Arithmetic. 2260 *----------------------------------------------------------------------------*/ 2261 2262 int float32_le( float32 a, float32 b STATUS_PARAM ) 2263 { 2264 flag aSign, bSign; 2265 bits32 av, bv; 2266 2267 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2268 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2269 ) { 2270 float_raise( float_flag_invalid STATUS_VAR); 2271 return 0; 2272 } 2273 aSign = extractFloat32Sign( a ); 2274 bSign = extractFloat32Sign( b ); 2275 av = float32_val(a); 2276 bv = float32_val(b); 2277 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2278 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2279 2280 } 2281 2282 /*---------------------------------------------------------------------------- 2283 | Returns 1 if the single-precision floating-point value `a' is less than 2284 | the corresponding value `b', and 0 otherwise. The comparison is performed 2285 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2286 *----------------------------------------------------------------------------*/ 2287 2288 int float32_lt( float32 a, float32 b STATUS_PARAM ) 2289 { 2290 flag aSign, bSign; 2291 bits32 av, bv; 2292 2293 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2294 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2295 ) { 2296 float_raise( float_flag_invalid STATUS_VAR); 2297 return 0; 2298 } 2299 aSign = extractFloat32Sign( a ); 2300 bSign = extractFloat32Sign( b ); 2301 av = float32_val(a); 2302 bv = float32_val(b); 2303 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2304 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2305 2306 } 2307 2308 /*---------------------------------------------------------------------------- 2309 | Returns 1 if the single-precision floating-point value `a' is equal to 2310 | the corresponding value `b', and 0 otherwise. The invalid exception is 2311 | raised if either operand is a NaN. Otherwise, the comparison is performed 2312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2313 *----------------------------------------------------------------------------*/ 2314 2315 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM ) 2316 { 2317 bits32 av, bv; 2318 2319 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2320 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2321 ) { 2322 float_raise( float_flag_invalid STATUS_VAR); 2323 return 0; 2324 } 2325 av = float32_val(a); 2326 bv = float32_val(b); 2327 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2328 2329 } 2330 2331 /*---------------------------------------------------------------------------- 2332 | Returns 1 if the single-precision floating-point value `a' is less than or 2333 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2334 | cause an exception. Otherwise, the comparison is performed according to the 2335 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2336 *----------------------------------------------------------------------------*/ 2337 2338 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2339 { 2340 flag aSign, bSign; 2341 bits32 av, bv; 2342 2343 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2344 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2345 ) { 2346 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2347 float_raise( float_flag_invalid STATUS_VAR); 2348 } 2349 return 0; 2350 } 2351 aSign = extractFloat32Sign( a ); 2352 bSign = extractFloat32Sign( b ); 2353 av = float32_val(a); 2354 bv = float32_val(b); 2355 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2356 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2357 2358 } 2359 2360 /*---------------------------------------------------------------------------- 2361 | Returns 1 if the single-precision floating-point value `a' is less than 2362 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2363 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2364 | Standard for Binary Floating-Point Arithmetic. 2365 *----------------------------------------------------------------------------*/ 2366 2367 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2368 { 2369 flag aSign, bSign; 2370 bits32 av, bv; 2371 2372 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2373 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2374 ) { 2375 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2376 float_raise( float_flag_invalid STATUS_VAR); 2377 } 2378 return 0; 2379 } 2380 aSign = extractFloat32Sign( a ); 2381 bSign = extractFloat32Sign( b ); 2382 av = float32_val(a); 2383 bv = float32_val(b); 2384 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2385 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2386 2387 } 2388 2389 /*---------------------------------------------------------------------------- 2390 | Returns the result of converting the double-precision floating-point value 2391 | `a' to the 32-bit two's complement integer format. The conversion is 2392 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2393 | Arithmetic---which means in particular that the conversion is rounded 2394 | according to the current rounding mode. If `a' is a NaN, the largest 2395 | positive integer is returned. Otherwise, if the conversion overflows, the 2396 | largest integer with the same sign as `a' is returned. 2397 *----------------------------------------------------------------------------*/ 2398 2399 int32 float64_to_int32( float64 a STATUS_PARAM ) 2400 { 2401 flag aSign; 2402 int16 aExp, shiftCount; 2403 bits64 aSig; 2404 2405 aSig = extractFloat64Frac( a ); 2406 aExp = extractFloat64Exp( a ); 2407 aSign = extractFloat64Sign( a ); 2408 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2409 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2410 shiftCount = 0x42C - aExp; 2411 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2412 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2413 2414 } 2415 2416 /*---------------------------------------------------------------------------- 2417 | Returns the result of converting the double-precision floating-point value 2418 | `a' to the 32-bit two's complement integer format. The conversion is 2419 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2420 | Arithmetic, except that the conversion is always rounded toward zero. 2421 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2422 | the conversion overflows, the largest integer with the same sign as `a' is 2423 | returned. 2424 *----------------------------------------------------------------------------*/ 2425 2426 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2427 { 2428 flag aSign; 2429 int16 aExp, shiftCount; 2430 bits64 aSig, savedASig; 2431 int32 z; 2432 2433 aSig = extractFloat64Frac( a ); 2434 aExp = extractFloat64Exp( a ); 2435 aSign = extractFloat64Sign( a ); 2436 if ( 0x41E < aExp ) { 2437 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2438 goto invalid; 2439 } 2440 else if ( aExp < 0x3FF ) { 2441 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2442 return 0; 2443 } 2444 aSig |= LIT64( 0x0010000000000000 ); 2445 shiftCount = 0x433 - aExp; 2446 savedASig = aSig; 2447 aSig >>= shiftCount; 2448 z = aSig; 2449 if ( aSign ) z = - z; 2450 if ( ( z < 0 ) ^ aSign ) { 2451 invalid: 2452 float_raise( float_flag_invalid STATUS_VAR); 2453 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2454 } 2455 if ( ( aSig<<shiftCount ) != savedASig ) { 2456 STATUS(float_exception_flags) |= float_flag_inexact; 2457 } 2458 return z; 2459 2460 } 2461 2462 /*---------------------------------------------------------------------------- 2463 | Returns the result of converting the double-precision floating-point value 2464 | `a' to the 16-bit two's complement integer format. The conversion is 2465 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2466 | Arithmetic, except that the conversion is always rounded toward zero. 2467 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2468 | the conversion overflows, the largest integer with the same sign as `a' is 2469 | returned. 2470 *----------------------------------------------------------------------------*/ 2471 2472 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM ) 2473 { 2474 flag aSign; 2475 int16 aExp, shiftCount; 2476 bits64 aSig, savedASig; 2477 int32 z; 2478 2479 aSig = extractFloat64Frac( a ); 2480 aExp = extractFloat64Exp( a ); 2481 aSign = extractFloat64Sign( a ); 2482 if ( 0x40E < aExp ) { 2483 if ( ( aExp == 0x7FF ) && aSig ) { 2484 aSign = 0; 2485 } 2486 goto invalid; 2487 } 2488 else if ( aExp < 0x3FF ) { 2489 if ( aExp || aSig ) { 2490 STATUS(float_exception_flags) |= float_flag_inexact; 2491 } 2492 return 0; 2493 } 2494 aSig |= LIT64( 0x0010000000000000 ); 2495 shiftCount = 0x433 - aExp; 2496 savedASig = aSig; 2497 aSig >>= shiftCount; 2498 z = aSig; 2499 if ( aSign ) { 2500 z = - z; 2501 } 2502 if ( ( (int16_t)z < 0 ) ^ aSign ) { 2503 invalid: 2504 float_raise( float_flag_invalid STATUS_VAR); 2505 return aSign ? (sbits32) 0xffff8000 : 0x7FFF; 2506 } 2507 if ( ( aSig<<shiftCount ) != savedASig ) { 2508 STATUS(float_exception_flags) |= float_flag_inexact; 2509 } 2510 return z; 2511 } 2512 2513 /*---------------------------------------------------------------------------- 2514 | Returns the result of converting the double-precision floating-point value 2515 | `a' to the 64-bit two's complement integer format. The conversion is 2516 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2517 | Arithmetic---which means in particular that the conversion is rounded 2518 | according to the current rounding mode. If `a' is a NaN, the largest 2519 | positive integer is returned. Otherwise, if the conversion overflows, the 2520 | largest integer with the same sign as `a' is returned. 2521 *----------------------------------------------------------------------------*/ 2522 2523 int64 float64_to_int64( float64 a STATUS_PARAM ) 2524 { 2525 flag aSign; 2526 int16 aExp, shiftCount; 2527 bits64 aSig, aSigExtra; 2528 2529 aSig = extractFloat64Frac( a ); 2530 aExp = extractFloat64Exp( a ); 2531 aSign = extractFloat64Sign( a ); 2532 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2533 shiftCount = 0x433 - aExp; 2534 if ( shiftCount <= 0 ) { 2535 if ( 0x43E < aExp ) { 2536 float_raise( float_flag_invalid STATUS_VAR); 2537 if ( ! aSign 2538 || ( ( aExp == 0x7FF ) 2539 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2540 ) { 2541 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2542 } 2543 return (sbits64) LIT64( 0x8000000000000000 ); 2544 } 2545 aSigExtra = 0; 2546 aSig <<= - shiftCount; 2547 } 2548 else { 2549 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2550 } 2551 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 2552 2553 } 2554 2555 /*---------------------------------------------------------------------------- 2556 | Returns the result of converting the double-precision floating-point value 2557 | `a' to the 64-bit two's complement integer format. The conversion is 2558 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2559 | Arithmetic, except that the conversion is always rounded toward zero. 2560 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2561 | the conversion overflows, the largest integer with the same sign as `a' is 2562 | returned. 2563 *----------------------------------------------------------------------------*/ 2564 2565 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 2566 { 2567 flag aSign; 2568 int16 aExp, shiftCount; 2569 bits64 aSig; 2570 int64 z; 2571 2572 aSig = extractFloat64Frac( a ); 2573 aExp = extractFloat64Exp( a ); 2574 aSign = extractFloat64Sign( a ); 2575 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2576 shiftCount = aExp - 0x433; 2577 if ( 0 <= shiftCount ) { 2578 if ( 0x43E <= aExp ) { 2579 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 2580 float_raise( float_flag_invalid STATUS_VAR); 2581 if ( ! aSign 2582 || ( ( aExp == 0x7FF ) 2583 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2584 ) { 2585 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2586 } 2587 } 2588 return (sbits64) LIT64( 0x8000000000000000 ); 2589 } 2590 z = aSig<<shiftCount; 2591 } 2592 else { 2593 if ( aExp < 0x3FE ) { 2594 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2595 return 0; 2596 } 2597 z = aSig>>( - shiftCount ); 2598 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2599 STATUS(float_exception_flags) |= float_flag_inexact; 2600 } 2601 } 2602 if ( aSign ) z = - z; 2603 return z; 2604 2605 } 2606 2607 /*---------------------------------------------------------------------------- 2608 | Returns the result of converting the double-precision floating-point value 2609 | `a' to the single-precision floating-point format. The conversion is 2610 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2611 | Arithmetic. 2612 *----------------------------------------------------------------------------*/ 2613 2614 float32 float64_to_float32( float64 a STATUS_PARAM ) 2615 { 2616 flag aSign; 2617 int16 aExp; 2618 bits64 aSig; 2619 bits32 zSig; 2620 2621 aSig = extractFloat64Frac( a ); 2622 aExp = extractFloat64Exp( a ); 2623 aSign = extractFloat64Sign( a ); 2624 if ( aExp == 0x7FF ) { 2625 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) ); 2626 return packFloat32( aSign, 0xFF, 0 ); 2627 } 2628 shift64RightJamming( aSig, 22, &aSig ); 2629 zSig = aSig; 2630 if ( aExp || zSig ) { 2631 zSig |= 0x40000000; 2632 aExp -= 0x381; 2633 } 2634 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 2635 2636 } 2637 2638 2639 /*---------------------------------------------------------------------------- 2640 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2641 | half-precision floating-point value, returning the result. After being 2642 | shifted into the proper positions, the three fields are simply added 2643 | together to form the result. This means that any integer portion of `zSig' 2644 | will be added into the exponent. Since a properly normalized significand 2645 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2646 | than the desired result exponent whenever `zSig' is a complete, normalized 2647 | significand. 2648 *----------------------------------------------------------------------------*/ 2649 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig) 2650 { 2651 return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig; 2652 } 2653 2654 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 2655 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 2656 2657 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM ) 2658 { 2659 flag aSign; 2660 int16 aExp; 2661 bits32 aSig; 2662 2663 aSign = a >> 15; 2664 aExp = (a >> 10) & 0x1f; 2665 aSig = a & 0x3ff; 2666 2667 if (aExp == 0x1f && ieee) { 2668 if (aSig) { 2669 /* Make sure correct exceptions are raised. */ 2670 float32ToCommonNaN(a STATUS_VAR); 2671 aSig |= 0x200; 2672 } 2673 return packFloat32(aSign, 0xff, aSig << 13); 2674 } 2675 if (aExp == 0) { 2676 int8 shiftCount; 2677 2678 if (aSig == 0) { 2679 return packFloat32(aSign, 0, 0); 2680 } 2681 2682 shiftCount = countLeadingZeros32( aSig ) - 21; 2683 aSig = aSig << shiftCount; 2684 aExp = -shiftCount; 2685 } 2686 return packFloat32( aSign, aExp + 0x70, aSig << 13); 2687 } 2688 2689 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM) 2690 { 2691 flag aSign; 2692 int16 aExp; 2693 bits32 aSig; 2694 bits32 mask; 2695 bits32 increment; 2696 int8 roundingMode; 2697 2698 aSig = extractFloat32Frac( a ); 2699 aExp = extractFloat32Exp( a ); 2700 aSign = extractFloat32Sign( a ); 2701 if ( aExp == 0xFF ) { 2702 if (aSig) { 2703 /* Make sure correct exceptions are raised. */ 2704 float32ToCommonNaN(a STATUS_VAR); 2705 aSig |= 0x00400000; 2706 } 2707 return packFloat16(aSign, 0x1f, aSig >> 13); 2708 } 2709 if (aExp == 0 && aSign == 0) { 2710 return packFloat16(aSign, 0, 0); 2711 } 2712 /* Decimal point between bits 22 and 23. */ 2713 aSig |= 0x00800000; 2714 aExp -= 0x7f; 2715 if (aExp < -14) { 2716 mask = 0x007fffff; 2717 if (aExp < -24) { 2718 aExp = -25; 2719 } else { 2720 mask >>= 24 + aExp; 2721 } 2722 } else { 2723 mask = 0x00001fff; 2724 } 2725 if (aSig & mask) { 2726 float_raise( float_flag_underflow STATUS_VAR ); 2727 roundingMode = STATUS(float_rounding_mode); 2728 switch (roundingMode) { 2729 case float_round_nearest_even: 2730 increment = (mask + 1) >> 1; 2731 if ((aSig & mask) == increment) { 2732 increment = aSig & (increment << 1); 2733 } 2734 break; 2735 case float_round_up: 2736 increment = aSign ? 0 : mask; 2737 break; 2738 case float_round_down: 2739 increment = aSign ? mask : 0; 2740 break; 2741 default: /* round_to_zero */ 2742 increment = 0; 2743 break; 2744 } 2745 aSig += increment; 2746 if (aSig >= 0x01000000) { 2747 aSig >>= 1; 2748 aExp++; 2749 } 2750 } else if (aExp < -14 2751 && STATUS(float_detect_tininess) == float_tininess_before_rounding) { 2752 float_raise( float_flag_underflow STATUS_VAR); 2753 } 2754 2755 if (ieee) { 2756 if (aExp > 15) { 2757 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 2758 return packFloat16(aSign, 0x1f, 0); 2759 } 2760 } else { 2761 if (aExp > 16) { 2762 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 2763 return packFloat16(aSign, 0x1f, 0x3ff); 2764 } 2765 } 2766 if (aExp < -24) { 2767 return packFloat16(aSign, 0, 0); 2768 } 2769 if (aExp < -14) { 2770 aSig >>= -14 - aExp; 2771 aExp = -14; 2772 } 2773 return packFloat16(aSign, aExp + 14, aSig >> 13); 2774 } 2775 2776 #ifdef FLOATX80 2777 2778 /*---------------------------------------------------------------------------- 2779 | Returns the result of converting the double-precision floating-point value 2780 | `a' to the extended double-precision floating-point format. The conversion 2781 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2782 | Arithmetic. 2783 *----------------------------------------------------------------------------*/ 2784 2785 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 2786 { 2787 flag aSign; 2788 int16 aExp; 2789 bits64 aSig; 2790 2791 aSig = extractFloat64Frac( a ); 2792 aExp = extractFloat64Exp( a ); 2793 aSign = extractFloat64Sign( a ); 2794 if ( aExp == 0x7FF ) { 2795 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) ); 2796 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2797 } 2798 if ( aExp == 0 ) { 2799 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2800 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2801 } 2802 return 2803 packFloatx80( 2804 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2805 2806 } 2807 2808 #endif 2809 2810 #ifdef FLOAT128 2811 2812 /*---------------------------------------------------------------------------- 2813 | Returns the result of converting the double-precision floating-point value 2814 | `a' to the quadruple-precision floating-point format. The conversion is 2815 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2816 | Arithmetic. 2817 *----------------------------------------------------------------------------*/ 2818 2819 float128 float64_to_float128( float64 a STATUS_PARAM ) 2820 { 2821 flag aSign; 2822 int16 aExp; 2823 bits64 aSig, zSig0, zSig1; 2824 2825 aSig = extractFloat64Frac( a ); 2826 aExp = extractFloat64Exp( a ); 2827 aSign = extractFloat64Sign( a ); 2828 if ( aExp == 0x7FF ) { 2829 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) ); 2830 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2831 } 2832 if ( aExp == 0 ) { 2833 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2834 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2835 --aExp; 2836 } 2837 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2838 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2839 2840 } 2841 2842 #endif 2843 2844 /*---------------------------------------------------------------------------- 2845 | Rounds the double-precision floating-point value `a' to an integer, and 2846 | returns the result as a double-precision floating-point value. The 2847 | operation is performed according to the IEC/IEEE Standard for Binary 2848 | Floating-Point Arithmetic. 2849 *----------------------------------------------------------------------------*/ 2850 2851 float64 float64_round_to_int( float64 a STATUS_PARAM ) 2852 { 2853 flag aSign; 2854 int16 aExp; 2855 bits64 lastBitMask, roundBitsMask; 2856 int8 roundingMode; 2857 bits64 z; 2858 2859 aExp = extractFloat64Exp( a ); 2860 if ( 0x433 <= aExp ) { 2861 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2862 return propagateFloat64NaN( a, a STATUS_VAR ); 2863 } 2864 return a; 2865 } 2866 if ( aExp < 0x3FF ) { 2867 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a; 2868 STATUS(float_exception_flags) |= float_flag_inexact; 2869 aSign = extractFloat64Sign( a ); 2870 switch ( STATUS(float_rounding_mode) ) { 2871 case float_round_nearest_even: 2872 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2873 return packFloat64( aSign, 0x3FF, 0 ); 2874 } 2875 break; 2876 case float_round_down: 2877 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 2878 case float_round_up: 2879 return make_float64( 2880 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 2881 } 2882 return packFloat64( aSign, 0, 0 ); 2883 } 2884 lastBitMask = 1; 2885 lastBitMask <<= 0x433 - aExp; 2886 roundBitsMask = lastBitMask - 1; 2887 z = float64_val(a); 2888 roundingMode = STATUS(float_rounding_mode); 2889 if ( roundingMode == float_round_nearest_even ) { 2890 z += lastBitMask>>1; 2891 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2892 } 2893 else if ( roundingMode != float_round_to_zero ) { 2894 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { 2895 z += roundBitsMask; 2896 } 2897 } 2898 z &= ~ roundBitsMask; 2899 if ( z != float64_val(a) ) 2900 STATUS(float_exception_flags) |= float_flag_inexact; 2901 return make_float64(z); 2902 2903 } 2904 2905 float64 float64_trunc_to_int( float64 a STATUS_PARAM) 2906 { 2907 int oldmode; 2908 float64 res; 2909 oldmode = STATUS(float_rounding_mode); 2910 STATUS(float_rounding_mode) = float_round_to_zero; 2911 res = float64_round_to_int(a STATUS_VAR); 2912 STATUS(float_rounding_mode) = oldmode; 2913 return res; 2914 } 2915 2916 /*---------------------------------------------------------------------------- 2917 | Returns the result of adding the absolute values of the double-precision 2918 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2919 | before being returned. `zSign' is ignored if the result is a NaN. 2920 | The addition is performed according to the IEC/IEEE Standard for Binary 2921 | Floating-Point Arithmetic. 2922 *----------------------------------------------------------------------------*/ 2923 2924 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 2925 { 2926 int16 aExp, bExp, zExp; 2927 bits64 aSig, bSig, zSig; 2928 int16 expDiff; 2929 2930 aSig = extractFloat64Frac( a ); 2931 aExp = extractFloat64Exp( a ); 2932 bSig = extractFloat64Frac( b ); 2933 bExp = extractFloat64Exp( b ); 2934 expDiff = aExp - bExp; 2935 aSig <<= 9; 2936 bSig <<= 9; 2937 if ( 0 < expDiff ) { 2938 if ( aExp == 0x7FF ) { 2939 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2940 return a; 2941 } 2942 if ( bExp == 0 ) { 2943 --expDiff; 2944 } 2945 else { 2946 bSig |= LIT64( 0x2000000000000000 ); 2947 } 2948 shift64RightJamming( bSig, expDiff, &bSig ); 2949 zExp = aExp; 2950 } 2951 else if ( expDiff < 0 ) { 2952 if ( bExp == 0x7FF ) { 2953 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2954 return packFloat64( zSign, 0x7FF, 0 ); 2955 } 2956 if ( aExp == 0 ) { 2957 ++expDiff; 2958 } 2959 else { 2960 aSig |= LIT64( 0x2000000000000000 ); 2961 } 2962 shift64RightJamming( aSig, - expDiff, &aSig ); 2963 zExp = bExp; 2964 } 2965 else { 2966 if ( aExp == 0x7FF ) { 2967 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 2968 return a; 2969 } 2970 if ( aExp == 0 ) { 2971 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); 2972 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 2973 } 2974 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 2975 zExp = aExp; 2976 goto roundAndPack; 2977 } 2978 aSig |= LIT64( 0x2000000000000000 ); 2979 zSig = ( aSig + bSig )<<1; 2980 --zExp; 2981 if ( (sbits64) zSig < 0 ) { 2982 zSig = aSig + bSig; 2983 ++zExp; 2984 } 2985 roundAndPack: 2986 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 2987 2988 } 2989 2990 /*---------------------------------------------------------------------------- 2991 | Returns the result of subtracting the absolute values of the double- 2992 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2993 | difference is negated before being returned. `zSign' is ignored if the 2994 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2995 | Standard for Binary Floating-Point Arithmetic. 2996 *----------------------------------------------------------------------------*/ 2997 2998 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 2999 { 3000 int16 aExp, bExp, zExp; 3001 bits64 aSig, bSig, zSig; 3002 int16 expDiff; 3003 3004 aSig = extractFloat64Frac( a ); 3005 aExp = extractFloat64Exp( a ); 3006 bSig = extractFloat64Frac( b ); 3007 bExp = extractFloat64Exp( b ); 3008 expDiff = aExp - bExp; 3009 aSig <<= 10; 3010 bSig <<= 10; 3011 if ( 0 < expDiff ) goto aExpBigger; 3012 if ( expDiff < 0 ) goto bExpBigger; 3013 if ( aExp == 0x7FF ) { 3014 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3015 float_raise( float_flag_invalid STATUS_VAR); 3016 return float64_default_nan; 3017 } 3018 if ( aExp == 0 ) { 3019 aExp = 1; 3020 bExp = 1; 3021 } 3022 if ( bSig < aSig ) goto aBigger; 3023 if ( aSig < bSig ) goto bBigger; 3024 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3025 bExpBigger: 3026 if ( bExp == 0x7FF ) { 3027 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3028 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3029 } 3030 if ( aExp == 0 ) { 3031 ++expDiff; 3032 } 3033 else { 3034 aSig |= LIT64( 0x4000000000000000 ); 3035 } 3036 shift64RightJamming( aSig, - expDiff, &aSig ); 3037 bSig |= LIT64( 0x4000000000000000 ); 3038 bBigger: 3039 zSig = bSig - aSig; 3040 zExp = bExp; 3041 zSign ^= 1; 3042 goto normalizeRoundAndPack; 3043 aExpBigger: 3044 if ( aExp == 0x7FF ) { 3045 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3046 return a; 3047 } 3048 if ( bExp == 0 ) { 3049 --expDiff; 3050 } 3051 else { 3052 bSig |= LIT64( 0x4000000000000000 ); 3053 } 3054 shift64RightJamming( bSig, expDiff, &bSig ); 3055 aSig |= LIT64( 0x4000000000000000 ); 3056 aBigger: 3057 zSig = aSig - bSig; 3058 zExp = aExp; 3059 normalizeRoundAndPack: 3060 --zExp; 3061 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3062 3063 } 3064 3065 /*---------------------------------------------------------------------------- 3066 | Returns the result of adding the double-precision floating-point values `a' 3067 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3068 | Binary Floating-Point Arithmetic. 3069 *----------------------------------------------------------------------------*/ 3070 3071 float64 float64_add( float64 a, float64 b STATUS_PARAM ) 3072 { 3073 flag aSign, bSign; 3074 3075 aSign = extractFloat64Sign( a ); 3076 bSign = extractFloat64Sign( b ); 3077 if ( aSign == bSign ) { 3078 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3079 } 3080 else { 3081 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3082 } 3083 3084 } 3085 3086 /*---------------------------------------------------------------------------- 3087 | Returns the result of subtracting the double-precision floating-point values 3088 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3089 | for Binary Floating-Point Arithmetic. 3090 *----------------------------------------------------------------------------*/ 3091 3092 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 3093 { 3094 flag aSign, bSign; 3095 3096 aSign = extractFloat64Sign( a ); 3097 bSign = extractFloat64Sign( b ); 3098 if ( aSign == bSign ) { 3099 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3100 } 3101 else { 3102 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3103 } 3104 3105 } 3106 3107 /*---------------------------------------------------------------------------- 3108 | Returns the result of multiplying the double-precision floating-point values 3109 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3110 | for Binary Floating-Point Arithmetic. 3111 *----------------------------------------------------------------------------*/ 3112 3113 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 3114 { 3115 flag aSign, bSign, zSign; 3116 int16 aExp, bExp, zExp; 3117 bits64 aSig, bSig, zSig0, zSig1; 3118 3119 aSig = extractFloat64Frac( a ); 3120 aExp = extractFloat64Exp( a ); 3121 aSign = extractFloat64Sign( a ); 3122 bSig = extractFloat64Frac( b ); 3123 bExp = extractFloat64Exp( b ); 3124 bSign = extractFloat64Sign( b ); 3125 zSign = aSign ^ bSign; 3126 if ( aExp == 0x7FF ) { 3127 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3128 return propagateFloat64NaN( a, b STATUS_VAR ); 3129 } 3130 if ( ( bExp | bSig ) == 0 ) { 3131 float_raise( float_flag_invalid STATUS_VAR); 3132 return float64_default_nan; 3133 } 3134 return packFloat64( zSign, 0x7FF, 0 ); 3135 } 3136 if ( bExp == 0x7FF ) { 3137 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3138 if ( ( aExp | aSig ) == 0 ) { 3139 float_raise( float_flag_invalid STATUS_VAR); 3140 return float64_default_nan; 3141 } 3142 return packFloat64( zSign, 0x7FF, 0 ); 3143 } 3144 if ( aExp == 0 ) { 3145 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3146 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3147 } 3148 if ( bExp == 0 ) { 3149 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 3150 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3151 } 3152 zExp = aExp + bExp - 0x3FF; 3153 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3154 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3155 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3156 zSig0 |= ( zSig1 != 0 ); 3157 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 3158 zSig0 <<= 1; 3159 --zExp; 3160 } 3161 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 3162 3163 } 3164 3165 /*---------------------------------------------------------------------------- 3166 | Returns the result of dividing the double-precision floating-point value `a' 3167 | by the corresponding value `b'. The operation is performed according to 3168 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3169 *----------------------------------------------------------------------------*/ 3170 3171 float64 float64_div( float64 a, float64 b STATUS_PARAM ) 3172 { 3173 flag aSign, bSign, zSign; 3174 int16 aExp, bExp, zExp; 3175 bits64 aSig, bSig, zSig; 3176 bits64 rem0, rem1; 3177 bits64 term0, term1; 3178 3179 aSig = extractFloat64Frac( a ); 3180 aExp = extractFloat64Exp( a ); 3181 aSign = extractFloat64Sign( a ); 3182 bSig = extractFloat64Frac( b ); 3183 bExp = extractFloat64Exp( b ); 3184 bSign = extractFloat64Sign( b ); 3185 zSign = aSign ^ bSign; 3186 if ( aExp == 0x7FF ) { 3187 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3188 if ( bExp == 0x7FF ) { 3189 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3190 float_raise( float_flag_invalid STATUS_VAR); 3191 return float64_default_nan; 3192 } 3193 return packFloat64( zSign, 0x7FF, 0 ); 3194 } 3195 if ( bExp == 0x7FF ) { 3196 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3197 return packFloat64( zSign, 0, 0 ); 3198 } 3199 if ( bExp == 0 ) { 3200 if ( bSig == 0 ) { 3201 if ( ( aExp | aSig ) == 0 ) { 3202 float_raise( float_flag_invalid STATUS_VAR); 3203 return float64_default_nan; 3204 } 3205 float_raise( float_flag_divbyzero STATUS_VAR); 3206 return packFloat64( zSign, 0x7FF, 0 ); 3207 } 3208 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3209 } 3210 if ( aExp == 0 ) { 3211 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3212 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3213 } 3214 zExp = aExp - bExp + 0x3FD; 3215 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3216 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3217 if ( bSig <= ( aSig + aSig ) ) { 3218 aSig >>= 1; 3219 ++zExp; 3220 } 3221 zSig = estimateDiv128To64( aSig, 0, bSig ); 3222 if ( ( zSig & 0x1FF ) <= 2 ) { 3223 mul64To128( bSig, zSig, &term0, &term1 ); 3224 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3225 while ( (sbits64) rem0 < 0 ) { 3226 --zSig; 3227 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3228 } 3229 zSig |= ( rem1 != 0 ); 3230 } 3231 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3232 3233 } 3234 3235 /*---------------------------------------------------------------------------- 3236 | Returns the remainder of the double-precision floating-point value `a' 3237 | with respect to the corresponding value `b'. The operation is performed 3238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3239 *----------------------------------------------------------------------------*/ 3240 3241 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 3242 { 3243 flag aSign, zSign; 3244 int16 aExp, bExp, expDiff; 3245 bits64 aSig, bSig; 3246 bits64 q, alternateASig; 3247 sbits64 sigMean; 3248 3249 aSig = extractFloat64Frac( a ); 3250 aExp = extractFloat64Exp( a ); 3251 aSign = extractFloat64Sign( a ); 3252 bSig = extractFloat64Frac( b ); 3253 bExp = extractFloat64Exp( b ); 3254 if ( aExp == 0x7FF ) { 3255 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3256 return propagateFloat64NaN( a, b STATUS_VAR ); 3257 } 3258 float_raise( float_flag_invalid STATUS_VAR); 3259 return float64_default_nan; 3260 } 3261 if ( bExp == 0x7FF ) { 3262 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3263 return a; 3264 } 3265 if ( bExp == 0 ) { 3266 if ( bSig == 0 ) { 3267 float_raise( float_flag_invalid STATUS_VAR); 3268 return float64_default_nan; 3269 } 3270 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3271 } 3272 if ( aExp == 0 ) { 3273 if ( aSig == 0 ) return a; 3274 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3275 } 3276 expDiff = aExp - bExp; 3277 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3278 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3279 if ( expDiff < 0 ) { 3280 if ( expDiff < -1 ) return a; 3281 aSig >>= 1; 3282 } 3283 q = ( bSig <= aSig ); 3284 if ( q ) aSig -= bSig; 3285 expDiff -= 64; 3286 while ( 0 < expDiff ) { 3287 q = estimateDiv128To64( aSig, 0, bSig ); 3288 q = ( 2 < q ) ? q - 2 : 0; 3289 aSig = - ( ( bSig>>2 ) * q ); 3290 expDiff -= 62; 3291 } 3292 expDiff += 64; 3293 if ( 0 < expDiff ) { 3294 q = estimateDiv128To64( aSig, 0, bSig ); 3295 q = ( 2 < q ) ? q - 2 : 0; 3296 q >>= 64 - expDiff; 3297 bSig >>= 2; 3298 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3299 } 3300 else { 3301 aSig >>= 2; 3302 bSig >>= 2; 3303 } 3304 do { 3305 alternateASig = aSig; 3306 ++q; 3307 aSig -= bSig; 3308 } while ( 0 <= (sbits64) aSig ); 3309 sigMean = aSig + alternateASig; 3310 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3311 aSig = alternateASig; 3312 } 3313 zSign = ( (sbits64) aSig < 0 ); 3314 if ( zSign ) aSig = - aSig; 3315 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 3316 3317 } 3318 3319 /*---------------------------------------------------------------------------- 3320 | Returns the square root of the double-precision floating-point value `a'. 3321 | The operation is performed according to the IEC/IEEE Standard for Binary 3322 | Floating-Point Arithmetic. 3323 *----------------------------------------------------------------------------*/ 3324 3325 float64 float64_sqrt( float64 a STATUS_PARAM ) 3326 { 3327 flag aSign; 3328 int16 aExp, zExp; 3329 bits64 aSig, zSig, doubleZSig; 3330 bits64 rem0, rem1, term0, term1; 3331 3332 aSig = extractFloat64Frac( a ); 3333 aExp = extractFloat64Exp( a ); 3334 aSign = extractFloat64Sign( a ); 3335 if ( aExp == 0x7FF ) { 3336 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 3337 if ( ! aSign ) return a; 3338 float_raise( float_flag_invalid STATUS_VAR); 3339 return float64_default_nan; 3340 } 3341 if ( aSign ) { 3342 if ( ( aExp | aSig ) == 0 ) return a; 3343 float_raise( float_flag_invalid STATUS_VAR); 3344 return float64_default_nan; 3345 } 3346 if ( aExp == 0 ) { 3347 if ( aSig == 0 ) return float64_zero; 3348 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3349 } 3350 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3351 aSig |= LIT64( 0x0010000000000000 ); 3352 zSig = estimateSqrt32( aExp, aSig>>21 ); 3353 aSig <<= 9 - ( aExp & 1 ); 3354 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3355 if ( ( zSig & 0x1FF ) <= 5 ) { 3356 doubleZSig = zSig<<1; 3357 mul64To128( zSig, zSig, &term0, &term1 ); 3358 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3359 while ( (sbits64) rem0 < 0 ) { 3360 --zSig; 3361 doubleZSig -= 2; 3362 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3363 } 3364 zSig |= ( ( rem0 | rem1 ) != 0 ); 3365 } 3366 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 3367 3368 } 3369 3370 /*---------------------------------------------------------------------------- 3371 | Returns the binary log of the double-precision floating-point value `a'. 3372 | The operation is performed according to the IEC/IEEE Standard for Binary 3373 | Floating-Point Arithmetic. 3374 *----------------------------------------------------------------------------*/ 3375 float64 float64_log2( float64 a STATUS_PARAM ) 3376 { 3377 flag aSign, zSign; 3378 int16 aExp; 3379 bits64 aSig, aSig0, aSig1, zSig, i; 3380 3381 aSig = extractFloat64Frac( a ); 3382 aExp = extractFloat64Exp( a ); 3383 aSign = extractFloat64Sign( a ); 3384 3385 if ( aExp == 0 ) { 3386 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 3387 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3388 } 3389 if ( aSign ) { 3390 float_raise( float_flag_invalid STATUS_VAR); 3391 return float64_default_nan; 3392 } 3393 if ( aExp == 0x7FF ) { 3394 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); 3395 return a; 3396 } 3397 3398 aExp -= 0x3FF; 3399 aSig |= LIT64( 0x0010000000000000 ); 3400 zSign = aExp < 0; 3401 zSig = (bits64)aExp << 52; 3402 for (i = 1LL << 51; i > 0; i >>= 1) { 3403 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 3404 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 3405 if ( aSig & LIT64( 0x0020000000000000 ) ) { 3406 aSig >>= 1; 3407 zSig |= i; 3408 } 3409 } 3410 3411 if ( zSign ) 3412 zSig = -zSig; 3413 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); 3414 } 3415 3416 /*---------------------------------------------------------------------------- 3417 | Returns 1 if the double-precision floating-point value `a' is equal to the 3418 | corresponding value `b', and 0 otherwise. The comparison is performed 3419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3420 *----------------------------------------------------------------------------*/ 3421 3422 int float64_eq( float64 a, float64 b STATUS_PARAM ) 3423 { 3424 bits64 av, bv; 3425 3426 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3427 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3428 ) { 3429 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3430 float_raise( float_flag_invalid STATUS_VAR); 3431 } 3432 return 0; 3433 } 3434 av = float64_val(a); 3435 bv = float64_val(b); 3436 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3437 3438 } 3439 3440 /*---------------------------------------------------------------------------- 3441 | Returns 1 if the double-precision floating-point value `a' is less than or 3442 | equal to the corresponding value `b', and 0 otherwise. The comparison is 3443 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3444 | Arithmetic. 3445 *----------------------------------------------------------------------------*/ 3446 3447 int float64_le( float64 a, float64 b STATUS_PARAM ) 3448 { 3449 flag aSign, bSign; 3450 bits64 av, bv; 3451 3452 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3453 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3454 ) { 3455 float_raise( float_flag_invalid STATUS_VAR); 3456 return 0; 3457 } 3458 aSign = extractFloat64Sign( a ); 3459 bSign = extractFloat64Sign( b ); 3460 av = float64_val(a); 3461 bv = float64_val(b); 3462 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3463 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3464 3465 } 3466 3467 /*---------------------------------------------------------------------------- 3468 | Returns 1 if the double-precision floating-point value `a' is less than 3469 | the corresponding value `b', and 0 otherwise. The comparison is performed 3470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3471 *----------------------------------------------------------------------------*/ 3472 3473 int float64_lt( float64 a, float64 b STATUS_PARAM ) 3474 { 3475 flag aSign, bSign; 3476 bits64 av, bv; 3477 3478 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3479 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3480 ) { 3481 float_raise( float_flag_invalid STATUS_VAR); 3482 return 0; 3483 } 3484 aSign = extractFloat64Sign( a ); 3485 bSign = extractFloat64Sign( b ); 3486 av = float64_val(a); 3487 bv = float64_val(b); 3488 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3489 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3490 3491 } 3492 3493 /*---------------------------------------------------------------------------- 3494 | Returns 1 if the double-precision floating-point value `a' is equal to the 3495 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3496 | if either operand is a NaN. Otherwise, the comparison is performed 3497 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3498 *----------------------------------------------------------------------------*/ 3499 3500 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM ) 3501 { 3502 bits64 av, bv; 3503 3504 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3505 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3506 ) { 3507 float_raise( float_flag_invalid STATUS_VAR); 3508 return 0; 3509 } 3510 av = float64_val(a); 3511 bv = float64_val(b); 3512 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3513 3514 } 3515 3516 /*---------------------------------------------------------------------------- 3517 | Returns 1 if the double-precision floating-point value `a' is less than or 3518 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3519 | cause an exception. Otherwise, the comparison is performed according to the 3520 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3521 *----------------------------------------------------------------------------*/ 3522 3523 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 3524 { 3525 flag aSign, bSign; 3526 bits64 av, bv; 3527 3528 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3529 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3530 ) { 3531 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3532 float_raise( float_flag_invalid STATUS_VAR); 3533 } 3534 return 0; 3535 } 3536 aSign = extractFloat64Sign( a ); 3537 bSign = extractFloat64Sign( b ); 3538 av = float64_val(a); 3539 bv = float64_val(b); 3540 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3541 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3542 3543 } 3544 3545 /*---------------------------------------------------------------------------- 3546 | Returns 1 if the double-precision floating-point value `a' is less than 3547 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3548 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3549 | Standard for Binary Floating-Point Arithmetic. 3550 *----------------------------------------------------------------------------*/ 3551 3552 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 3553 { 3554 flag aSign, bSign; 3555 bits64 av, bv; 3556 3557 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3558 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3559 ) { 3560 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3561 float_raise( float_flag_invalid STATUS_VAR); 3562 } 3563 return 0; 3564 } 3565 aSign = extractFloat64Sign( a ); 3566 bSign = extractFloat64Sign( b ); 3567 av = float64_val(a); 3568 bv = float64_val(b); 3569 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3570 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3571 3572 } 3573 3574 #ifdef FLOATX80 3575 3576 /*---------------------------------------------------------------------------- 3577 | Returns the result of converting the extended double-precision floating- 3578 | point value `a' to the 32-bit two's complement integer format. The 3579 | conversion is performed according to the IEC/IEEE Standard for Binary 3580 | Floating-Point Arithmetic---which means in particular that the conversion 3581 | is rounded according to the current rounding mode. If `a' is a NaN, the 3582 | largest positive integer is returned. Otherwise, if the conversion 3583 | overflows, the largest integer with the same sign as `a' is returned. 3584 *----------------------------------------------------------------------------*/ 3585 3586 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 3587 { 3588 flag aSign; 3589 int32 aExp, shiftCount; 3590 bits64 aSig; 3591 3592 aSig = extractFloatx80Frac( a ); 3593 aExp = extractFloatx80Exp( a ); 3594 aSign = extractFloatx80Sign( a ); 3595 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3596 shiftCount = 0x4037 - aExp; 3597 if ( shiftCount <= 0 ) shiftCount = 1; 3598 shift64RightJamming( aSig, shiftCount, &aSig ); 3599 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 3600 3601 } 3602 3603 /*---------------------------------------------------------------------------- 3604 | Returns the result of converting the extended double-precision floating- 3605 | point value `a' to the 32-bit two's complement integer format. The 3606 | conversion is performed according to the IEC/IEEE Standard for Binary 3607 | Floating-Point Arithmetic, except that the conversion is always rounded 3608 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3609 | Otherwise, if the conversion overflows, the largest integer with the same 3610 | sign as `a' is returned. 3611 *----------------------------------------------------------------------------*/ 3612 3613 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 3614 { 3615 flag aSign; 3616 int32 aExp, shiftCount; 3617 bits64 aSig, savedASig; 3618 int32 z; 3619 3620 aSig = extractFloatx80Frac( a ); 3621 aExp = extractFloatx80Exp( a ); 3622 aSign = extractFloatx80Sign( a ); 3623 if ( 0x401E < aExp ) { 3624 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3625 goto invalid; 3626 } 3627 else if ( aExp < 0x3FFF ) { 3628 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3629 return 0; 3630 } 3631 shiftCount = 0x403E - aExp; 3632 savedASig = aSig; 3633 aSig >>= shiftCount; 3634 z = aSig; 3635 if ( aSign ) z = - z; 3636 if ( ( z < 0 ) ^ aSign ) { 3637 invalid: 3638 float_raise( float_flag_invalid STATUS_VAR); 3639 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3640 } 3641 if ( ( aSig<<shiftCount ) != savedASig ) { 3642 STATUS(float_exception_flags) |= float_flag_inexact; 3643 } 3644 return z; 3645 3646 } 3647 3648 /*---------------------------------------------------------------------------- 3649 | Returns the result of converting the extended double-precision floating- 3650 | point value `a' to the 64-bit two's complement integer format. The 3651 | conversion is performed according to the IEC/IEEE Standard for Binary 3652 | Floating-Point Arithmetic---which means in particular that the conversion 3653 | is rounded according to the current rounding mode. If `a' is a NaN, 3654 | the largest positive integer is returned. Otherwise, if the conversion 3655 | overflows, the largest integer with the same sign as `a' is returned. 3656 *----------------------------------------------------------------------------*/ 3657 3658 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 3659 { 3660 flag aSign; 3661 int32 aExp, shiftCount; 3662 bits64 aSig, aSigExtra; 3663 3664 aSig = extractFloatx80Frac( a ); 3665 aExp = extractFloatx80Exp( a ); 3666 aSign = extractFloatx80Sign( a ); 3667 shiftCount = 0x403E - aExp; 3668 if ( shiftCount <= 0 ) { 3669 if ( shiftCount ) { 3670 float_raise( float_flag_invalid STATUS_VAR); 3671 if ( ! aSign 3672 || ( ( aExp == 0x7FFF ) 3673 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3674 ) { 3675 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3676 } 3677 return (sbits64) LIT64( 0x8000000000000000 ); 3678 } 3679 aSigExtra = 0; 3680 } 3681 else { 3682 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3683 } 3684 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3685 3686 } 3687 3688 /*---------------------------------------------------------------------------- 3689 | Returns the result of converting the extended double-precision floating- 3690 | point value `a' to the 64-bit two's complement integer format. The 3691 | conversion is performed according to the IEC/IEEE Standard for Binary 3692 | Floating-Point Arithmetic, except that the conversion is always rounded 3693 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3694 | Otherwise, if the conversion overflows, the largest integer with the same 3695 | sign as `a' is returned. 3696 *----------------------------------------------------------------------------*/ 3697 3698 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 3699 { 3700 flag aSign; 3701 int32 aExp, shiftCount; 3702 bits64 aSig; 3703 int64 z; 3704 3705 aSig = extractFloatx80Frac( a ); 3706 aExp = extractFloatx80Exp( a ); 3707 aSign = extractFloatx80Sign( a ); 3708 shiftCount = aExp - 0x403E; 3709 if ( 0 <= shiftCount ) { 3710 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3711 if ( ( a.high != 0xC03E ) || aSig ) { 3712 float_raise( float_flag_invalid STATUS_VAR); 3713 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3714 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3715 } 3716 } 3717 return (sbits64) LIT64( 0x8000000000000000 ); 3718 } 3719 else if ( aExp < 0x3FFF ) { 3720 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3721 return 0; 3722 } 3723 z = aSig>>( - shiftCount ); 3724 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3725 STATUS(float_exception_flags) |= float_flag_inexact; 3726 } 3727 if ( aSign ) z = - z; 3728 return z; 3729 3730 } 3731 3732 /*---------------------------------------------------------------------------- 3733 | Returns the result of converting the extended double-precision floating- 3734 | point value `a' to the single-precision floating-point format. The 3735 | conversion is performed according to the IEC/IEEE Standard for Binary 3736 | Floating-Point Arithmetic. 3737 *----------------------------------------------------------------------------*/ 3738 3739 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 3740 { 3741 flag aSign; 3742 int32 aExp; 3743 bits64 aSig; 3744 3745 aSig = extractFloatx80Frac( a ); 3746 aExp = extractFloatx80Exp( a ); 3747 aSign = extractFloatx80Sign( a ); 3748 if ( aExp == 0x7FFF ) { 3749 if ( (bits64) ( aSig<<1 ) ) { 3750 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) ); 3751 } 3752 return packFloat32( aSign, 0xFF, 0 ); 3753 } 3754 shift64RightJamming( aSig, 33, &aSig ); 3755 if ( aExp || aSig ) aExp -= 0x3F81; 3756 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 3757 3758 } 3759 3760 /*---------------------------------------------------------------------------- 3761 | Returns the result of converting the extended double-precision floating- 3762 | point value `a' to the double-precision floating-point format. The 3763 | conversion is performed according to the IEC/IEEE Standard for Binary 3764 | Floating-Point Arithmetic. 3765 *----------------------------------------------------------------------------*/ 3766 3767 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 3768 { 3769 flag aSign; 3770 int32 aExp; 3771 bits64 aSig, zSig; 3772 3773 aSig = extractFloatx80Frac( a ); 3774 aExp = extractFloatx80Exp( a ); 3775 aSign = extractFloatx80Sign( a ); 3776 if ( aExp == 0x7FFF ) { 3777 if ( (bits64) ( aSig<<1 ) ) { 3778 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) ); 3779 } 3780 return packFloat64( aSign, 0x7FF, 0 ); 3781 } 3782 shift64RightJamming( aSig, 1, &zSig ); 3783 if ( aExp || aSig ) aExp -= 0x3C01; 3784 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 3785 3786 } 3787 3788 #ifdef FLOAT128 3789 3790 /*---------------------------------------------------------------------------- 3791 | Returns the result of converting the extended double-precision floating- 3792 | point value `a' to the quadruple-precision floating-point format. The 3793 | conversion is performed according to the IEC/IEEE Standard for Binary 3794 | Floating-Point Arithmetic. 3795 *----------------------------------------------------------------------------*/ 3796 3797 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 3798 { 3799 flag aSign; 3800 int16 aExp; 3801 bits64 aSig, zSig0, zSig1; 3802 3803 aSig = extractFloatx80Frac( a ); 3804 aExp = extractFloatx80Exp( a ); 3805 aSign = extractFloatx80Sign( a ); 3806 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3807 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) ); 3808 } 3809 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3810 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3811 3812 } 3813 3814 #endif 3815 3816 /*---------------------------------------------------------------------------- 3817 | Rounds the extended double-precision floating-point value `a' to an integer, 3818 | and returns the result as an extended quadruple-precision floating-point 3819 | value. The operation is performed according to the IEC/IEEE Standard for 3820 | Binary Floating-Point Arithmetic. 3821 *----------------------------------------------------------------------------*/ 3822 3823 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 3824 { 3825 flag aSign; 3826 int32 aExp; 3827 bits64 lastBitMask, roundBitsMask; 3828 int8 roundingMode; 3829 floatx80 z; 3830 3831 aExp = extractFloatx80Exp( a ); 3832 if ( 0x403E <= aExp ) { 3833 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3834 return propagateFloatx80NaN( a, a STATUS_VAR ); 3835 } 3836 return a; 3837 } 3838 if ( aExp < 0x3FFF ) { 3839 if ( ( aExp == 0 ) 3840 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3841 return a; 3842 } 3843 STATUS(float_exception_flags) |= float_flag_inexact; 3844 aSign = extractFloatx80Sign( a ); 3845 switch ( STATUS(float_rounding_mode) ) { 3846 case float_round_nearest_even: 3847 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3848 ) { 3849 return 3850 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3851 } 3852 break; 3853 case float_round_down: 3854 return 3855 aSign ? 3856 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3857 : packFloatx80( 0, 0, 0 ); 3858 case float_round_up: 3859 return 3860 aSign ? packFloatx80( 1, 0, 0 ) 3861 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3862 } 3863 return packFloatx80( aSign, 0, 0 ); 3864 } 3865 lastBitMask = 1; 3866 lastBitMask <<= 0x403E - aExp; 3867 roundBitsMask = lastBitMask - 1; 3868 z = a; 3869 roundingMode = STATUS(float_rounding_mode); 3870 if ( roundingMode == float_round_nearest_even ) { 3871 z.low += lastBitMask>>1; 3872 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 3873 } 3874 else if ( roundingMode != float_round_to_zero ) { 3875 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 3876 z.low += roundBitsMask; 3877 } 3878 } 3879 z.low &= ~ roundBitsMask; 3880 if ( z.low == 0 ) { 3881 ++z.high; 3882 z.low = LIT64( 0x8000000000000000 ); 3883 } 3884 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 3885 return z; 3886 3887 } 3888 3889 /*---------------------------------------------------------------------------- 3890 | Returns the result of adding the absolute values of the extended double- 3891 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 3892 | negated before being returned. `zSign' is ignored if the result is a NaN. 3893 | The addition is performed according to the IEC/IEEE Standard for Binary 3894 | Floating-Point Arithmetic. 3895 *----------------------------------------------------------------------------*/ 3896 3897 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 3898 { 3899 int32 aExp, bExp, zExp; 3900 bits64 aSig, bSig, zSig0, zSig1; 3901 int32 expDiff; 3902 3903 aSig = extractFloatx80Frac( a ); 3904 aExp = extractFloatx80Exp( a ); 3905 bSig = extractFloatx80Frac( b ); 3906 bExp = extractFloatx80Exp( b ); 3907 expDiff = aExp - bExp; 3908 if ( 0 < expDiff ) { 3909 if ( aExp == 0x7FFF ) { 3910 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3911 return a; 3912 } 3913 if ( bExp == 0 ) --expDiff; 3914 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 3915 zExp = aExp; 3916 } 3917 else if ( expDiff < 0 ) { 3918 if ( bExp == 0x7FFF ) { 3919 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3920 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3921 } 3922 if ( aExp == 0 ) ++expDiff; 3923 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 3924 zExp = bExp; 3925 } 3926 else { 3927 if ( aExp == 0x7FFF ) { 3928 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3929 return propagateFloatx80NaN( a, b STATUS_VAR ); 3930 } 3931 return a; 3932 } 3933 zSig1 = 0; 3934 zSig0 = aSig + bSig; 3935 if ( aExp == 0 ) { 3936 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 3937 goto roundAndPack; 3938 } 3939 zExp = aExp; 3940 goto shiftRight1; 3941 } 3942 zSig0 = aSig + bSig; 3943 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 3944 shiftRight1: 3945 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 3946 zSig0 |= LIT64( 0x8000000000000000 ); 3947 ++zExp; 3948 roundAndPack: 3949 return 3950 roundAndPackFloatx80( 3951 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 3952 3953 } 3954 3955 /*---------------------------------------------------------------------------- 3956 | Returns the result of subtracting the absolute values of the extended 3957 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 3958 | difference is negated before being returned. `zSign' is ignored if the 3959 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3960 | Standard for Binary Floating-Point Arithmetic. 3961 *----------------------------------------------------------------------------*/ 3962 3963 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 3964 { 3965 int32 aExp, bExp, zExp; 3966 bits64 aSig, bSig, zSig0, zSig1; 3967 int32 expDiff; 3968 floatx80 z; 3969 3970 aSig = extractFloatx80Frac( a ); 3971 aExp = extractFloatx80Exp( a ); 3972 bSig = extractFloatx80Frac( b ); 3973 bExp = extractFloatx80Exp( b ); 3974 expDiff = aExp - bExp; 3975 if ( 0 < expDiff ) goto aExpBigger; 3976 if ( expDiff < 0 ) goto bExpBigger; 3977 if ( aExp == 0x7FFF ) { 3978 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 3979 return propagateFloatx80NaN( a, b STATUS_VAR ); 3980 } 3981 float_raise( float_flag_invalid STATUS_VAR); 3982 z.low = floatx80_default_nan_low; 3983 z.high = floatx80_default_nan_high; 3984 return z; 3985 } 3986 if ( aExp == 0 ) { 3987 aExp = 1; 3988 bExp = 1; 3989 } 3990 zSig1 = 0; 3991 if ( bSig < aSig ) goto aBigger; 3992 if ( aSig < bSig ) goto bBigger; 3993 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3994 bExpBigger: 3995 if ( bExp == 0x7FFF ) { 3996 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 3997 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3998 } 3999 if ( aExp == 0 ) ++expDiff; 4000 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4001 bBigger: 4002 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4003 zExp = bExp; 4004 zSign ^= 1; 4005 goto normalizeRoundAndPack; 4006 aExpBigger: 4007 if ( aExp == 0x7FFF ) { 4008 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4009 return a; 4010 } 4011 if ( bExp == 0 ) --expDiff; 4012 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4013 aBigger: 4014 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4015 zExp = aExp; 4016 normalizeRoundAndPack: 4017 return 4018 normalizeRoundAndPackFloatx80( 4019 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4020 4021 } 4022 4023 /*---------------------------------------------------------------------------- 4024 | Returns the result of adding the extended double-precision floating-point 4025 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4026 | Standard for Binary Floating-Point Arithmetic. 4027 *----------------------------------------------------------------------------*/ 4028 4029 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 4030 { 4031 flag aSign, bSign; 4032 4033 aSign = extractFloatx80Sign( a ); 4034 bSign = extractFloatx80Sign( b ); 4035 if ( aSign == bSign ) { 4036 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4037 } 4038 else { 4039 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4040 } 4041 4042 } 4043 4044 /*---------------------------------------------------------------------------- 4045 | Returns the result of subtracting the extended double-precision floating- 4046 | point values `a' and `b'. The operation is performed according to the 4047 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4048 *----------------------------------------------------------------------------*/ 4049 4050 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 4051 { 4052 flag aSign, bSign; 4053 4054 aSign = extractFloatx80Sign( a ); 4055 bSign = extractFloatx80Sign( b ); 4056 if ( aSign == bSign ) { 4057 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4058 } 4059 else { 4060 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4061 } 4062 4063 } 4064 4065 /*---------------------------------------------------------------------------- 4066 | Returns the result of multiplying the extended double-precision floating- 4067 | point values `a' and `b'. The operation is performed according to the 4068 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4069 *----------------------------------------------------------------------------*/ 4070 4071 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 4072 { 4073 flag aSign, bSign, zSign; 4074 int32 aExp, bExp, zExp; 4075 bits64 aSig, bSig, zSig0, zSig1; 4076 floatx80 z; 4077 4078 aSig = extractFloatx80Frac( a ); 4079 aExp = extractFloatx80Exp( a ); 4080 aSign = extractFloatx80Sign( a ); 4081 bSig = extractFloatx80Frac( b ); 4082 bExp = extractFloatx80Exp( b ); 4083 bSign = extractFloatx80Sign( b ); 4084 zSign = aSign ^ bSign; 4085 if ( aExp == 0x7FFF ) { 4086 if ( (bits64) ( aSig<<1 ) 4087 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 4088 return propagateFloatx80NaN( a, b STATUS_VAR ); 4089 } 4090 if ( ( bExp | bSig ) == 0 ) goto invalid; 4091 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4092 } 4093 if ( bExp == 0x7FFF ) { 4094 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4095 if ( ( aExp | aSig ) == 0 ) { 4096 invalid: 4097 float_raise( float_flag_invalid STATUS_VAR); 4098 z.low = floatx80_default_nan_low; 4099 z.high = floatx80_default_nan_high; 4100 return z; 4101 } 4102 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4103 } 4104 if ( aExp == 0 ) { 4105 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4106 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4107 } 4108 if ( bExp == 0 ) { 4109 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4110 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4111 } 4112 zExp = aExp + bExp - 0x3FFE; 4113 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4114 if ( 0 < (sbits64) zSig0 ) { 4115 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4116 --zExp; 4117 } 4118 return 4119 roundAndPackFloatx80( 4120 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4121 4122 } 4123 4124 /*---------------------------------------------------------------------------- 4125 | Returns the result of dividing the extended double-precision floating-point 4126 | value `a' by the corresponding value `b'. The operation is performed 4127 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4128 *----------------------------------------------------------------------------*/ 4129 4130 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 4131 { 4132 flag aSign, bSign, zSign; 4133 int32 aExp, bExp, zExp; 4134 bits64 aSig, bSig, zSig0, zSig1; 4135 bits64 rem0, rem1, rem2, term0, term1, term2; 4136 floatx80 z; 4137 4138 aSig = extractFloatx80Frac( a ); 4139 aExp = extractFloatx80Exp( a ); 4140 aSign = extractFloatx80Sign( a ); 4141 bSig = extractFloatx80Frac( b ); 4142 bExp = extractFloatx80Exp( b ); 4143 bSign = extractFloatx80Sign( b ); 4144 zSign = aSign ^ bSign; 4145 if ( aExp == 0x7FFF ) { 4146 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4147 if ( bExp == 0x7FFF ) { 4148 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4149 goto invalid; 4150 } 4151 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4152 } 4153 if ( bExp == 0x7FFF ) { 4154 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4155 return packFloatx80( zSign, 0, 0 ); 4156 } 4157 if ( bExp == 0 ) { 4158 if ( bSig == 0 ) { 4159 if ( ( aExp | aSig ) == 0 ) { 4160 invalid: 4161 float_raise( float_flag_invalid STATUS_VAR); 4162 z.low = floatx80_default_nan_low; 4163 z.high = floatx80_default_nan_high; 4164 return z; 4165 } 4166 float_raise( float_flag_divbyzero STATUS_VAR); 4167 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4168 } 4169 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4170 } 4171 if ( aExp == 0 ) { 4172 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4173 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4174 } 4175 zExp = aExp - bExp + 0x3FFE; 4176 rem1 = 0; 4177 if ( bSig <= aSig ) { 4178 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 4179 ++zExp; 4180 } 4181 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 4182 mul64To128( bSig, zSig0, &term0, &term1 ); 4183 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 4184 while ( (sbits64) rem0 < 0 ) { 4185 --zSig0; 4186 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4187 } 4188 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 4189 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 4190 mul64To128( bSig, zSig1, &term1, &term2 ); 4191 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4192 while ( (sbits64) rem1 < 0 ) { 4193 --zSig1; 4194 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 4195 } 4196 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 4197 } 4198 return 4199 roundAndPackFloatx80( 4200 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4201 4202 } 4203 4204 /*---------------------------------------------------------------------------- 4205 | Returns the remainder of the extended double-precision floating-point value 4206 | `a' with respect to the corresponding value `b'. The operation is performed 4207 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4208 *----------------------------------------------------------------------------*/ 4209 4210 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 4211 { 4212 flag aSign, zSign; 4213 int32 aExp, bExp, expDiff; 4214 bits64 aSig0, aSig1, bSig; 4215 bits64 q, term0, term1, alternateASig0, alternateASig1; 4216 floatx80 z; 4217 4218 aSig0 = extractFloatx80Frac( a ); 4219 aExp = extractFloatx80Exp( a ); 4220 aSign = extractFloatx80Sign( a ); 4221 bSig = extractFloatx80Frac( b ); 4222 bExp = extractFloatx80Exp( b ); 4223 if ( aExp == 0x7FFF ) { 4224 if ( (bits64) ( aSig0<<1 ) 4225 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 4226 return propagateFloatx80NaN( a, b STATUS_VAR ); 4227 } 4228 goto invalid; 4229 } 4230 if ( bExp == 0x7FFF ) { 4231 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4232 return a; 4233 } 4234 if ( bExp == 0 ) { 4235 if ( bSig == 0 ) { 4236 invalid: 4237 float_raise( float_flag_invalid STATUS_VAR); 4238 z.low = floatx80_default_nan_low; 4239 z.high = floatx80_default_nan_high; 4240 return z; 4241 } 4242 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4243 } 4244 if ( aExp == 0 ) { 4245 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 4246 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4247 } 4248 bSig |= LIT64( 0x8000000000000000 ); 4249 zSign = aSign; 4250 expDiff = aExp - bExp; 4251 aSig1 = 0; 4252 if ( expDiff < 0 ) { 4253 if ( expDiff < -1 ) return a; 4254 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4255 expDiff = 0; 4256 } 4257 q = ( bSig <= aSig0 ); 4258 if ( q ) aSig0 -= bSig; 4259 expDiff -= 64; 4260 while ( 0 < expDiff ) { 4261 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4262 q = ( 2 < q ) ? q - 2 : 0; 4263 mul64To128( bSig, q, &term0, &term1 ); 4264 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4265 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4266 expDiff -= 62; 4267 } 4268 expDiff += 64; 4269 if ( 0 < expDiff ) { 4270 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4271 q = ( 2 < q ) ? q - 2 : 0; 4272 q >>= 64 - expDiff; 4273 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4274 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4275 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4276 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4277 ++q; 4278 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4279 } 4280 } 4281 else { 4282 term1 = 0; 4283 term0 = bSig; 4284 } 4285 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4286 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4287 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4288 && ( q & 1 ) ) 4289 ) { 4290 aSig0 = alternateASig0; 4291 aSig1 = alternateASig1; 4292 zSign = ! zSign; 4293 } 4294 return 4295 normalizeRoundAndPackFloatx80( 4296 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 4297 4298 } 4299 4300 /*---------------------------------------------------------------------------- 4301 | Returns the square root of the extended double-precision floating-point 4302 | value `a'. The operation is performed according to the IEC/IEEE Standard 4303 | for Binary Floating-Point Arithmetic. 4304 *----------------------------------------------------------------------------*/ 4305 4306 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 4307 { 4308 flag aSign; 4309 int32 aExp, zExp; 4310 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4311 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4312 floatx80 z; 4313 4314 aSig0 = extractFloatx80Frac( a ); 4315 aExp = extractFloatx80Exp( a ); 4316 aSign = extractFloatx80Sign( a ); 4317 if ( aExp == 0x7FFF ) { 4318 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 4319 if ( ! aSign ) return a; 4320 goto invalid; 4321 } 4322 if ( aSign ) { 4323 if ( ( aExp | aSig0 ) == 0 ) return a; 4324 invalid: 4325 float_raise( float_flag_invalid STATUS_VAR); 4326 z.low = floatx80_default_nan_low; 4327 z.high = floatx80_default_nan_high; 4328 return z; 4329 } 4330 if ( aExp == 0 ) { 4331 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4332 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4333 } 4334 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4335 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4336 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4337 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4338 doubleZSig0 = zSig0<<1; 4339 mul64To128( zSig0, zSig0, &term0, &term1 ); 4340 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4341 while ( (sbits64) rem0 < 0 ) { 4342 --zSig0; 4343 doubleZSig0 -= 2; 4344 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4345 } 4346 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4347 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4348 if ( zSig1 == 0 ) zSig1 = 1; 4349 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4350 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4351 mul64To128( zSig1, zSig1, &term2, &term3 ); 4352 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4353 while ( (sbits64) rem1 < 0 ) { 4354 --zSig1; 4355 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4356 term3 |= 1; 4357 term2 |= doubleZSig0; 4358 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4359 } 4360 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4361 } 4362 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4363 zSig0 |= doubleZSig0; 4364 return 4365 roundAndPackFloatx80( 4366 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 4367 4368 } 4369 4370 /*---------------------------------------------------------------------------- 4371 | Returns 1 if the extended double-precision floating-point value `a' is 4372 | equal to the corresponding value `b', and 0 otherwise. The comparison is 4373 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4374 | Arithmetic. 4375 *----------------------------------------------------------------------------*/ 4376 4377 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 4378 { 4379 4380 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4381 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4382 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4383 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4384 ) { 4385 if ( floatx80_is_signaling_nan( a ) 4386 || floatx80_is_signaling_nan( b ) ) { 4387 float_raise( float_flag_invalid STATUS_VAR); 4388 } 4389 return 0; 4390 } 4391 return 4392 ( a.low == b.low ) 4393 && ( ( a.high == b.high ) 4394 || ( ( a.low == 0 ) 4395 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4396 ); 4397 4398 } 4399 4400 /*---------------------------------------------------------------------------- 4401 | Returns 1 if the extended double-precision floating-point value `a' is 4402 | less than or equal to the corresponding value `b', and 0 otherwise. The 4403 | comparison is performed according to the IEC/IEEE Standard for Binary 4404 | Floating-Point Arithmetic. 4405 *----------------------------------------------------------------------------*/ 4406 4407 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 4408 { 4409 flag aSign, bSign; 4410 4411 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4412 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4413 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4414 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4415 ) { 4416 float_raise( float_flag_invalid STATUS_VAR); 4417 return 0; 4418 } 4419 aSign = extractFloatx80Sign( a ); 4420 bSign = extractFloatx80Sign( b ); 4421 if ( aSign != bSign ) { 4422 return 4423 aSign 4424 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4425 == 0 ); 4426 } 4427 return 4428 aSign ? le128( b.high, b.low, a.high, a.low ) 4429 : le128( a.high, a.low, b.high, b.low ); 4430 4431 } 4432 4433 /*---------------------------------------------------------------------------- 4434 | Returns 1 if the extended double-precision floating-point value `a' is 4435 | less than the corresponding value `b', and 0 otherwise. The comparison 4436 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4437 | Arithmetic. 4438 *----------------------------------------------------------------------------*/ 4439 4440 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 4441 { 4442 flag aSign, bSign; 4443 4444 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4445 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4446 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4447 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4448 ) { 4449 float_raise( float_flag_invalid STATUS_VAR); 4450 return 0; 4451 } 4452 aSign = extractFloatx80Sign( a ); 4453 bSign = extractFloatx80Sign( b ); 4454 if ( aSign != bSign ) { 4455 return 4456 aSign 4457 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4458 != 0 ); 4459 } 4460 return 4461 aSign ? lt128( b.high, b.low, a.high, a.low ) 4462 : lt128( a.high, a.low, b.high, b.low ); 4463 4464 } 4465 4466 /*---------------------------------------------------------------------------- 4467 | Returns 1 if the extended double-precision floating-point value `a' is equal 4468 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4469 | raised if either operand is a NaN. Otherwise, the comparison is performed 4470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4471 *----------------------------------------------------------------------------*/ 4472 4473 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM ) 4474 { 4475 4476 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4477 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4478 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4479 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4480 ) { 4481 float_raise( float_flag_invalid STATUS_VAR); 4482 return 0; 4483 } 4484 return 4485 ( a.low == b.low ) 4486 && ( ( a.high == b.high ) 4487 || ( ( a.low == 0 ) 4488 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4489 ); 4490 4491 } 4492 4493 /*---------------------------------------------------------------------------- 4494 | Returns 1 if the extended double-precision floating-point value `a' is less 4495 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4496 | do not cause an exception. Otherwise, the comparison is performed according 4497 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4498 *----------------------------------------------------------------------------*/ 4499 4500 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4501 { 4502 flag aSign, bSign; 4503 4504 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4505 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4506 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4507 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4508 ) { 4509 if ( floatx80_is_signaling_nan( a ) 4510 || floatx80_is_signaling_nan( b ) ) { 4511 float_raise( float_flag_invalid STATUS_VAR); 4512 } 4513 return 0; 4514 } 4515 aSign = extractFloatx80Sign( a ); 4516 bSign = extractFloatx80Sign( b ); 4517 if ( aSign != bSign ) { 4518 return 4519 aSign 4520 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4521 == 0 ); 4522 } 4523 return 4524 aSign ? le128( b.high, b.low, a.high, a.low ) 4525 : le128( a.high, a.low, b.high, b.low ); 4526 4527 } 4528 4529 /*---------------------------------------------------------------------------- 4530 | Returns 1 if the extended double-precision floating-point value `a' is less 4531 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4532 | an exception. Otherwise, the comparison is performed according to the 4533 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4534 *----------------------------------------------------------------------------*/ 4535 4536 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4537 { 4538 flag aSign, bSign; 4539 4540 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4541 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4542 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4543 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4544 ) { 4545 if ( floatx80_is_signaling_nan( a ) 4546 || floatx80_is_signaling_nan( b ) ) { 4547 float_raise( float_flag_invalid STATUS_VAR); 4548 } 4549 return 0; 4550 } 4551 aSign = extractFloatx80Sign( a ); 4552 bSign = extractFloatx80Sign( b ); 4553 if ( aSign != bSign ) { 4554 return 4555 aSign 4556 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4557 != 0 ); 4558 } 4559 return 4560 aSign ? lt128( b.high, b.low, a.high, a.low ) 4561 : lt128( a.high, a.low, b.high, b.low ); 4562 4563 } 4564 4565 #endif 4566 4567 #ifdef FLOAT128 4568 4569 /*---------------------------------------------------------------------------- 4570 | Returns the result of converting the quadruple-precision floating-point 4571 | value `a' to the 32-bit two's complement integer format. The conversion 4572 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4573 | Arithmetic---which means in particular that the conversion is rounded 4574 | according to the current rounding mode. If `a' is a NaN, the largest 4575 | positive integer is returned. Otherwise, if the conversion overflows, the 4576 | largest integer with the same sign as `a' is returned. 4577 *----------------------------------------------------------------------------*/ 4578 4579 int32 float128_to_int32( float128 a STATUS_PARAM ) 4580 { 4581 flag aSign; 4582 int32 aExp, shiftCount; 4583 bits64 aSig0, aSig1; 4584 4585 aSig1 = extractFloat128Frac1( a ); 4586 aSig0 = extractFloat128Frac0( a ); 4587 aExp = extractFloat128Exp( a ); 4588 aSign = extractFloat128Sign( a ); 4589 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4590 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4591 aSig0 |= ( aSig1 != 0 ); 4592 shiftCount = 0x4028 - aExp; 4593 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4594 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 4595 4596 } 4597 4598 /*---------------------------------------------------------------------------- 4599 | Returns the result of converting the quadruple-precision floating-point 4600 | value `a' to the 32-bit two's complement integer format. The conversion 4601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4602 | Arithmetic, except that the conversion is always rounded toward zero. If 4603 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4604 | conversion overflows, the largest integer with the same sign as `a' is 4605 | returned. 4606 *----------------------------------------------------------------------------*/ 4607 4608 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 4609 { 4610 flag aSign; 4611 int32 aExp, shiftCount; 4612 bits64 aSig0, aSig1, savedASig; 4613 int32 z; 4614 4615 aSig1 = extractFloat128Frac1( a ); 4616 aSig0 = extractFloat128Frac0( a ); 4617 aExp = extractFloat128Exp( a ); 4618 aSign = extractFloat128Sign( a ); 4619 aSig0 |= ( aSig1 != 0 ); 4620 if ( 0x401E < aExp ) { 4621 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4622 goto invalid; 4623 } 4624 else if ( aExp < 0x3FFF ) { 4625 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 4626 return 0; 4627 } 4628 aSig0 |= LIT64( 0x0001000000000000 ); 4629 shiftCount = 0x402F - aExp; 4630 savedASig = aSig0; 4631 aSig0 >>= shiftCount; 4632 z = aSig0; 4633 if ( aSign ) z = - z; 4634 if ( ( z < 0 ) ^ aSign ) { 4635 invalid: 4636 float_raise( float_flag_invalid STATUS_VAR); 4637 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4638 } 4639 if ( ( aSig0<<shiftCount ) != savedASig ) { 4640 STATUS(float_exception_flags) |= float_flag_inexact; 4641 } 4642 return z; 4643 4644 } 4645 4646 /*---------------------------------------------------------------------------- 4647 | Returns the result of converting the quadruple-precision floating-point 4648 | value `a' to the 64-bit two's complement integer format. The conversion 4649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4650 | Arithmetic---which means in particular that the conversion is rounded 4651 | according to the current rounding mode. If `a' is a NaN, the largest 4652 | positive integer is returned. Otherwise, if the conversion overflows, the 4653 | largest integer with the same sign as `a' is returned. 4654 *----------------------------------------------------------------------------*/ 4655 4656 int64 float128_to_int64( float128 a STATUS_PARAM ) 4657 { 4658 flag aSign; 4659 int32 aExp, shiftCount; 4660 bits64 aSig0, aSig1; 4661 4662 aSig1 = extractFloat128Frac1( a ); 4663 aSig0 = extractFloat128Frac0( a ); 4664 aExp = extractFloat128Exp( a ); 4665 aSign = extractFloat128Sign( a ); 4666 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4667 shiftCount = 0x402F - aExp; 4668 if ( shiftCount <= 0 ) { 4669 if ( 0x403E < aExp ) { 4670 float_raise( float_flag_invalid STATUS_VAR); 4671 if ( ! aSign 4672 || ( ( aExp == 0x7FFF ) 4673 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4674 ) 4675 ) { 4676 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4677 } 4678 return (sbits64) LIT64( 0x8000000000000000 ); 4679 } 4680 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4681 } 4682 else { 4683 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4684 } 4685 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 4686 4687 } 4688 4689 /*---------------------------------------------------------------------------- 4690 | Returns the result of converting the quadruple-precision floating-point 4691 | value `a' to the 64-bit two's complement integer format. The conversion 4692 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4693 | Arithmetic, except that the conversion is always rounded toward zero. 4694 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4695 | the conversion overflows, the largest integer with the same sign as `a' is 4696 | returned. 4697 *----------------------------------------------------------------------------*/ 4698 4699 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 4700 { 4701 flag aSign; 4702 int32 aExp, shiftCount; 4703 bits64 aSig0, aSig1; 4704 int64 z; 4705 4706 aSig1 = extractFloat128Frac1( a ); 4707 aSig0 = extractFloat128Frac0( a ); 4708 aExp = extractFloat128Exp( a ); 4709 aSign = extractFloat128Sign( a ); 4710 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4711 shiftCount = aExp - 0x402F; 4712 if ( 0 < shiftCount ) { 4713 if ( 0x403E <= aExp ) { 4714 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4715 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4716 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4717 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 4718 } 4719 else { 4720 float_raise( float_flag_invalid STATUS_VAR); 4721 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4722 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4723 } 4724 } 4725 return (sbits64) LIT64( 0x8000000000000000 ); 4726 } 4727 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4728 if ( (bits64) ( aSig1<<shiftCount ) ) { 4729 STATUS(float_exception_flags) |= float_flag_inexact; 4730 } 4731 } 4732 else { 4733 if ( aExp < 0x3FFF ) { 4734 if ( aExp | aSig0 | aSig1 ) { 4735 STATUS(float_exception_flags) |= float_flag_inexact; 4736 } 4737 return 0; 4738 } 4739 z = aSig0>>( - shiftCount ); 4740 if ( aSig1 4741 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4742 STATUS(float_exception_flags) |= float_flag_inexact; 4743 } 4744 } 4745 if ( aSign ) z = - z; 4746 return z; 4747 4748 } 4749 4750 /*---------------------------------------------------------------------------- 4751 | Returns the result of converting the quadruple-precision floating-point 4752 | value `a' to the single-precision floating-point format. The conversion 4753 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4754 | Arithmetic. 4755 *----------------------------------------------------------------------------*/ 4756 4757 float32 float128_to_float32( float128 a STATUS_PARAM ) 4758 { 4759 flag aSign; 4760 int32 aExp; 4761 bits64 aSig0, aSig1; 4762 bits32 zSig; 4763 4764 aSig1 = extractFloat128Frac1( a ); 4765 aSig0 = extractFloat128Frac0( a ); 4766 aExp = extractFloat128Exp( a ); 4767 aSign = extractFloat128Sign( a ); 4768 if ( aExp == 0x7FFF ) { 4769 if ( aSig0 | aSig1 ) { 4770 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) ); 4771 } 4772 return packFloat32( aSign, 0xFF, 0 ); 4773 } 4774 aSig0 |= ( aSig1 != 0 ); 4775 shift64RightJamming( aSig0, 18, &aSig0 ); 4776 zSig = aSig0; 4777 if ( aExp || zSig ) { 4778 zSig |= 0x40000000; 4779 aExp -= 0x3F81; 4780 } 4781 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 4782 4783 } 4784 4785 /*---------------------------------------------------------------------------- 4786 | Returns the result of converting the quadruple-precision floating-point 4787 | value `a' to the double-precision floating-point format. The conversion 4788 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4789 | Arithmetic. 4790 *----------------------------------------------------------------------------*/ 4791 4792 float64 float128_to_float64( float128 a STATUS_PARAM ) 4793 { 4794 flag aSign; 4795 int32 aExp; 4796 bits64 aSig0, aSig1; 4797 4798 aSig1 = extractFloat128Frac1( a ); 4799 aSig0 = extractFloat128Frac0( a ); 4800 aExp = extractFloat128Exp( a ); 4801 aSign = extractFloat128Sign( a ); 4802 if ( aExp == 0x7FFF ) { 4803 if ( aSig0 | aSig1 ) { 4804 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) ); 4805 } 4806 return packFloat64( aSign, 0x7FF, 0 ); 4807 } 4808 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4809 aSig0 |= ( aSig1 != 0 ); 4810 if ( aExp || aSig0 ) { 4811 aSig0 |= LIT64( 0x4000000000000000 ); 4812 aExp -= 0x3C01; 4813 } 4814 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 4815 4816 } 4817 4818 #ifdef FLOATX80 4819 4820 /*---------------------------------------------------------------------------- 4821 | Returns the result of converting the quadruple-precision floating-point 4822 | value `a' to the extended double-precision floating-point format. The 4823 | conversion is performed according to the IEC/IEEE Standard for Binary 4824 | Floating-Point Arithmetic. 4825 *----------------------------------------------------------------------------*/ 4826 4827 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 4828 { 4829 flag aSign; 4830 int32 aExp; 4831 bits64 aSig0, aSig1; 4832 4833 aSig1 = extractFloat128Frac1( a ); 4834 aSig0 = extractFloat128Frac0( a ); 4835 aExp = extractFloat128Exp( a ); 4836 aSign = extractFloat128Sign( a ); 4837 if ( aExp == 0x7FFF ) { 4838 if ( aSig0 | aSig1 ) { 4839 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) ); 4840 } 4841 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4842 } 4843 if ( aExp == 0 ) { 4844 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4845 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4846 } 4847 else { 4848 aSig0 |= LIT64( 0x0001000000000000 ); 4849 } 4850 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4851 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 4852 4853 } 4854 4855 #endif 4856 4857 /*---------------------------------------------------------------------------- 4858 | Rounds the quadruple-precision floating-point value `a' to an integer, and 4859 | returns the result as a quadruple-precision floating-point value. The 4860 | operation is performed according to the IEC/IEEE Standard for Binary 4861 | Floating-Point Arithmetic. 4862 *----------------------------------------------------------------------------*/ 4863 4864 float128 float128_round_to_int( float128 a STATUS_PARAM ) 4865 { 4866 flag aSign; 4867 int32 aExp; 4868 bits64 lastBitMask, roundBitsMask; 4869 int8 roundingMode; 4870 float128 z; 4871 4872 aExp = extractFloat128Exp( a ); 4873 if ( 0x402F <= aExp ) { 4874 if ( 0x406F <= aExp ) { 4875 if ( ( aExp == 0x7FFF ) 4876 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 4877 ) { 4878 return propagateFloat128NaN( a, a STATUS_VAR ); 4879 } 4880 return a; 4881 } 4882 lastBitMask = 1; 4883 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 4884 roundBitsMask = lastBitMask - 1; 4885 z = a; 4886 roundingMode = STATUS(float_rounding_mode); 4887 if ( roundingMode == float_round_nearest_even ) { 4888 if ( lastBitMask ) { 4889 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 4890 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4891 } 4892 else { 4893 if ( (sbits64) z.low < 0 ) { 4894 ++z.high; 4895 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 4896 } 4897 } 4898 } 4899 else if ( roundingMode != float_round_to_zero ) { 4900 if ( extractFloat128Sign( z ) 4901 ^ ( roundingMode == float_round_up ) ) { 4902 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 4903 } 4904 } 4905 z.low &= ~ roundBitsMask; 4906 } 4907 else { 4908 if ( aExp < 0x3FFF ) { 4909 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 4910 STATUS(float_exception_flags) |= float_flag_inexact; 4911 aSign = extractFloat128Sign( a ); 4912 switch ( STATUS(float_rounding_mode) ) { 4913 case float_round_nearest_even: 4914 if ( ( aExp == 0x3FFE ) 4915 && ( extractFloat128Frac0( a ) 4916 | extractFloat128Frac1( a ) ) 4917 ) { 4918 return packFloat128( aSign, 0x3FFF, 0, 0 ); 4919 } 4920 break; 4921 case float_round_down: 4922 return 4923 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 4924 : packFloat128( 0, 0, 0, 0 ); 4925 case float_round_up: 4926 return 4927 aSign ? packFloat128( 1, 0, 0, 0 ) 4928 : packFloat128( 0, 0x3FFF, 0, 0 ); 4929 } 4930 return packFloat128( aSign, 0, 0, 0 ); 4931 } 4932 lastBitMask = 1; 4933 lastBitMask <<= 0x402F - aExp; 4934 roundBitsMask = lastBitMask - 1; 4935 z.low = 0; 4936 z.high = a.high; 4937 roundingMode = STATUS(float_rounding_mode); 4938 if ( roundingMode == float_round_nearest_even ) { 4939 z.high += lastBitMask>>1; 4940 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 4941 z.high &= ~ lastBitMask; 4942 } 4943 } 4944 else if ( roundingMode != float_round_to_zero ) { 4945 if ( extractFloat128Sign( z ) 4946 ^ ( roundingMode == float_round_up ) ) { 4947 z.high |= ( a.low != 0 ); 4948 z.high += roundBitsMask; 4949 } 4950 } 4951 z.high &= ~ roundBitsMask; 4952 } 4953 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 4954 STATUS(float_exception_flags) |= float_flag_inexact; 4955 } 4956 return z; 4957 4958 } 4959 4960 /*---------------------------------------------------------------------------- 4961 | Returns the result of adding the absolute values of the quadruple-precision 4962 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 4963 | before being returned. `zSign' is ignored if the result is a NaN. 4964 | The addition is performed according to the IEC/IEEE Standard for Binary 4965 | Floating-Point Arithmetic. 4966 *----------------------------------------------------------------------------*/ 4967 4968 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 4969 { 4970 int32 aExp, bExp, zExp; 4971 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 4972 int32 expDiff; 4973 4974 aSig1 = extractFloat128Frac1( a ); 4975 aSig0 = extractFloat128Frac0( a ); 4976 aExp = extractFloat128Exp( a ); 4977 bSig1 = extractFloat128Frac1( b ); 4978 bSig0 = extractFloat128Frac0( b ); 4979 bExp = extractFloat128Exp( b ); 4980 expDiff = aExp - bExp; 4981 if ( 0 < expDiff ) { 4982 if ( aExp == 0x7FFF ) { 4983 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4984 return a; 4985 } 4986 if ( bExp == 0 ) { 4987 --expDiff; 4988 } 4989 else { 4990 bSig0 |= LIT64( 0x0001000000000000 ); 4991 } 4992 shift128ExtraRightJamming( 4993 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 4994 zExp = aExp; 4995 } 4996 else if ( expDiff < 0 ) { 4997 if ( bExp == 0x7FFF ) { 4998 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 4999 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5000 } 5001 if ( aExp == 0 ) { 5002 ++expDiff; 5003 } 5004 else { 5005 aSig0 |= LIT64( 0x0001000000000000 ); 5006 } 5007 shift128ExtraRightJamming( 5008 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 5009 zExp = bExp; 5010 } 5011 else { 5012 if ( aExp == 0x7FFF ) { 5013 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5014 return propagateFloat128NaN( a, b STATUS_VAR ); 5015 } 5016 return a; 5017 } 5018 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5019 if ( aExp == 0 ) { 5020 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); 5021 return packFloat128( zSign, 0, zSig0, zSig1 ); 5022 } 5023 zSig2 = 0; 5024 zSig0 |= LIT64( 0x0002000000000000 ); 5025 zExp = aExp; 5026 goto shiftRight1; 5027 } 5028 aSig0 |= LIT64( 0x0001000000000000 ); 5029 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5030 --zExp; 5031 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 5032 ++zExp; 5033 shiftRight1: 5034 shift128ExtraRightJamming( 5035 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5036 roundAndPack: 5037 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5038 5039 } 5040 5041 /*---------------------------------------------------------------------------- 5042 | Returns the result of subtracting the absolute values of the quadruple- 5043 | precision floating-point values `a' and `b'. If `zSign' is 1, the 5044 | difference is negated before being returned. `zSign' is ignored if the 5045 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5046 | Standard for Binary Floating-Point Arithmetic. 5047 *----------------------------------------------------------------------------*/ 5048 5049 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5050 { 5051 int32 aExp, bExp, zExp; 5052 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 5053 int32 expDiff; 5054 float128 z; 5055 5056 aSig1 = extractFloat128Frac1( a ); 5057 aSig0 = extractFloat128Frac0( a ); 5058 aExp = extractFloat128Exp( a ); 5059 bSig1 = extractFloat128Frac1( b ); 5060 bSig0 = extractFloat128Frac0( b ); 5061 bExp = extractFloat128Exp( b ); 5062 expDiff = aExp - bExp; 5063 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5064 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 5065 if ( 0 < expDiff ) goto aExpBigger; 5066 if ( expDiff < 0 ) goto bExpBigger; 5067 if ( aExp == 0x7FFF ) { 5068 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5069 return propagateFloat128NaN( a, b STATUS_VAR ); 5070 } 5071 float_raise( float_flag_invalid STATUS_VAR); 5072 z.low = float128_default_nan_low; 5073 z.high = float128_default_nan_high; 5074 return z; 5075 } 5076 if ( aExp == 0 ) { 5077 aExp = 1; 5078 bExp = 1; 5079 } 5080 if ( bSig0 < aSig0 ) goto aBigger; 5081 if ( aSig0 < bSig0 ) goto bBigger; 5082 if ( bSig1 < aSig1 ) goto aBigger; 5083 if ( aSig1 < bSig1 ) goto bBigger; 5084 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 5085 bExpBigger: 5086 if ( bExp == 0x7FFF ) { 5087 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5088 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 5089 } 5090 if ( aExp == 0 ) { 5091 ++expDiff; 5092 } 5093 else { 5094 aSig0 |= LIT64( 0x4000000000000000 ); 5095 } 5096 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5097 bSig0 |= LIT64( 0x4000000000000000 ); 5098 bBigger: 5099 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5100 zExp = bExp; 5101 zSign ^= 1; 5102 goto normalizeRoundAndPack; 5103 aExpBigger: 5104 if ( aExp == 0x7FFF ) { 5105 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5106 return a; 5107 } 5108 if ( bExp == 0 ) { 5109 --expDiff; 5110 } 5111 else { 5112 bSig0 |= LIT64( 0x4000000000000000 ); 5113 } 5114 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 5115 aSig0 |= LIT64( 0x4000000000000000 ); 5116 aBigger: 5117 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5118 zExp = aExp; 5119 normalizeRoundAndPack: 5120 --zExp; 5121 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 5122 5123 } 5124 5125 /*---------------------------------------------------------------------------- 5126 | Returns the result of adding the quadruple-precision floating-point values 5127 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 5128 | for Binary Floating-Point Arithmetic. 5129 *----------------------------------------------------------------------------*/ 5130 5131 float128 float128_add( float128 a, float128 b STATUS_PARAM ) 5132 { 5133 flag aSign, bSign; 5134 5135 aSign = extractFloat128Sign( a ); 5136 bSign = extractFloat128Sign( b ); 5137 if ( aSign == bSign ) { 5138 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5139 } 5140 else { 5141 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5142 } 5143 5144 } 5145 5146 /*---------------------------------------------------------------------------- 5147 | Returns the result of subtracting the quadruple-precision floating-point 5148 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5149 | Standard for Binary Floating-Point Arithmetic. 5150 *----------------------------------------------------------------------------*/ 5151 5152 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 5153 { 5154 flag aSign, bSign; 5155 5156 aSign = extractFloat128Sign( a ); 5157 bSign = extractFloat128Sign( b ); 5158 if ( aSign == bSign ) { 5159 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5160 } 5161 else { 5162 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5163 } 5164 5165 } 5166 5167 /*---------------------------------------------------------------------------- 5168 | Returns the result of multiplying the quadruple-precision floating-point 5169 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5170 | Standard for Binary Floating-Point Arithmetic. 5171 *----------------------------------------------------------------------------*/ 5172 5173 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 5174 { 5175 flag aSign, bSign, zSign; 5176 int32 aExp, bExp, zExp; 5177 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 5178 float128 z; 5179 5180 aSig1 = extractFloat128Frac1( a ); 5181 aSig0 = extractFloat128Frac0( a ); 5182 aExp = extractFloat128Exp( a ); 5183 aSign = extractFloat128Sign( a ); 5184 bSig1 = extractFloat128Frac1( b ); 5185 bSig0 = extractFloat128Frac0( b ); 5186 bExp = extractFloat128Exp( b ); 5187 bSign = extractFloat128Sign( b ); 5188 zSign = aSign ^ bSign; 5189 if ( aExp == 0x7FFF ) { 5190 if ( ( aSig0 | aSig1 ) 5191 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5192 return propagateFloat128NaN( a, b STATUS_VAR ); 5193 } 5194 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 5195 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5196 } 5197 if ( bExp == 0x7FFF ) { 5198 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5199 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5200 invalid: 5201 float_raise( float_flag_invalid STATUS_VAR); 5202 z.low = float128_default_nan_low; 5203 z.high = float128_default_nan_high; 5204 return z; 5205 } 5206 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5207 } 5208 if ( aExp == 0 ) { 5209 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5210 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5211 } 5212 if ( bExp == 0 ) { 5213 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5214 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5215 } 5216 zExp = aExp + bExp - 0x4000; 5217 aSig0 |= LIT64( 0x0001000000000000 ); 5218 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5219 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5220 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5221 zSig2 |= ( zSig3 != 0 ); 5222 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5223 shift128ExtraRightJamming( 5224 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5225 ++zExp; 5226 } 5227 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5228 5229 } 5230 5231 /*---------------------------------------------------------------------------- 5232 | Returns the result of dividing the quadruple-precision floating-point value 5233 | `a' by the corresponding value `b'. The operation is performed according to 5234 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5235 *----------------------------------------------------------------------------*/ 5236 5237 float128 float128_div( float128 a, float128 b STATUS_PARAM ) 5238 { 5239 flag aSign, bSign, zSign; 5240 int32 aExp, bExp, zExp; 5241 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5242 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5243 float128 z; 5244 5245 aSig1 = extractFloat128Frac1( a ); 5246 aSig0 = extractFloat128Frac0( a ); 5247 aExp = extractFloat128Exp( a ); 5248 aSign = extractFloat128Sign( a ); 5249 bSig1 = extractFloat128Frac1( b ); 5250 bSig0 = extractFloat128Frac0( b ); 5251 bExp = extractFloat128Exp( b ); 5252 bSign = extractFloat128Sign( b ); 5253 zSign = aSign ^ bSign; 5254 if ( aExp == 0x7FFF ) { 5255 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5256 if ( bExp == 0x7FFF ) { 5257 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5258 goto invalid; 5259 } 5260 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5261 } 5262 if ( bExp == 0x7FFF ) { 5263 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5264 return packFloat128( zSign, 0, 0, 0 ); 5265 } 5266 if ( bExp == 0 ) { 5267 if ( ( bSig0 | bSig1 ) == 0 ) { 5268 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5269 invalid: 5270 float_raise( float_flag_invalid STATUS_VAR); 5271 z.low = float128_default_nan_low; 5272 z.high = float128_default_nan_high; 5273 return z; 5274 } 5275 float_raise( float_flag_divbyzero STATUS_VAR); 5276 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5277 } 5278 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5279 } 5280 if ( aExp == 0 ) { 5281 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5282 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5283 } 5284 zExp = aExp - bExp + 0x3FFD; 5285 shortShift128Left( 5286 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5287 shortShift128Left( 5288 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5289 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5290 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5291 ++zExp; 5292 } 5293 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5294 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5295 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5296 while ( (sbits64) rem0 < 0 ) { 5297 --zSig0; 5298 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5299 } 5300 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5301 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5302 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5303 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5304 while ( (sbits64) rem1 < 0 ) { 5305 --zSig1; 5306 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5307 } 5308 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5309 } 5310 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5311 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5312 5313 } 5314 5315 /*---------------------------------------------------------------------------- 5316 | Returns the remainder of the quadruple-precision floating-point value `a' 5317 | with respect to the corresponding value `b'. The operation is performed 5318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5319 *----------------------------------------------------------------------------*/ 5320 5321 float128 float128_rem( float128 a, float128 b STATUS_PARAM ) 5322 { 5323 flag aSign, zSign; 5324 int32 aExp, bExp, expDiff; 5325 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5326 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5327 sbits64 sigMean0; 5328 float128 z; 5329 5330 aSig1 = extractFloat128Frac1( a ); 5331 aSig0 = extractFloat128Frac0( a ); 5332 aExp = extractFloat128Exp( a ); 5333 aSign = extractFloat128Sign( a ); 5334 bSig1 = extractFloat128Frac1( b ); 5335 bSig0 = extractFloat128Frac0( b ); 5336 bExp = extractFloat128Exp( b ); 5337 if ( aExp == 0x7FFF ) { 5338 if ( ( aSig0 | aSig1 ) 5339 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5340 return propagateFloat128NaN( a, b STATUS_VAR ); 5341 } 5342 goto invalid; 5343 } 5344 if ( bExp == 0x7FFF ) { 5345 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5346 return a; 5347 } 5348 if ( bExp == 0 ) { 5349 if ( ( bSig0 | bSig1 ) == 0 ) { 5350 invalid: 5351 float_raise( float_flag_invalid STATUS_VAR); 5352 z.low = float128_default_nan_low; 5353 z.high = float128_default_nan_high; 5354 return z; 5355 } 5356 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5357 } 5358 if ( aExp == 0 ) { 5359 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5360 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5361 } 5362 expDiff = aExp - bExp; 5363 if ( expDiff < -1 ) return a; 5364 shortShift128Left( 5365 aSig0 | LIT64( 0x0001000000000000 ), 5366 aSig1, 5367 15 - ( expDiff < 0 ), 5368 &aSig0, 5369 &aSig1 5370 ); 5371 shortShift128Left( 5372 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5373 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5374 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5375 expDiff -= 64; 5376 while ( 0 < expDiff ) { 5377 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5378 q = ( 4 < q ) ? q - 4 : 0; 5379 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5380 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5381 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5382 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5383 expDiff -= 61; 5384 } 5385 if ( -64 < expDiff ) { 5386 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5387 q = ( 4 < q ) ? q - 4 : 0; 5388 q >>= - expDiff; 5389 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5390 expDiff += 52; 5391 if ( expDiff < 0 ) { 5392 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5393 } 5394 else { 5395 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5396 } 5397 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5398 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5399 } 5400 else { 5401 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5402 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5403 } 5404 do { 5405 alternateASig0 = aSig0; 5406 alternateASig1 = aSig1; 5407 ++q; 5408 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5409 } while ( 0 <= (sbits64) aSig0 ); 5410 add128( 5411 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5412 if ( ( sigMean0 < 0 ) 5413 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5414 aSig0 = alternateASig0; 5415 aSig1 = alternateASig1; 5416 } 5417 zSign = ( (sbits64) aSig0 < 0 ); 5418 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5419 return 5420 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); 5421 5422 } 5423 5424 /*---------------------------------------------------------------------------- 5425 | Returns the square root of the quadruple-precision floating-point value `a'. 5426 | The operation is performed according to the IEC/IEEE Standard for Binary 5427 | Floating-Point Arithmetic. 5428 *----------------------------------------------------------------------------*/ 5429 5430 float128 float128_sqrt( float128 a STATUS_PARAM ) 5431 { 5432 flag aSign; 5433 int32 aExp, zExp; 5434 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5435 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5436 float128 z; 5437 5438 aSig1 = extractFloat128Frac1( a ); 5439 aSig0 = extractFloat128Frac0( a ); 5440 aExp = extractFloat128Exp( a ); 5441 aSign = extractFloat128Sign( a ); 5442 if ( aExp == 0x7FFF ) { 5443 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); 5444 if ( ! aSign ) return a; 5445 goto invalid; 5446 } 5447 if ( aSign ) { 5448 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5449 invalid: 5450 float_raise( float_flag_invalid STATUS_VAR); 5451 z.low = float128_default_nan_low; 5452 z.high = float128_default_nan_high; 5453 return z; 5454 } 5455 if ( aExp == 0 ) { 5456 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5457 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5458 } 5459 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5460 aSig0 |= LIT64( 0x0001000000000000 ); 5461 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5462 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5463 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5464 doubleZSig0 = zSig0<<1; 5465 mul64To128( zSig0, zSig0, &term0, &term1 ); 5466 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5467 while ( (sbits64) rem0 < 0 ) { 5468 --zSig0; 5469 doubleZSig0 -= 2; 5470 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5471 } 5472 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5473 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5474 if ( zSig1 == 0 ) zSig1 = 1; 5475 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5476 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5477 mul64To128( zSig1, zSig1, &term2, &term3 ); 5478 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5479 while ( (sbits64) rem1 < 0 ) { 5480 --zSig1; 5481 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5482 term3 |= 1; 5483 term2 |= doubleZSig0; 5484 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5485 } 5486 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5487 } 5488 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5489 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5490 5491 } 5492 5493 /*---------------------------------------------------------------------------- 5494 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5495 | the corresponding value `b', and 0 otherwise. The comparison is performed 5496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5497 *----------------------------------------------------------------------------*/ 5498 5499 int float128_eq( float128 a, float128 b STATUS_PARAM ) 5500 { 5501 5502 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5503 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5504 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5505 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5506 ) { 5507 if ( float128_is_signaling_nan( a ) 5508 || float128_is_signaling_nan( b ) ) { 5509 float_raise( float_flag_invalid STATUS_VAR); 5510 } 5511 return 0; 5512 } 5513 return 5514 ( a.low == b.low ) 5515 && ( ( a.high == b.high ) 5516 || ( ( a.low == 0 ) 5517 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5518 ); 5519 5520 } 5521 5522 /*---------------------------------------------------------------------------- 5523 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5524 | or equal to the corresponding value `b', and 0 otherwise. The comparison 5525 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5526 | Arithmetic. 5527 *----------------------------------------------------------------------------*/ 5528 5529 int float128_le( float128 a, float128 b STATUS_PARAM ) 5530 { 5531 flag aSign, bSign; 5532 5533 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5534 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5535 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5536 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5537 ) { 5538 float_raise( float_flag_invalid STATUS_VAR); 5539 return 0; 5540 } 5541 aSign = extractFloat128Sign( a ); 5542 bSign = extractFloat128Sign( b ); 5543 if ( aSign != bSign ) { 5544 return 5545 aSign 5546 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5547 == 0 ); 5548 } 5549 return 5550 aSign ? le128( b.high, b.low, a.high, a.low ) 5551 : le128( a.high, a.low, b.high, b.low ); 5552 5553 } 5554 5555 /*---------------------------------------------------------------------------- 5556 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5557 | the corresponding value `b', and 0 otherwise. The comparison is performed 5558 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5559 *----------------------------------------------------------------------------*/ 5560 5561 int float128_lt( float128 a, float128 b STATUS_PARAM ) 5562 { 5563 flag aSign, bSign; 5564 5565 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5566 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5567 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5568 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5569 ) { 5570 float_raise( float_flag_invalid STATUS_VAR); 5571 return 0; 5572 } 5573 aSign = extractFloat128Sign( a ); 5574 bSign = extractFloat128Sign( b ); 5575 if ( aSign != bSign ) { 5576 return 5577 aSign 5578 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5579 != 0 ); 5580 } 5581 return 5582 aSign ? lt128( b.high, b.low, a.high, a.low ) 5583 : lt128( a.high, a.low, b.high, b.low ); 5584 5585 } 5586 5587 /*---------------------------------------------------------------------------- 5588 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5589 | the corresponding value `b', and 0 otherwise. The invalid exception is 5590 | raised if either operand is a NaN. Otherwise, the comparison is performed 5591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5592 *----------------------------------------------------------------------------*/ 5593 5594 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM ) 5595 { 5596 5597 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5598 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5599 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5600 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5601 ) { 5602 float_raise( float_flag_invalid STATUS_VAR); 5603 return 0; 5604 } 5605 return 5606 ( a.low == b.low ) 5607 && ( ( a.high == b.high ) 5608 || ( ( a.low == 0 ) 5609 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5610 ); 5611 5612 } 5613 5614 /*---------------------------------------------------------------------------- 5615 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5616 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5617 | cause an exception. Otherwise, the comparison is performed according to the 5618 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5619 *----------------------------------------------------------------------------*/ 5620 5621 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 5622 { 5623 flag aSign, bSign; 5624 5625 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5626 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5627 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5628 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5629 ) { 5630 if ( float128_is_signaling_nan( a ) 5631 || float128_is_signaling_nan( b ) ) { 5632 float_raise( float_flag_invalid STATUS_VAR); 5633 } 5634 return 0; 5635 } 5636 aSign = extractFloat128Sign( a ); 5637 bSign = extractFloat128Sign( b ); 5638 if ( aSign != bSign ) { 5639 return 5640 aSign 5641 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5642 == 0 ); 5643 } 5644 return 5645 aSign ? le128( b.high, b.low, a.high, a.low ) 5646 : le128( a.high, a.low, b.high, b.low ); 5647 5648 } 5649 5650 /*---------------------------------------------------------------------------- 5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5652 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5653 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5654 | Standard for Binary Floating-Point Arithmetic. 5655 *----------------------------------------------------------------------------*/ 5656 5657 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 5658 { 5659 flag aSign, bSign; 5660 5661 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5662 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5663 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5664 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5665 ) { 5666 if ( float128_is_signaling_nan( a ) 5667 || float128_is_signaling_nan( b ) ) { 5668 float_raise( float_flag_invalid STATUS_VAR); 5669 } 5670 return 0; 5671 } 5672 aSign = extractFloat128Sign( a ); 5673 bSign = extractFloat128Sign( b ); 5674 if ( aSign != bSign ) { 5675 return 5676 aSign 5677 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5678 != 0 ); 5679 } 5680 return 5681 aSign ? lt128( b.high, b.low, a.high, a.low ) 5682 : lt128( a.high, a.low, b.high, b.low ); 5683 5684 } 5685 5686 #endif 5687 5688 /* misc functions */ 5689 float32 uint32_to_float32( unsigned int a STATUS_PARAM ) 5690 { 5691 return int64_to_float32(a STATUS_VAR); 5692 } 5693 5694 float64 uint32_to_float64( unsigned int a STATUS_PARAM ) 5695 { 5696 return int64_to_float64(a STATUS_VAR); 5697 } 5698 5699 unsigned int float32_to_uint32( float32 a STATUS_PARAM ) 5700 { 5701 int64_t v; 5702 unsigned int res; 5703 5704 v = float32_to_int64(a STATUS_VAR); 5705 if (v < 0) { 5706 res = 0; 5707 float_raise( float_flag_invalid STATUS_VAR); 5708 } else if (v > 0xffffffff) { 5709 res = 0xffffffff; 5710 float_raise( float_flag_invalid STATUS_VAR); 5711 } else { 5712 res = v; 5713 } 5714 return res; 5715 } 5716 5717 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 5718 { 5719 int64_t v; 5720 unsigned int res; 5721 5722 v = float32_to_int64_round_to_zero(a STATUS_VAR); 5723 if (v < 0) { 5724 res = 0; 5725 float_raise( float_flag_invalid STATUS_VAR); 5726 } else if (v > 0xffffffff) { 5727 res = 0xffffffff; 5728 float_raise( float_flag_invalid STATUS_VAR); 5729 } else { 5730 res = v; 5731 } 5732 return res; 5733 } 5734 5735 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM ) 5736 { 5737 int64_t v; 5738 unsigned int res; 5739 5740 v = float32_to_int64_round_to_zero(a STATUS_VAR); 5741 if (v < 0) { 5742 res = 0; 5743 float_raise( float_flag_invalid STATUS_VAR); 5744 } else if (v > 0xffff) { 5745 res = 0xffff; 5746 float_raise( float_flag_invalid STATUS_VAR); 5747 } else { 5748 res = v; 5749 } 5750 return res; 5751 } 5752 5753 unsigned int float64_to_uint32( float64 a STATUS_PARAM ) 5754 { 5755 int64_t v; 5756 unsigned int res; 5757 5758 v = float64_to_int64(a STATUS_VAR); 5759 if (v < 0) { 5760 res = 0; 5761 float_raise( float_flag_invalid STATUS_VAR); 5762 } else if (v > 0xffffffff) { 5763 res = 0xffffffff; 5764 float_raise( float_flag_invalid STATUS_VAR); 5765 } else { 5766 res = v; 5767 } 5768 return res; 5769 } 5770 5771 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 5772 { 5773 int64_t v; 5774 unsigned int res; 5775 5776 v = float64_to_int64_round_to_zero(a STATUS_VAR); 5777 if (v < 0) { 5778 res = 0; 5779 float_raise( float_flag_invalid STATUS_VAR); 5780 } else if (v > 0xffffffff) { 5781 res = 0xffffffff; 5782 float_raise( float_flag_invalid STATUS_VAR); 5783 } else { 5784 res = v; 5785 } 5786 return res; 5787 } 5788 5789 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM ) 5790 { 5791 int64_t v; 5792 unsigned int res; 5793 5794 v = float64_to_int64_round_to_zero(a STATUS_VAR); 5795 if (v < 0) { 5796 res = 0; 5797 float_raise( float_flag_invalid STATUS_VAR); 5798 } else if (v > 0xffff) { 5799 res = 0xffff; 5800 float_raise( float_flag_invalid STATUS_VAR); 5801 } else { 5802 res = v; 5803 } 5804 return res; 5805 } 5806 5807 /* FIXME: This looks broken. */ 5808 uint64_t float64_to_uint64 (float64 a STATUS_PARAM) 5809 { 5810 int64_t v; 5811 5812 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5813 v += float64_val(a); 5814 v = float64_to_int64(make_float64(v) STATUS_VAR); 5815 5816 return v - INT64_MIN; 5817 } 5818 5819 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 5820 { 5821 int64_t v; 5822 5823 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5824 v += float64_val(a); 5825 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); 5826 5827 return v - INT64_MIN; 5828 } 5829 5830 #define COMPARE(s, nan_exp) \ 5831 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ 5832 int is_quiet STATUS_PARAM ) \ 5833 { \ 5834 flag aSign, bSign; \ 5835 bits ## s av, bv; \ 5836 \ 5837 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 5838 extractFloat ## s ## Frac( a ) ) || \ 5839 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 5840 extractFloat ## s ## Frac( b ) )) { \ 5841 if (!is_quiet || \ 5842 float ## s ## _is_signaling_nan( a ) || \ 5843 float ## s ## _is_signaling_nan( b ) ) { \ 5844 float_raise( float_flag_invalid STATUS_VAR); \ 5845 } \ 5846 return float_relation_unordered; \ 5847 } \ 5848 aSign = extractFloat ## s ## Sign( a ); \ 5849 bSign = extractFloat ## s ## Sign( b ); \ 5850 av = float ## s ## _val(a); \ 5851 bv = float ## s ## _val(b); \ 5852 if ( aSign != bSign ) { \ 5853 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \ 5854 /* zero case */ \ 5855 return float_relation_equal; \ 5856 } else { \ 5857 return 1 - (2 * aSign); \ 5858 } \ 5859 } else { \ 5860 if (av == bv) { \ 5861 return float_relation_equal; \ 5862 } else { \ 5863 return 1 - 2 * (aSign ^ ( av < bv )); \ 5864 } \ 5865 } \ 5866 } \ 5867 \ 5868 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 5869 { \ 5870 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 5871 } \ 5872 \ 5873 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 5874 { \ 5875 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 5876 } 5877 5878 COMPARE(32, 0xff) 5879 COMPARE(64, 0x7ff) 5880 5881 INLINE int float128_compare_internal( float128 a, float128 b, 5882 int is_quiet STATUS_PARAM ) 5883 { 5884 flag aSign, bSign; 5885 5886 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 5887 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 5888 ( ( extractFloat128Exp( b ) == 0x7fff ) && 5889 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 5890 if (!is_quiet || 5891 float128_is_signaling_nan( a ) || 5892 float128_is_signaling_nan( b ) ) { 5893 float_raise( float_flag_invalid STATUS_VAR); 5894 } 5895 return float_relation_unordered; 5896 } 5897 aSign = extractFloat128Sign( a ); 5898 bSign = extractFloat128Sign( b ); 5899 if ( aSign != bSign ) { 5900 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 5901 /* zero case */ 5902 return float_relation_equal; 5903 } else { 5904 return 1 - (2 * aSign); 5905 } 5906 } else { 5907 if (a.low == b.low && a.high == b.high) { 5908 return float_relation_equal; 5909 } else { 5910 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 5911 } 5912 } 5913 } 5914 5915 int float128_compare( float128 a, float128 b STATUS_PARAM ) 5916 { 5917 return float128_compare_internal(a, b, 0 STATUS_VAR); 5918 } 5919 5920 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 5921 { 5922 return float128_compare_internal(a, b, 1 STATUS_VAR); 5923 } 5924 5925 /* Multiply A by 2 raised to the power N. */ 5926 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 5927 { 5928 flag aSign; 5929 int16 aExp; 5930 bits32 aSig; 5931 5932 aSig = extractFloat32Frac( a ); 5933 aExp = extractFloat32Exp( a ); 5934 aSign = extractFloat32Sign( a ); 5935 5936 if ( aExp == 0xFF ) { 5937 return a; 5938 } 5939 if ( aExp != 0 ) 5940 aSig |= 0x00800000; 5941 else if ( aSig == 0 ) 5942 return a; 5943 5944 aExp += n - 1; 5945 aSig <<= 7; 5946 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 5947 } 5948 5949 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 5950 { 5951 flag aSign; 5952 int16 aExp; 5953 bits64 aSig; 5954 5955 aSig = extractFloat64Frac( a ); 5956 aExp = extractFloat64Exp( a ); 5957 aSign = extractFloat64Sign( a ); 5958 5959 if ( aExp == 0x7FF ) { 5960 return a; 5961 } 5962 if ( aExp != 0 ) 5963 aSig |= LIT64( 0x0010000000000000 ); 5964 else if ( aSig == 0 ) 5965 return a; 5966 5967 aExp += n - 1; 5968 aSig <<= 10; 5969 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 5970 } 5971 5972 #ifdef FLOATX80 5973 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 5974 { 5975 flag aSign; 5976 int16 aExp; 5977 bits64 aSig; 5978 5979 aSig = extractFloatx80Frac( a ); 5980 aExp = extractFloatx80Exp( a ); 5981 aSign = extractFloatx80Sign( a ); 5982 5983 if ( aExp == 0x7FF ) { 5984 return a; 5985 } 5986 if (aExp == 0 && aSig == 0) 5987 return a; 5988 5989 aExp += n; 5990 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), 5991 aSign, aExp, aSig, 0 STATUS_VAR ); 5992 } 5993 #endif 5994 5995 #ifdef FLOAT128 5996 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 5997 { 5998 flag aSign; 5999 int32 aExp; 6000 bits64 aSig0, aSig1; 6001 6002 aSig1 = extractFloat128Frac1( a ); 6003 aSig0 = extractFloat128Frac0( a ); 6004 aExp = extractFloat128Exp( a ); 6005 aSign = extractFloat128Sign( a ); 6006 if ( aExp == 0x7FFF ) { 6007 return a; 6008 } 6009 if ( aExp != 0 ) 6010 aSig0 |= LIT64( 0x0001000000000000 ); 6011 else if ( aSig0 == 0 && aSig1 == 0 ) 6012 return a; 6013 6014 aExp += n - 1; 6015 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 6016 STATUS_VAR ); 6017 6018 } 6019 #endif 6020