1 2 /*============================================================================ 3 4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 5 Package, Release 2b. 6 7 Written by John R. Hauser. This work was made possible in part by the 8 International Computer Science Institute, located at Suite 600, 1947 Center 9 Street, Berkeley, California 94704. Funding was partially provided by the 10 National Science Foundation under grant MIP-9311980. The original version 11 of this code was written as part of a project to build a fixed-point vector 12 processor in collaboration with the University of California at Berkeley, 13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 15 arithmetic/SoftFloat.html'. 16 17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 25 26 Derivative works are acceptable, even for commercial purposes, so long as 27 (1) the source code for the derivative work includes prominent notice that 28 the work is derivative, and (2) the source code includes prominent notice with 29 these four paragraphs for those parts of this code that are retained. 30 31 =============================================================================*/ 32 33 #include "softfloat.h" 34 35 /*---------------------------------------------------------------------------- 36 | Primitive arithmetic functions, including multi-word arithmetic, and 37 | division and square root approximations. (Can be specialized to target if 38 | desired.) 39 *----------------------------------------------------------------------------*/ 40 #include "softfloat-macros.h" 41 42 /*---------------------------------------------------------------------------- 43 | Functions and definitions to determine: (1) whether tininess for underflow 44 | is detected before or after rounding by default, (2) what (if anything) 45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 47 | are propagated from function inputs to output. These details are target- 48 | specific. 49 *----------------------------------------------------------------------------*/ 50 #include "softfloat-specialize.h" 51 52 void set_float_rounding_mode(int val STATUS_PARAM) 53 { 54 STATUS(float_rounding_mode) = val; 55 } 56 57 void set_float_exception_flags(int val STATUS_PARAM) 58 { 59 STATUS(float_exception_flags) = val; 60 } 61 62 #ifdef FLOATX80 63 void set_floatx80_rounding_precision(int val STATUS_PARAM) 64 { 65 STATUS(floatx80_rounding_precision) = val; 66 } 67 #endif 68 69 /*---------------------------------------------------------------------------- 70 | Returns the fraction bits of the half-precision floating-point value `a'. 71 *----------------------------------------------------------------------------*/ 72 73 INLINE uint32_t extractFloat16Frac(float16 a) 74 { 75 return float16_val(a) & 0x3ff; 76 } 77 78 /*---------------------------------------------------------------------------- 79 | Returns the exponent bits of the half-precision floating-point value `a'. 80 *----------------------------------------------------------------------------*/ 81 82 INLINE int16 extractFloat16Exp(float16 a) 83 { 84 return (float16_val(a) >> 10) & 0x1f; 85 } 86 87 /*---------------------------------------------------------------------------- 88 | Returns the sign bit of the single-precision floating-point value `a'. 89 *----------------------------------------------------------------------------*/ 90 91 INLINE flag extractFloat16Sign(float16 a) 92 { 93 return float16_val(a)>>15; 94 } 95 96 /*---------------------------------------------------------------------------- 97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 98 | and 7, and returns the properly rounded 32-bit integer corresponding to the 99 | input. If `zSign' is 1, the input is negated before being converted to an 100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 101 | is simply rounded to an integer, with the inexact exception raised if the 102 | input cannot be represented exactly as an integer. However, if the fixed- 103 | point input is too large, the invalid exception is raised and the largest 104 | positive or negative integer is returned. 105 *----------------------------------------------------------------------------*/ 106 107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM) 108 { 109 int8 roundingMode; 110 flag roundNearestEven; 111 int8 roundIncrement, roundBits; 112 int32 z; 113 114 roundingMode = STATUS(float_rounding_mode); 115 roundNearestEven = ( roundingMode == float_round_nearest_even ); 116 roundIncrement = 0x40; 117 if ( ! roundNearestEven ) { 118 if ( roundingMode == float_round_to_zero ) { 119 roundIncrement = 0; 120 } 121 else { 122 roundIncrement = 0x7F; 123 if ( zSign ) { 124 if ( roundingMode == float_round_up ) roundIncrement = 0; 125 } 126 else { 127 if ( roundingMode == float_round_down ) roundIncrement = 0; 128 } 129 } 130 } 131 roundBits = absZ & 0x7F; 132 absZ = ( absZ + roundIncrement )>>7; 133 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 134 z = absZ; 135 if ( zSign ) z = - z; 136 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 137 float_raise( float_flag_invalid STATUS_VAR); 138 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 139 } 140 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 141 return z; 142 143 } 144 145 /*---------------------------------------------------------------------------- 146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 147 | `absZ1', with binary point between bits 63 and 64 (between the input words), 148 | and returns the properly rounded 64-bit integer corresponding to the input. 149 | If `zSign' is 1, the input is negated before being converted to an integer. 150 | Ordinarily, the fixed-point input is simply rounded to an integer, with 151 | the inexact exception raised if the input cannot be represented exactly as 152 | an integer. However, if the fixed-point input is too large, the invalid 153 | exception is raised and the largest positive or negative integer is 154 | returned. 155 *----------------------------------------------------------------------------*/ 156 157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM) 158 { 159 int8 roundingMode; 160 flag roundNearestEven, increment; 161 int64 z; 162 163 roundingMode = STATUS(float_rounding_mode); 164 roundNearestEven = ( roundingMode == float_round_nearest_even ); 165 increment = ( (sbits64) absZ1 < 0 ); 166 if ( ! roundNearestEven ) { 167 if ( roundingMode == float_round_to_zero ) { 168 increment = 0; 169 } 170 else { 171 if ( zSign ) { 172 increment = ( roundingMode == float_round_down ) && absZ1; 173 } 174 else { 175 increment = ( roundingMode == float_round_up ) && absZ1; 176 } 177 } 178 } 179 if ( increment ) { 180 ++absZ0; 181 if ( absZ0 == 0 ) goto overflow; 182 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 183 } 184 z = absZ0; 185 if ( zSign ) z = - z; 186 if ( z && ( ( z < 0 ) ^ zSign ) ) { 187 overflow: 188 float_raise( float_flag_invalid STATUS_VAR); 189 return 190 zSign ? (sbits64) LIT64( 0x8000000000000000 ) 191 : LIT64( 0x7FFFFFFFFFFFFFFF ); 192 } 193 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 194 return z; 195 196 } 197 198 /*---------------------------------------------------------------------------- 199 | Returns the fraction bits of the single-precision floating-point value `a'. 200 *----------------------------------------------------------------------------*/ 201 202 INLINE bits32 extractFloat32Frac( float32 a ) 203 { 204 205 return float32_val(a) & 0x007FFFFF; 206 207 } 208 209 /*---------------------------------------------------------------------------- 210 | Returns the exponent bits of the single-precision floating-point value `a'. 211 *----------------------------------------------------------------------------*/ 212 213 INLINE int16 extractFloat32Exp( float32 a ) 214 { 215 216 return ( float32_val(a)>>23 ) & 0xFF; 217 218 } 219 220 /*---------------------------------------------------------------------------- 221 | Returns the sign bit of the single-precision floating-point value `a'. 222 *----------------------------------------------------------------------------*/ 223 224 INLINE flag extractFloat32Sign( float32 a ) 225 { 226 227 return float32_val(a)>>31; 228 229 } 230 231 /*---------------------------------------------------------------------------- 232 | If `a' is denormal and we are in flush-to-zero mode then set the 233 | input-denormal exception and return zero. Otherwise just return the value. 234 *----------------------------------------------------------------------------*/ 235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM) 236 { 237 if (STATUS(flush_inputs_to_zero)) { 238 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 239 float_raise(float_flag_input_denormal STATUS_VAR); 240 return make_float32(float32_val(a) & 0x80000000); 241 } 242 } 243 return a; 244 } 245 246 /*---------------------------------------------------------------------------- 247 | Normalizes the subnormal single-precision floating-point value represented 248 | by the denormalized significand `aSig'. The normalized exponent and 249 | significand are stored at the locations pointed to by `zExpPtr' and 250 | `zSigPtr', respectively. 251 *----------------------------------------------------------------------------*/ 252 253 static void 254 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) 255 { 256 int8 shiftCount; 257 258 shiftCount = countLeadingZeros32( aSig ) - 8; 259 *zSigPtr = aSig<<shiftCount; 260 *zExpPtr = 1 - shiftCount; 261 262 } 263 264 /*---------------------------------------------------------------------------- 265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 266 | single-precision floating-point value, returning the result. After being 267 | shifted into the proper positions, the three fields are simply added 268 | together to form the result. This means that any integer portion of `zSig' 269 | will be added into the exponent. Since a properly normalized significand 270 | will have an integer portion equal to 1, the `zExp' input should be 1 less 271 | than the desired result exponent whenever `zSig' is a complete, normalized 272 | significand. 273 *----------------------------------------------------------------------------*/ 274 275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) 276 { 277 278 return make_float32( 279 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig); 280 281 } 282 283 /*---------------------------------------------------------------------------- 284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 285 | and significand `zSig', and returns the proper single-precision floating- 286 | point value corresponding to the abstract input. Ordinarily, the abstract 287 | value is simply rounded and packed into the single-precision format, with 288 | the inexact exception raised if the abstract input cannot be represented 289 | exactly. However, if the abstract value is too large, the overflow and 290 | inexact exceptions are raised and an infinity or maximal finite value is 291 | returned. If the abstract value is too small, the input value is rounded to 292 | a subnormal number, and the underflow and inexact exceptions are raised if 293 | the abstract input cannot be represented exactly as a subnormal single- 294 | precision floating-point number. 295 | The input significand `zSig' has its binary point between bits 30 296 | and 29, which is 7 bits to the left of the usual location. This shifted 297 | significand must be normalized or smaller. If `zSig' is not normalized, 298 | `zExp' must be 0; in that case, the result returned is a subnormal number, 299 | and it must not require rounding. In the usual case that `zSig' is 300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 301 | The handling of underflow and overflow follows the IEC/IEEE Standard for 302 | Binary Floating-Point Arithmetic. 303 *----------------------------------------------------------------------------*/ 304 305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 306 { 307 int8 roundingMode; 308 flag roundNearestEven; 309 int8 roundIncrement, roundBits; 310 flag isTiny; 311 312 roundingMode = STATUS(float_rounding_mode); 313 roundNearestEven = ( roundingMode == float_round_nearest_even ); 314 roundIncrement = 0x40; 315 if ( ! roundNearestEven ) { 316 if ( roundingMode == float_round_to_zero ) { 317 roundIncrement = 0; 318 } 319 else { 320 roundIncrement = 0x7F; 321 if ( zSign ) { 322 if ( roundingMode == float_round_up ) roundIncrement = 0; 323 } 324 else { 325 if ( roundingMode == float_round_down ) roundIncrement = 0; 326 } 327 } 328 } 329 roundBits = zSig & 0x7F; 330 if ( 0xFD <= (bits16) zExp ) { 331 if ( ( 0xFD < zExp ) 332 || ( ( zExp == 0xFD ) 333 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) 334 ) { 335 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 336 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 337 } 338 if ( zExp < 0 ) { 339 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); 340 isTiny = 341 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 342 || ( zExp < -1 ) 343 || ( zSig + roundIncrement < 0x80000000 ); 344 shift32RightJamming( zSig, - zExp, &zSig ); 345 zExp = 0; 346 roundBits = zSig & 0x7F; 347 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 348 } 349 } 350 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 351 zSig = ( zSig + roundIncrement )>>7; 352 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 353 if ( zSig == 0 ) zExp = 0; 354 return packFloat32( zSign, zExp, zSig ); 355 356 } 357 358 /*---------------------------------------------------------------------------- 359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 360 | and significand `zSig', and returns the proper single-precision floating- 361 | point value corresponding to the abstract input. This routine is just like 362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 364 | floating-point exponent. 365 *----------------------------------------------------------------------------*/ 366 367 static float32 368 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) 369 { 370 int8 shiftCount; 371 372 shiftCount = countLeadingZeros32( zSig ) - 1; 373 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 374 375 } 376 377 /*---------------------------------------------------------------------------- 378 | Returns the fraction bits of the double-precision floating-point value `a'. 379 *----------------------------------------------------------------------------*/ 380 381 INLINE bits64 extractFloat64Frac( float64 a ) 382 { 383 384 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 385 386 } 387 388 /*---------------------------------------------------------------------------- 389 | Returns the exponent bits of the double-precision floating-point value `a'. 390 *----------------------------------------------------------------------------*/ 391 392 INLINE int16 extractFloat64Exp( float64 a ) 393 { 394 395 return ( float64_val(a)>>52 ) & 0x7FF; 396 397 } 398 399 /*---------------------------------------------------------------------------- 400 | Returns the sign bit of the double-precision floating-point value `a'. 401 *----------------------------------------------------------------------------*/ 402 403 INLINE flag extractFloat64Sign( float64 a ) 404 { 405 406 return float64_val(a)>>63; 407 408 } 409 410 /*---------------------------------------------------------------------------- 411 | If `a' is denormal and we are in flush-to-zero mode then set the 412 | input-denormal exception and return zero. Otherwise just return the value. 413 *----------------------------------------------------------------------------*/ 414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM) 415 { 416 if (STATUS(flush_inputs_to_zero)) { 417 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 418 float_raise(float_flag_input_denormal STATUS_VAR); 419 return make_float64(float64_val(a) & (1ULL << 63)); 420 } 421 } 422 return a; 423 } 424 425 /*---------------------------------------------------------------------------- 426 | Normalizes the subnormal double-precision floating-point value represented 427 | by the denormalized significand `aSig'. The normalized exponent and 428 | significand are stored at the locations pointed to by `zExpPtr' and 429 | `zSigPtr', respectively. 430 *----------------------------------------------------------------------------*/ 431 432 static void 433 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) 434 { 435 int8 shiftCount; 436 437 shiftCount = countLeadingZeros64( aSig ) - 11; 438 *zSigPtr = aSig<<shiftCount; 439 *zExpPtr = 1 - shiftCount; 440 441 } 442 443 /*---------------------------------------------------------------------------- 444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 445 | double-precision floating-point value, returning the result. After being 446 | shifted into the proper positions, the three fields are simply added 447 | together to form the result. This means that any integer portion of `zSig' 448 | will be added into the exponent. Since a properly normalized significand 449 | will have an integer portion equal to 1, the `zExp' input should be 1 less 450 | than the desired result exponent whenever `zSig' is a complete, normalized 451 | significand. 452 *----------------------------------------------------------------------------*/ 453 454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) 455 { 456 457 return make_float64( 458 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig); 459 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 464 | and significand `zSig', and returns the proper double-precision floating- 465 | point value corresponding to the abstract input. Ordinarily, the abstract 466 | value is simply rounded and packed into the double-precision format, with 467 | the inexact exception raised if the abstract input cannot be represented 468 | exactly. However, if the abstract value is too large, the overflow and 469 | inexact exceptions are raised and an infinity or maximal finite value is 470 | returned. If the abstract value is too small, the input value is rounded 471 | to a subnormal number, and the underflow and inexact exceptions are raised 472 | if the abstract input cannot be represented exactly as a subnormal double- 473 | precision floating-point number. 474 | The input significand `zSig' has its binary point between bits 62 475 | and 61, which is 10 bits to the left of the usual location. This shifted 476 | significand must be normalized or smaller. If `zSig' is not normalized, 477 | `zExp' must be 0; in that case, the result returned is a subnormal number, 478 | and it must not require rounding. In the usual case that `zSig' is 479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 480 | The handling of underflow and overflow follows the IEC/IEEE Standard for 481 | Binary Floating-Point Arithmetic. 482 *----------------------------------------------------------------------------*/ 483 484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 485 { 486 int8 roundingMode; 487 flag roundNearestEven; 488 int16 roundIncrement, roundBits; 489 flag isTiny; 490 491 roundingMode = STATUS(float_rounding_mode); 492 roundNearestEven = ( roundingMode == float_round_nearest_even ); 493 roundIncrement = 0x200; 494 if ( ! roundNearestEven ) { 495 if ( roundingMode == float_round_to_zero ) { 496 roundIncrement = 0; 497 } 498 else { 499 roundIncrement = 0x3FF; 500 if ( zSign ) { 501 if ( roundingMode == float_round_up ) roundIncrement = 0; 502 } 503 else { 504 if ( roundingMode == float_round_down ) roundIncrement = 0; 505 } 506 } 507 } 508 roundBits = zSig & 0x3FF; 509 if ( 0x7FD <= (bits16) zExp ) { 510 if ( ( 0x7FD < zExp ) 511 || ( ( zExp == 0x7FD ) 512 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) 513 ) { 514 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 515 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 516 } 517 if ( zExp < 0 ) { 518 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); 519 isTiny = 520 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 521 || ( zExp < -1 ) 522 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 523 shift64RightJamming( zSig, - zExp, &zSig ); 524 zExp = 0; 525 roundBits = zSig & 0x3FF; 526 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 527 } 528 } 529 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 530 zSig = ( zSig + roundIncrement )>>10; 531 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 532 if ( zSig == 0 ) zExp = 0; 533 return packFloat64( zSign, zExp, zSig ); 534 535 } 536 537 /*---------------------------------------------------------------------------- 538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 539 | and significand `zSig', and returns the proper double-precision floating- 540 | point value corresponding to the abstract input. This routine is just like 541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 543 | floating-point exponent. 544 *----------------------------------------------------------------------------*/ 545 546 static float64 547 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) 548 { 549 int8 shiftCount; 550 551 shiftCount = countLeadingZeros64( zSig ) - 1; 552 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 553 554 } 555 556 #ifdef FLOATX80 557 558 /*---------------------------------------------------------------------------- 559 | Returns the fraction bits of the extended double-precision floating-point 560 | value `a'. 561 *----------------------------------------------------------------------------*/ 562 563 INLINE bits64 extractFloatx80Frac( floatx80 a ) 564 { 565 566 return a.low; 567 568 } 569 570 /*---------------------------------------------------------------------------- 571 | Returns the exponent bits of the extended double-precision floating-point 572 | value `a'. 573 *----------------------------------------------------------------------------*/ 574 575 INLINE int32 extractFloatx80Exp( floatx80 a ) 576 { 577 578 return a.high & 0x7FFF; 579 580 } 581 582 /*---------------------------------------------------------------------------- 583 | Returns the sign bit of the extended double-precision floating-point value 584 | `a'. 585 *----------------------------------------------------------------------------*/ 586 587 INLINE flag extractFloatx80Sign( floatx80 a ) 588 { 589 590 return a.high>>15; 591 592 } 593 594 /*---------------------------------------------------------------------------- 595 | Normalizes the subnormal extended double-precision floating-point value 596 | represented by the denormalized significand `aSig'. The normalized exponent 597 | and significand are stored at the locations pointed to by `zExpPtr' and 598 | `zSigPtr', respectively. 599 *----------------------------------------------------------------------------*/ 600 601 static void 602 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) 603 { 604 int8 shiftCount; 605 606 shiftCount = countLeadingZeros64( aSig ); 607 *zSigPtr = aSig<<shiftCount; 608 *zExpPtr = 1 - shiftCount; 609 610 } 611 612 /*---------------------------------------------------------------------------- 613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 614 | extended double-precision floating-point value, returning the result. 615 *----------------------------------------------------------------------------*/ 616 617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) 618 { 619 floatx80 z; 620 621 z.low = zSig; 622 z.high = ( ( (bits16) zSign )<<15 ) + zExp; 623 return z; 624 625 } 626 627 /*---------------------------------------------------------------------------- 628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 629 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 630 | and returns the proper extended double-precision floating-point value 631 | corresponding to the abstract input. Ordinarily, the abstract value is 632 | rounded and packed into the extended double-precision format, with the 633 | inexact exception raised if the abstract input cannot be represented 634 | exactly. However, if the abstract value is too large, the overflow and 635 | inexact exceptions are raised and an infinity or maximal finite value is 636 | returned. If the abstract value is too small, the input value is rounded to 637 | a subnormal number, and the underflow and inexact exceptions are raised if 638 | the abstract input cannot be represented exactly as a subnormal extended 639 | double-precision floating-point number. 640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 641 | number of bits as single or double precision, respectively. Otherwise, the 642 | result is rounded to the full precision of the extended double-precision 643 | format. 644 | The input significand must be normalized or smaller. If the input 645 | significand is not normalized, `zExp' must be 0; in that case, the result 646 | returned is a subnormal number, and it must not require rounding. The 647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 648 | Floating-Point Arithmetic. 649 *----------------------------------------------------------------------------*/ 650 651 static floatx80 652 roundAndPackFloatx80( 653 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 654 STATUS_PARAM) 655 { 656 int8 roundingMode; 657 flag roundNearestEven, increment, isTiny; 658 int64 roundIncrement, roundMask, roundBits; 659 660 roundingMode = STATUS(float_rounding_mode); 661 roundNearestEven = ( roundingMode == float_round_nearest_even ); 662 if ( roundingPrecision == 80 ) goto precision80; 663 if ( roundingPrecision == 64 ) { 664 roundIncrement = LIT64( 0x0000000000000400 ); 665 roundMask = LIT64( 0x00000000000007FF ); 666 } 667 else if ( roundingPrecision == 32 ) { 668 roundIncrement = LIT64( 0x0000008000000000 ); 669 roundMask = LIT64( 0x000000FFFFFFFFFF ); 670 } 671 else { 672 goto precision80; 673 } 674 zSig0 |= ( zSig1 != 0 ); 675 if ( ! roundNearestEven ) { 676 if ( roundingMode == float_round_to_zero ) { 677 roundIncrement = 0; 678 } 679 else { 680 roundIncrement = roundMask; 681 if ( zSign ) { 682 if ( roundingMode == float_round_up ) roundIncrement = 0; 683 } 684 else { 685 if ( roundingMode == float_round_down ) roundIncrement = 0; 686 } 687 } 688 } 689 roundBits = zSig0 & roundMask; 690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 691 if ( ( 0x7FFE < zExp ) 692 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 693 ) { 694 goto overflow; 695 } 696 if ( zExp <= 0 ) { 697 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 ); 698 isTiny = 699 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 700 || ( zExp < 0 ) 701 || ( zSig0 <= zSig0 + roundIncrement ); 702 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 703 zExp = 0; 704 roundBits = zSig0 & roundMask; 705 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 706 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 707 zSig0 += roundIncrement; 708 if ( (sbits64) zSig0 < 0 ) zExp = 1; 709 roundIncrement = roundMask + 1; 710 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 711 roundMask |= roundIncrement; 712 } 713 zSig0 &= ~ roundMask; 714 return packFloatx80( zSign, zExp, zSig0 ); 715 } 716 } 717 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 718 zSig0 += roundIncrement; 719 if ( zSig0 < roundIncrement ) { 720 ++zExp; 721 zSig0 = LIT64( 0x8000000000000000 ); 722 } 723 roundIncrement = roundMask + 1; 724 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 725 roundMask |= roundIncrement; 726 } 727 zSig0 &= ~ roundMask; 728 if ( zSig0 == 0 ) zExp = 0; 729 return packFloatx80( zSign, zExp, zSig0 ); 730 precision80: 731 increment = ( (sbits64) zSig1 < 0 ); 732 if ( ! roundNearestEven ) { 733 if ( roundingMode == float_round_to_zero ) { 734 increment = 0; 735 } 736 else { 737 if ( zSign ) { 738 increment = ( roundingMode == float_round_down ) && zSig1; 739 } 740 else { 741 increment = ( roundingMode == float_round_up ) && zSig1; 742 } 743 } 744 } 745 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { 746 if ( ( 0x7FFE < zExp ) 747 || ( ( zExp == 0x7FFE ) 748 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 749 && increment 750 ) 751 ) { 752 roundMask = 0; 753 overflow: 754 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 755 if ( ( roundingMode == float_round_to_zero ) 756 || ( zSign && ( roundingMode == float_round_up ) ) 757 || ( ! zSign && ( roundingMode == float_round_down ) ) 758 ) { 759 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 760 } 761 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 762 } 763 if ( zExp <= 0 ) { 764 isTiny = 765 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 766 || ( zExp < 0 ) 767 || ! increment 768 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 769 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 770 zExp = 0; 771 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 772 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 773 if ( roundNearestEven ) { 774 increment = ( (sbits64) zSig1 < 0 ); 775 } 776 else { 777 if ( zSign ) { 778 increment = ( roundingMode == float_round_down ) && zSig1; 779 } 780 else { 781 increment = ( roundingMode == float_round_up ) && zSig1; 782 } 783 } 784 if ( increment ) { 785 ++zSig0; 786 zSig0 &= 787 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 788 if ( (sbits64) zSig0 < 0 ) zExp = 1; 789 } 790 return packFloatx80( zSign, zExp, zSig0 ); 791 } 792 } 793 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 794 if ( increment ) { 795 ++zSig0; 796 if ( zSig0 == 0 ) { 797 ++zExp; 798 zSig0 = LIT64( 0x8000000000000000 ); 799 } 800 else { 801 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 802 } 803 } 804 else { 805 if ( zSig0 == 0 ) zExp = 0; 806 } 807 return packFloatx80( zSign, zExp, zSig0 ); 808 809 } 810 811 /*---------------------------------------------------------------------------- 812 | Takes an abstract floating-point value having sign `zSign', exponent 813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 814 | and returns the proper extended double-precision floating-point value 815 | corresponding to the abstract input. This routine is just like 816 | `roundAndPackFloatx80' except that the input significand does not have to be 817 | normalized. 818 *----------------------------------------------------------------------------*/ 819 820 static floatx80 821 normalizeRoundAndPackFloatx80( 822 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 823 STATUS_PARAM) 824 { 825 int8 shiftCount; 826 827 if ( zSig0 == 0 ) { 828 zSig0 = zSig1; 829 zSig1 = 0; 830 zExp -= 64; 831 } 832 shiftCount = countLeadingZeros64( zSig0 ); 833 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 834 zExp -= shiftCount; 835 return 836 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 837 838 } 839 840 #endif 841 842 #ifdef FLOAT128 843 844 /*---------------------------------------------------------------------------- 845 | Returns the least-significant 64 fraction bits of the quadruple-precision 846 | floating-point value `a'. 847 *----------------------------------------------------------------------------*/ 848 849 INLINE bits64 extractFloat128Frac1( float128 a ) 850 { 851 852 return a.low; 853 854 } 855 856 /*---------------------------------------------------------------------------- 857 | Returns the most-significant 48 fraction bits of the quadruple-precision 858 | floating-point value `a'. 859 *----------------------------------------------------------------------------*/ 860 861 INLINE bits64 extractFloat128Frac0( float128 a ) 862 { 863 864 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 865 866 } 867 868 /*---------------------------------------------------------------------------- 869 | Returns the exponent bits of the quadruple-precision floating-point value 870 | `a'. 871 *----------------------------------------------------------------------------*/ 872 873 INLINE int32 extractFloat128Exp( float128 a ) 874 { 875 876 return ( a.high>>48 ) & 0x7FFF; 877 878 } 879 880 /*---------------------------------------------------------------------------- 881 | Returns the sign bit of the quadruple-precision floating-point value `a'. 882 *----------------------------------------------------------------------------*/ 883 884 INLINE flag extractFloat128Sign( float128 a ) 885 { 886 887 return a.high>>63; 888 889 } 890 891 /*---------------------------------------------------------------------------- 892 | Normalizes the subnormal quadruple-precision floating-point value 893 | represented by the denormalized significand formed by the concatenation of 894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 896 | significand are stored at the location pointed to by `zSig0Ptr', and the 897 | least significant 64 bits of the normalized significand are stored at the 898 | location pointed to by `zSig1Ptr'. 899 *----------------------------------------------------------------------------*/ 900 901 static void 902 normalizeFloat128Subnormal( 903 bits64 aSig0, 904 bits64 aSig1, 905 int32 *zExpPtr, 906 bits64 *zSig0Ptr, 907 bits64 *zSig1Ptr 908 ) 909 { 910 int8 shiftCount; 911 912 if ( aSig0 == 0 ) { 913 shiftCount = countLeadingZeros64( aSig1 ) - 15; 914 if ( shiftCount < 0 ) { 915 *zSig0Ptr = aSig1>>( - shiftCount ); 916 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 917 } 918 else { 919 *zSig0Ptr = aSig1<<shiftCount; 920 *zSig1Ptr = 0; 921 } 922 *zExpPtr = - shiftCount - 63; 923 } 924 else { 925 shiftCount = countLeadingZeros64( aSig0 ) - 15; 926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 927 *zExpPtr = 1 - shiftCount; 928 } 929 930 } 931 932 /*---------------------------------------------------------------------------- 933 | Packs the sign `zSign', the exponent `zExp', and the significand formed 934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 935 | floating-point value, returning the result. After being shifted into the 936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 937 | added together to form the most significant 32 bits of the result. This 938 | means that any integer portion of `zSig0' will be added into the exponent. 939 | Since a properly normalized significand will have an integer portion equal 940 | to 1, the `zExp' input should be 1 less than the desired result exponent 941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 942 | significand. 943 *----------------------------------------------------------------------------*/ 944 945 INLINE float128 946 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) 947 { 948 float128 z; 949 950 z.low = zSig1; 951 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; 952 return z; 953 954 } 955 956 /*---------------------------------------------------------------------------- 957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 958 | and extended significand formed by the concatenation of `zSig0', `zSig1', 959 | and `zSig2', and returns the proper quadruple-precision floating-point value 960 | corresponding to the abstract input. Ordinarily, the abstract value is 961 | simply rounded and packed into the quadruple-precision format, with the 962 | inexact exception raised if the abstract input cannot be represented 963 | exactly. However, if the abstract value is too large, the overflow and 964 | inexact exceptions are raised and an infinity or maximal finite value is 965 | returned. If the abstract value is too small, the input value is rounded to 966 | a subnormal number, and the underflow and inexact exceptions are raised if 967 | the abstract input cannot be represented exactly as a subnormal quadruple- 968 | precision floating-point number. 969 | The input significand must be normalized or smaller. If the input 970 | significand is not normalized, `zExp' must be 0; in that case, the result 971 | returned is a subnormal number, and it must not require rounding. In the 972 | usual case that the input significand is normalized, `zExp' must be 1 less 973 | than the ``true'' floating-point exponent. The handling of underflow and 974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 975 *----------------------------------------------------------------------------*/ 976 977 static float128 978 roundAndPackFloat128( 979 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM) 980 { 981 int8 roundingMode; 982 flag roundNearestEven, increment, isTiny; 983 984 roundingMode = STATUS(float_rounding_mode); 985 roundNearestEven = ( roundingMode == float_round_nearest_even ); 986 increment = ( (sbits64) zSig2 < 0 ); 987 if ( ! roundNearestEven ) { 988 if ( roundingMode == float_round_to_zero ) { 989 increment = 0; 990 } 991 else { 992 if ( zSign ) { 993 increment = ( roundingMode == float_round_down ) && zSig2; 994 } 995 else { 996 increment = ( roundingMode == float_round_up ) && zSig2; 997 } 998 } 999 } 1000 if ( 0x7FFD <= (bits32) zExp ) { 1001 if ( ( 0x7FFD < zExp ) 1002 || ( ( zExp == 0x7FFD ) 1003 && eq128( 1004 LIT64( 0x0001FFFFFFFFFFFF ), 1005 LIT64( 0xFFFFFFFFFFFFFFFF ), 1006 zSig0, 1007 zSig1 1008 ) 1009 && increment 1010 ) 1011 ) { 1012 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 1013 if ( ( roundingMode == float_round_to_zero ) 1014 || ( zSign && ( roundingMode == float_round_up ) ) 1015 || ( ! zSign && ( roundingMode == float_round_down ) ) 1016 ) { 1017 return 1018 packFloat128( 1019 zSign, 1020 0x7FFE, 1021 LIT64( 0x0000FFFFFFFFFFFF ), 1022 LIT64( 0xFFFFFFFFFFFFFFFF ) 1023 ); 1024 } 1025 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1026 } 1027 if ( zExp < 0 ) { 1028 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); 1029 isTiny = 1030 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 1031 || ( zExp < -1 ) 1032 || ! increment 1033 || lt128( 1034 zSig0, 1035 zSig1, 1036 LIT64( 0x0001FFFFFFFFFFFF ), 1037 LIT64( 0xFFFFFFFFFFFFFFFF ) 1038 ); 1039 shift128ExtraRightJamming( 1040 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1041 zExp = 0; 1042 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 1043 if ( roundNearestEven ) { 1044 increment = ( (sbits64) zSig2 < 0 ); 1045 } 1046 else { 1047 if ( zSign ) { 1048 increment = ( roundingMode == float_round_down ) && zSig2; 1049 } 1050 else { 1051 increment = ( roundingMode == float_round_up ) && zSig2; 1052 } 1053 } 1054 } 1055 } 1056 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 1057 if ( increment ) { 1058 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1059 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1060 } 1061 else { 1062 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1063 } 1064 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1065 1066 } 1067 1068 /*---------------------------------------------------------------------------- 1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1071 | returns the proper quadruple-precision floating-point value corresponding 1072 | to the abstract input. This routine is just like `roundAndPackFloat128' 1073 | except that the input significand has fewer bits and does not have to be 1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1075 | point exponent. 1076 *----------------------------------------------------------------------------*/ 1077 1078 static float128 1079 normalizeRoundAndPackFloat128( 1080 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM) 1081 { 1082 int8 shiftCount; 1083 bits64 zSig2; 1084 1085 if ( zSig0 == 0 ) { 1086 zSig0 = zSig1; 1087 zSig1 = 0; 1088 zExp -= 64; 1089 } 1090 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1091 if ( 0 <= shiftCount ) { 1092 zSig2 = 0; 1093 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1094 } 1095 else { 1096 shift128ExtraRightJamming( 1097 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1098 } 1099 zExp -= shiftCount; 1100 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1101 1102 } 1103 1104 #endif 1105 1106 /*---------------------------------------------------------------------------- 1107 | Returns the result of converting the 32-bit two's complement integer `a' 1108 | to the single-precision floating-point format. The conversion is performed 1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1110 *----------------------------------------------------------------------------*/ 1111 1112 float32 int32_to_float32( int32 a STATUS_PARAM ) 1113 { 1114 flag zSign; 1115 1116 if ( a == 0 ) return float32_zero; 1117 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1118 zSign = ( a < 0 ); 1119 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1120 1121 } 1122 1123 /*---------------------------------------------------------------------------- 1124 | Returns the result of converting the 32-bit two's complement integer `a' 1125 | to the double-precision floating-point format. The conversion is performed 1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1127 *----------------------------------------------------------------------------*/ 1128 1129 float64 int32_to_float64( int32 a STATUS_PARAM ) 1130 { 1131 flag zSign; 1132 uint32 absA; 1133 int8 shiftCount; 1134 bits64 zSig; 1135 1136 if ( a == 0 ) return float64_zero; 1137 zSign = ( a < 0 ); 1138 absA = zSign ? - a : a; 1139 shiftCount = countLeadingZeros32( absA ) + 21; 1140 zSig = absA; 1141 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1142 1143 } 1144 1145 #ifdef FLOATX80 1146 1147 /*---------------------------------------------------------------------------- 1148 | Returns the result of converting the 32-bit two's complement integer `a' 1149 | to the extended double-precision floating-point format. The conversion 1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1151 | Arithmetic. 1152 *----------------------------------------------------------------------------*/ 1153 1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) 1155 { 1156 flag zSign; 1157 uint32 absA; 1158 int8 shiftCount; 1159 bits64 zSig; 1160 1161 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1162 zSign = ( a < 0 ); 1163 absA = zSign ? - a : a; 1164 shiftCount = countLeadingZeros32( absA ) + 32; 1165 zSig = absA; 1166 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1167 1168 } 1169 1170 #endif 1171 1172 #ifdef FLOAT128 1173 1174 /*---------------------------------------------------------------------------- 1175 | Returns the result of converting the 32-bit two's complement integer `a' to 1176 | the quadruple-precision floating-point format. The conversion is performed 1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1178 *----------------------------------------------------------------------------*/ 1179 1180 float128 int32_to_float128( int32 a STATUS_PARAM ) 1181 { 1182 flag zSign; 1183 uint32 absA; 1184 int8 shiftCount; 1185 bits64 zSig0; 1186 1187 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1188 zSign = ( a < 0 ); 1189 absA = zSign ? - a : a; 1190 shiftCount = countLeadingZeros32( absA ) + 17; 1191 zSig0 = absA; 1192 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1193 1194 } 1195 1196 #endif 1197 1198 /*---------------------------------------------------------------------------- 1199 | Returns the result of converting the 64-bit two's complement integer `a' 1200 | to the single-precision floating-point format. The conversion is performed 1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1202 *----------------------------------------------------------------------------*/ 1203 1204 float32 int64_to_float32( int64 a STATUS_PARAM ) 1205 { 1206 flag zSign; 1207 uint64 absA; 1208 int8 shiftCount; 1209 1210 if ( a == 0 ) return float32_zero; 1211 zSign = ( a < 0 ); 1212 absA = zSign ? - a : a; 1213 shiftCount = countLeadingZeros64( absA ) - 40; 1214 if ( 0 <= shiftCount ) { 1215 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1216 } 1217 else { 1218 shiftCount += 7; 1219 if ( shiftCount < 0 ) { 1220 shift64RightJamming( absA, - shiftCount, &absA ); 1221 } 1222 else { 1223 absA <<= shiftCount; 1224 } 1225 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1226 } 1227 1228 } 1229 1230 float32 uint64_to_float32( uint64 a STATUS_PARAM ) 1231 { 1232 int8 shiftCount; 1233 1234 if ( a == 0 ) return float32_zero; 1235 shiftCount = countLeadingZeros64( a ) - 40; 1236 if ( 0 <= shiftCount ) { 1237 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); 1238 } 1239 else { 1240 shiftCount += 7; 1241 if ( shiftCount < 0 ) { 1242 shift64RightJamming( a, - shiftCount, &a ); 1243 } 1244 else { 1245 a <<= shiftCount; 1246 } 1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); 1248 } 1249 } 1250 1251 /*---------------------------------------------------------------------------- 1252 | Returns the result of converting the 64-bit two's complement integer `a' 1253 | to the double-precision floating-point format. The conversion is performed 1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1255 *----------------------------------------------------------------------------*/ 1256 1257 float64 int64_to_float64( int64 a STATUS_PARAM ) 1258 { 1259 flag zSign; 1260 1261 if ( a == 0 ) return float64_zero; 1262 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { 1263 return packFloat64( 1, 0x43E, 0 ); 1264 } 1265 zSign = ( a < 0 ); 1266 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1267 1268 } 1269 1270 float64 uint64_to_float64( uint64 a STATUS_PARAM ) 1271 { 1272 if ( a == 0 ) return float64_zero; 1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); 1274 1275 } 1276 1277 #ifdef FLOATX80 1278 1279 /*---------------------------------------------------------------------------- 1280 | Returns the result of converting the 64-bit two's complement integer `a' 1281 | to the extended double-precision floating-point format. The conversion 1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1283 | Arithmetic. 1284 *----------------------------------------------------------------------------*/ 1285 1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) 1287 { 1288 flag zSign; 1289 uint64 absA; 1290 int8 shiftCount; 1291 1292 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1293 zSign = ( a < 0 ); 1294 absA = zSign ? - a : a; 1295 shiftCount = countLeadingZeros64( absA ); 1296 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1297 1298 } 1299 1300 #endif 1301 1302 #ifdef FLOAT128 1303 1304 /*---------------------------------------------------------------------------- 1305 | Returns the result of converting the 64-bit two's complement integer `a' to 1306 | the quadruple-precision floating-point format. The conversion is performed 1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1308 *----------------------------------------------------------------------------*/ 1309 1310 float128 int64_to_float128( int64 a STATUS_PARAM ) 1311 { 1312 flag zSign; 1313 uint64 absA; 1314 int8 shiftCount; 1315 int32 zExp; 1316 bits64 zSig0, zSig1; 1317 1318 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1319 zSign = ( a < 0 ); 1320 absA = zSign ? - a : a; 1321 shiftCount = countLeadingZeros64( absA ) + 49; 1322 zExp = 0x406E - shiftCount; 1323 if ( 64 <= shiftCount ) { 1324 zSig1 = 0; 1325 zSig0 = absA; 1326 shiftCount -= 64; 1327 } 1328 else { 1329 zSig1 = absA; 1330 zSig0 = 0; 1331 } 1332 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1333 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1334 1335 } 1336 1337 #endif 1338 1339 /*---------------------------------------------------------------------------- 1340 | Returns the result of converting the single-precision floating-point value 1341 | `a' to the 32-bit two's complement integer format. The conversion is 1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1343 | Arithmetic---which means in particular that the conversion is rounded 1344 | according to the current rounding mode. If `a' is a NaN, the largest 1345 | positive integer is returned. Otherwise, if the conversion overflows, the 1346 | largest integer with the same sign as `a' is returned. 1347 *----------------------------------------------------------------------------*/ 1348 1349 int32 float32_to_int32( float32 a STATUS_PARAM ) 1350 { 1351 flag aSign; 1352 int16 aExp, shiftCount; 1353 bits32 aSig; 1354 bits64 aSig64; 1355 1356 a = float32_squash_input_denormal(a STATUS_VAR); 1357 aSig = extractFloat32Frac( a ); 1358 aExp = extractFloat32Exp( a ); 1359 aSign = extractFloat32Sign( a ); 1360 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1361 if ( aExp ) aSig |= 0x00800000; 1362 shiftCount = 0xAF - aExp; 1363 aSig64 = aSig; 1364 aSig64 <<= 32; 1365 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1366 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1367 1368 } 1369 1370 /*---------------------------------------------------------------------------- 1371 | Returns the result of converting the single-precision floating-point value 1372 | `a' to the 32-bit two's complement integer format. The conversion is 1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1374 | Arithmetic, except that the conversion is always rounded toward zero. 1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1376 | the conversion overflows, the largest integer with the same sign as `a' is 1377 | returned. 1378 *----------------------------------------------------------------------------*/ 1379 1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1381 { 1382 flag aSign; 1383 int16 aExp, shiftCount; 1384 bits32 aSig; 1385 int32 z; 1386 a = float32_squash_input_denormal(a STATUS_VAR); 1387 1388 aSig = extractFloat32Frac( a ); 1389 aExp = extractFloat32Exp( a ); 1390 aSign = extractFloat32Sign( a ); 1391 shiftCount = aExp - 0x9E; 1392 if ( 0 <= shiftCount ) { 1393 if ( float32_val(a) != 0xCF000000 ) { 1394 float_raise( float_flag_invalid STATUS_VAR); 1395 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1396 } 1397 return (sbits32) 0x80000000; 1398 } 1399 else if ( aExp <= 0x7E ) { 1400 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1401 return 0; 1402 } 1403 aSig = ( aSig | 0x00800000 )<<8; 1404 z = aSig>>( - shiftCount ); 1405 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1406 STATUS(float_exception_flags) |= float_flag_inexact; 1407 } 1408 if ( aSign ) z = - z; 1409 return z; 1410 1411 } 1412 1413 /*---------------------------------------------------------------------------- 1414 | Returns the result of converting the single-precision floating-point value 1415 | `a' to the 16-bit two's complement integer format. The conversion is 1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1417 | Arithmetic, except that the conversion is always rounded toward zero. 1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1419 | the conversion overflows, the largest integer with the same sign as `a' is 1420 | returned. 1421 *----------------------------------------------------------------------------*/ 1422 1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM ) 1424 { 1425 flag aSign; 1426 int16 aExp, shiftCount; 1427 bits32 aSig; 1428 int32 z; 1429 1430 aSig = extractFloat32Frac( a ); 1431 aExp = extractFloat32Exp( a ); 1432 aSign = extractFloat32Sign( a ); 1433 shiftCount = aExp - 0x8E; 1434 if ( 0 <= shiftCount ) { 1435 if ( float32_val(a) != 0xC7000000 ) { 1436 float_raise( float_flag_invalid STATUS_VAR); 1437 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1438 return 0x7FFF; 1439 } 1440 } 1441 return (sbits32) 0xffff8000; 1442 } 1443 else if ( aExp <= 0x7E ) { 1444 if ( aExp | aSig ) { 1445 STATUS(float_exception_flags) |= float_flag_inexact; 1446 } 1447 return 0; 1448 } 1449 shiftCount -= 0x10; 1450 aSig = ( aSig | 0x00800000 )<<8; 1451 z = aSig>>( - shiftCount ); 1452 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { 1453 STATUS(float_exception_flags) |= float_flag_inexact; 1454 } 1455 if ( aSign ) { 1456 z = - z; 1457 } 1458 return z; 1459 1460 } 1461 1462 /*---------------------------------------------------------------------------- 1463 | Returns the result of converting the single-precision floating-point value 1464 | `a' to the 64-bit two's complement integer format. The conversion is 1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1466 | Arithmetic---which means in particular that the conversion is rounded 1467 | according to the current rounding mode. If `a' is a NaN, the largest 1468 | positive integer is returned. Otherwise, if the conversion overflows, the 1469 | largest integer with the same sign as `a' is returned. 1470 *----------------------------------------------------------------------------*/ 1471 1472 int64 float32_to_int64( float32 a STATUS_PARAM ) 1473 { 1474 flag aSign; 1475 int16 aExp, shiftCount; 1476 bits32 aSig; 1477 bits64 aSig64, aSigExtra; 1478 a = float32_squash_input_denormal(a STATUS_VAR); 1479 1480 aSig = extractFloat32Frac( a ); 1481 aExp = extractFloat32Exp( a ); 1482 aSign = extractFloat32Sign( a ); 1483 shiftCount = 0xBE - aExp; 1484 if ( shiftCount < 0 ) { 1485 float_raise( float_flag_invalid STATUS_VAR); 1486 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1487 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1488 } 1489 return (sbits64) LIT64( 0x8000000000000000 ); 1490 } 1491 if ( aExp ) aSig |= 0x00800000; 1492 aSig64 = aSig; 1493 aSig64 <<= 40; 1494 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1495 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1496 1497 } 1498 1499 /*---------------------------------------------------------------------------- 1500 | Returns the result of converting the single-precision floating-point value 1501 | `a' to the 64-bit two's complement integer format. The conversion is 1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1503 | Arithmetic, except that the conversion is always rounded toward zero. If 1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1505 | conversion overflows, the largest integer with the same sign as `a' is 1506 | returned. 1507 *----------------------------------------------------------------------------*/ 1508 1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1510 { 1511 flag aSign; 1512 int16 aExp, shiftCount; 1513 bits32 aSig; 1514 bits64 aSig64; 1515 int64 z; 1516 a = float32_squash_input_denormal(a STATUS_VAR); 1517 1518 aSig = extractFloat32Frac( a ); 1519 aExp = extractFloat32Exp( a ); 1520 aSign = extractFloat32Sign( a ); 1521 shiftCount = aExp - 0xBE; 1522 if ( 0 <= shiftCount ) { 1523 if ( float32_val(a) != 0xDF000000 ) { 1524 float_raise( float_flag_invalid STATUS_VAR); 1525 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1526 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1527 } 1528 } 1529 return (sbits64) LIT64( 0x8000000000000000 ); 1530 } 1531 else if ( aExp <= 0x7E ) { 1532 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1533 return 0; 1534 } 1535 aSig64 = aSig | 0x00800000; 1536 aSig64 <<= 40; 1537 z = aSig64>>( - shiftCount ); 1538 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { 1539 STATUS(float_exception_flags) |= float_flag_inexact; 1540 } 1541 if ( aSign ) z = - z; 1542 return z; 1543 1544 } 1545 1546 /*---------------------------------------------------------------------------- 1547 | Returns the result of converting the single-precision floating-point value 1548 | `a' to the double-precision floating-point format. The conversion is 1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1550 | Arithmetic. 1551 *----------------------------------------------------------------------------*/ 1552 1553 float64 float32_to_float64( float32 a STATUS_PARAM ) 1554 { 1555 flag aSign; 1556 int16 aExp; 1557 bits32 aSig; 1558 a = float32_squash_input_denormal(a STATUS_VAR); 1559 1560 aSig = extractFloat32Frac( a ); 1561 aExp = extractFloat32Exp( a ); 1562 aSign = extractFloat32Sign( a ); 1563 if ( aExp == 0xFF ) { 1564 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1565 return packFloat64( aSign, 0x7FF, 0 ); 1566 } 1567 if ( aExp == 0 ) { 1568 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1569 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1570 --aExp; 1571 } 1572 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); 1573 1574 } 1575 1576 #ifdef FLOATX80 1577 1578 /*---------------------------------------------------------------------------- 1579 | Returns the result of converting the single-precision floating-point value 1580 | `a' to the extended double-precision floating-point format. The conversion 1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1582 | Arithmetic. 1583 *----------------------------------------------------------------------------*/ 1584 1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1586 { 1587 flag aSign; 1588 int16 aExp; 1589 bits32 aSig; 1590 1591 a = float32_squash_input_denormal(a STATUS_VAR); 1592 aSig = extractFloat32Frac( a ); 1593 aExp = extractFloat32Exp( a ); 1594 aSign = extractFloat32Sign( a ); 1595 if ( aExp == 0xFF ) { 1596 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1597 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1598 } 1599 if ( aExp == 0 ) { 1600 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1601 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1602 } 1603 aSig |= 0x00800000; 1604 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); 1605 1606 } 1607 1608 #endif 1609 1610 #ifdef FLOAT128 1611 1612 /*---------------------------------------------------------------------------- 1613 | Returns the result of converting the single-precision floating-point value 1614 | `a' to the double-precision floating-point format. The conversion is 1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1616 | Arithmetic. 1617 *----------------------------------------------------------------------------*/ 1618 1619 float128 float32_to_float128( float32 a STATUS_PARAM ) 1620 { 1621 flag aSign; 1622 int16 aExp; 1623 bits32 aSig; 1624 1625 a = float32_squash_input_denormal(a STATUS_VAR); 1626 aSig = extractFloat32Frac( a ); 1627 aExp = extractFloat32Exp( a ); 1628 aSign = extractFloat32Sign( a ); 1629 if ( aExp == 0xFF ) { 1630 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1631 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1632 } 1633 if ( aExp == 0 ) { 1634 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1635 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1636 --aExp; 1637 } 1638 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); 1639 1640 } 1641 1642 #endif 1643 1644 /*---------------------------------------------------------------------------- 1645 | Rounds the single-precision floating-point value `a' to an integer, and 1646 | returns the result as a single-precision floating-point value. The 1647 | operation is performed according to the IEC/IEEE Standard for Binary 1648 | Floating-Point Arithmetic. 1649 *----------------------------------------------------------------------------*/ 1650 1651 float32 float32_round_to_int( float32 a STATUS_PARAM) 1652 { 1653 flag aSign; 1654 int16 aExp; 1655 bits32 lastBitMask, roundBitsMask; 1656 int8 roundingMode; 1657 bits32 z; 1658 a = float32_squash_input_denormal(a STATUS_VAR); 1659 1660 aExp = extractFloat32Exp( a ); 1661 if ( 0x96 <= aExp ) { 1662 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1663 return propagateFloat32NaN( a, a STATUS_VAR ); 1664 } 1665 return a; 1666 } 1667 if ( aExp <= 0x7E ) { 1668 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a; 1669 STATUS(float_exception_flags) |= float_flag_inexact; 1670 aSign = extractFloat32Sign( a ); 1671 switch ( STATUS(float_rounding_mode) ) { 1672 case float_round_nearest_even: 1673 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1674 return packFloat32( aSign, 0x7F, 0 ); 1675 } 1676 break; 1677 case float_round_down: 1678 return make_float32(aSign ? 0xBF800000 : 0); 1679 case float_round_up: 1680 return make_float32(aSign ? 0x80000000 : 0x3F800000); 1681 } 1682 return packFloat32( aSign, 0, 0 ); 1683 } 1684 lastBitMask = 1; 1685 lastBitMask <<= 0x96 - aExp; 1686 roundBitsMask = lastBitMask - 1; 1687 z = float32_val(a); 1688 roundingMode = STATUS(float_rounding_mode); 1689 if ( roundingMode == float_round_nearest_even ) { 1690 z += lastBitMask>>1; 1691 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1692 } 1693 else if ( roundingMode != float_round_to_zero ) { 1694 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { 1695 z += roundBitsMask; 1696 } 1697 } 1698 z &= ~ roundBitsMask; 1699 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1700 return make_float32(z); 1701 1702 } 1703 1704 /*---------------------------------------------------------------------------- 1705 | Returns the result of adding the absolute values of the single-precision 1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1707 | before being returned. `zSign' is ignored if the result is a NaN. 1708 | The addition is performed according to the IEC/IEEE Standard for Binary 1709 | Floating-Point Arithmetic. 1710 *----------------------------------------------------------------------------*/ 1711 1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1713 { 1714 int16 aExp, bExp, zExp; 1715 bits32 aSig, bSig, zSig; 1716 int16 expDiff; 1717 1718 aSig = extractFloat32Frac( a ); 1719 aExp = extractFloat32Exp( a ); 1720 bSig = extractFloat32Frac( b ); 1721 bExp = extractFloat32Exp( b ); 1722 expDiff = aExp - bExp; 1723 aSig <<= 6; 1724 bSig <<= 6; 1725 if ( 0 < expDiff ) { 1726 if ( aExp == 0xFF ) { 1727 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1728 return a; 1729 } 1730 if ( bExp == 0 ) { 1731 --expDiff; 1732 } 1733 else { 1734 bSig |= 0x20000000; 1735 } 1736 shift32RightJamming( bSig, expDiff, &bSig ); 1737 zExp = aExp; 1738 } 1739 else if ( expDiff < 0 ) { 1740 if ( bExp == 0xFF ) { 1741 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1742 return packFloat32( zSign, 0xFF, 0 ); 1743 } 1744 if ( aExp == 0 ) { 1745 ++expDiff; 1746 } 1747 else { 1748 aSig |= 0x20000000; 1749 } 1750 shift32RightJamming( aSig, - expDiff, &aSig ); 1751 zExp = bExp; 1752 } 1753 else { 1754 if ( aExp == 0xFF ) { 1755 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1756 return a; 1757 } 1758 if ( aExp == 0 ) { 1759 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); 1760 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1761 } 1762 zSig = 0x40000000 + aSig + bSig; 1763 zExp = aExp; 1764 goto roundAndPack; 1765 } 1766 aSig |= 0x20000000; 1767 zSig = ( aSig + bSig )<<1; 1768 --zExp; 1769 if ( (sbits32) zSig < 0 ) { 1770 zSig = aSig + bSig; 1771 ++zExp; 1772 } 1773 roundAndPack: 1774 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1775 1776 } 1777 1778 /*---------------------------------------------------------------------------- 1779 | Returns the result of subtracting the absolute values of the single- 1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1781 | difference is negated before being returned. `zSign' is ignored if the 1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1783 | Standard for Binary Floating-Point Arithmetic. 1784 *----------------------------------------------------------------------------*/ 1785 1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1787 { 1788 int16 aExp, bExp, zExp; 1789 bits32 aSig, bSig, zSig; 1790 int16 expDiff; 1791 1792 aSig = extractFloat32Frac( a ); 1793 aExp = extractFloat32Exp( a ); 1794 bSig = extractFloat32Frac( b ); 1795 bExp = extractFloat32Exp( b ); 1796 expDiff = aExp - bExp; 1797 aSig <<= 7; 1798 bSig <<= 7; 1799 if ( 0 < expDiff ) goto aExpBigger; 1800 if ( expDiff < 0 ) goto bExpBigger; 1801 if ( aExp == 0xFF ) { 1802 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1803 float_raise( float_flag_invalid STATUS_VAR); 1804 return float32_default_nan; 1805 } 1806 if ( aExp == 0 ) { 1807 aExp = 1; 1808 bExp = 1; 1809 } 1810 if ( bSig < aSig ) goto aBigger; 1811 if ( aSig < bSig ) goto bBigger; 1812 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1813 bExpBigger: 1814 if ( bExp == 0xFF ) { 1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1816 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1817 } 1818 if ( aExp == 0 ) { 1819 ++expDiff; 1820 } 1821 else { 1822 aSig |= 0x40000000; 1823 } 1824 shift32RightJamming( aSig, - expDiff, &aSig ); 1825 bSig |= 0x40000000; 1826 bBigger: 1827 zSig = bSig - aSig; 1828 zExp = bExp; 1829 zSign ^= 1; 1830 goto normalizeRoundAndPack; 1831 aExpBigger: 1832 if ( aExp == 0xFF ) { 1833 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1834 return a; 1835 } 1836 if ( bExp == 0 ) { 1837 --expDiff; 1838 } 1839 else { 1840 bSig |= 0x40000000; 1841 } 1842 shift32RightJamming( bSig, expDiff, &bSig ); 1843 aSig |= 0x40000000; 1844 aBigger: 1845 zSig = aSig - bSig; 1846 zExp = aExp; 1847 normalizeRoundAndPack: 1848 --zExp; 1849 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1850 1851 } 1852 1853 /*---------------------------------------------------------------------------- 1854 | Returns the result of adding the single-precision floating-point values `a' 1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1856 | Binary Floating-Point Arithmetic. 1857 *----------------------------------------------------------------------------*/ 1858 1859 float32 float32_add( float32 a, float32 b STATUS_PARAM ) 1860 { 1861 flag aSign, bSign; 1862 a = float32_squash_input_denormal(a STATUS_VAR); 1863 b = float32_squash_input_denormal(b STATUS_VAR); 1864 1865 aSign = extractFloat32Sign( a ); 1866 bSign = extractFloat32Sign( b ); 1867 if ( aSign == bSign ) { 1868 return addFloat32Sigs( a, b, aSign STATUS_VAR); 1869 } 1870 else { 1871 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1872 } 1873 1874 } 1875 1876 /*---------------------------------------------------------------------------- 1877 | Returns the result of subtracting the single-precision floating-point values 1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1879 | for Binary Floating-Point Arithmetic. 1880 *----------------------------------------------------------------------------*/ 1881 1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 1883 { 1884 flag aSign, bSign; 1885 a = float32_squash_input_denormal(a STATUS_VAR); 1886 b = float32_squash_input_denormal(b STATUS_VAR); 1887 1888 aSign = extractFloat32Sign( a ); 1889 bSign = extractFloat32Sign( b ); 1890 if ( aSign == bSign ) { 1891 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1892 } 1893 else { 1894 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 1895 } 1896 1897 } 1898 1899 /*---------------------------------------------------------------------------- 1900 | Returns the result of multiplying the single-precision floating-point values 1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1902 | for Binary Floating-Point Arithmetic. 1903 *----------------------------------------------------------------------------*/ 1904 1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 1906 { 1907 flag aSign, bSign, zSign; 1908 int16 aExp, bExp, zExp; 1909 bits32 aSig, bSig; 1910 bits64 zSig64; 1911 bits32 zSig; 1912 1913 a = float32_squash_input_denormal(a STATUS_VAR); 1914 b = float32_squash_input_denormal(b STATUS_VAR); 1915 1916 aSig = extractFloat32Frac( a ); 1917 aExp = extractFloat32Exp( a ); 1918 aSign = extractFloat32Sign( a ); 1919 bSig = extractFloat32Frac( b ); 1920 bExp = extractFloat32Exp( b ); 1921 bSign = extractFloat32Sign( b ); 1922 zSign = aSign ^ bSign; 1923 if ( aExp == 0xFF ) { 1924 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1925 return propagateFloat32NaN( a, b STATUS_VAR ); 1926 } 1927 if ( ( bExp | bSig ) == 0 ) { 1928 float_raise( float_flag_invalid STATUS_VAR); 1929 return float32_default_nan; 1930 } 1931 return packFloat32( zSign, 0xFF, 0 ); 1932 } 1933 if ( bExp == 0xFF ) { 1934 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1935 if ( ( aExp | aSig ) == 0 ) { 1936 float_raise( float_flag_invalid STATUS_VAR); 1937 return float32_default_nan; 1938 } 1939 return packFloat32( zSign, 0xFF, 0 ); 1940 } 1941 if ( aExp == 0 ) { 1942 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1943 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1944 } 1945 if ( bExp == 0 ) { 1946 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1947 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1948 } 1949 zExp = aExp + bExp - 0x7F; 1950 aSig = ( aSig | 0x00800000 )<<7; 1951 bSig = ( bSig | 0x00800000 )<<8; 1952 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); 1953 zSig = zSig64; 1954 if ( 0 <= (sbits32) ( zSig<<1 ) ) { 1955 zSig <<= 1; 1956 --zExp; 1957 } 1958 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1959 1960 } 1961 1962 /*---------------------------------------------------------------------------- 1963 | Returns the result of dividing the single-precision floating-point value `a' 1964 | by the corresponding value `b'. The operation is performed according to the 1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1966 *----------------------------------------------------------------------------*/ 1967 1968 float32 float32_div( float32 a, float32 b STATUS_PARAM ) 1969 { 1970 flag aSign, bSign, zSign; 1971 int16 aExp, bExp, zExp; 1972 bits32 aSig, bSig, zSig; 1973 a = float32_squash_input_denormal(a STATUS_VAR); 1974 b = float32_squash_input_denormal(b STATUS_VAR); 1975 1976 aSig = extractFloat32Frac( a ); 1977 aExp = extractFloat32Exp( a ); 1978 aSign = extractFloat32Sign( a ); 1979 bSig = extractFloat32Frac( b ); 1980 bExp = extractFloat32Exp( b ); 1981 bSign = extractFloat32Sign( b ); 1982 zSign = aSign ^ bSign; 1983 if ( aExp == 0xFF ) { 1984 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1985 if ( bExp == 0xFF ) { 1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1987 float_raise( float_flag_invalid STATUS_VAR); 1988 return float32_default_nan; 1989 } 1990 return packFloat32( zSign, 0xFF, 0 ); 1991 } 1992 if ( bExp == 0xFF ) { 1993 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1994 return packFloat32( zSign, 0, 0 ); 1995 } 1996 if ( bExp == 0 ) { 1997 if ( bSig == 0 ) { 1998 if ( ( aExp | aSig ) == 0 ) { 1999 float_raise( float_flag_invalid STATUS_VAR); 2000 return float32_default_nan; 2001 } 2002 float_raise( float_flag_divbyzero STATUS_VAR); 2003 return packFloat32( zSign, 0xFF, 0 ); 2004 } 2005 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2006 } 2007 if ( aExp == 0 ) { 2008 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2009 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2010 } 2011 zExp = aExp - bExp + 0x7D; 2012 aSig = ( aSig | 0x00800000 )<<7; 2013 bSig = ( bSig | 0x00800000 )<<8; 2014 if ( bSig <= ( aSig + aSig ) ) { 2015 aSig >>= 1; 2016 ++zExp; 2017 } 2018 zSig = ( ( (bits64) aSig )<<32 ) / bSig; 2019 if ( ( zSig & 0x3F ) == 0 ) { 2020 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); 2021 } 2022 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 2023 2024 } 2025 2026 /*---------------------------------------------------------------------------- 2027 | Returns the remainder of the single-precision floating-point value `a' 2028 | with respect to the corresponding value `b'. The operation is performed 2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2030 *----------------------------------------------------------------------------*/ 2031 2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 2033 { 2034 flag aSign, zSign; 2035 int16 aExp, bExp, expDiff; 2036 bits32 aSig, bSig; 2037 bits32 q; 2038 bits64 aSig64, bSig64, q64; 2039 bits32 alternateASig; 2040 sbits32 sigMean; 2041 a = float32_squash_input_denormal(a STATUS_VAR); 2042 b = float32_squash_input_denormal(b STATUS_VAR); 2043 2044 aSig = extractFloat32Frac( a ); 2045 aExp = extractFloat32Exp( a ); 2046 aSign = extractFloat32Sign( a ); 2047 bSig = extractFloat32Frac( b ); 2048 bExp = extractFloat32Exp( b ); 2049 if ( aExp == 0xFF ) { 2050 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2051 return propagateFloat32NaN( a, b STATUS_VAR ); 2052 } 2053 float_raise( float_flag_invalid STATUS_VAR); 2054 return float32_default_nan; 2055 } 2056 if ( bExp == 0xFF ) { 2057 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2058 return a; 2059 } 2060 if ( bExp == 0 ) { 2061 if ( bSig == 0 ) { 2062 float_raise( float_flag_invalid STATUS_VAR); 2063 return float32_default_nan; 2064 } 2065 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2066 } 2067 if ( aExp == 0 ) { 2068 if ( aSig == 0 ) return a; 2069 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2070 } 2071 expDiff = aExp - bExp; 2072 aSig |= 0x00800000; 2073 bSig |= 0x00800000; 2074 if ( expDiff < 32 ) { 2075 aSig <<= 8; 2076 bSig <<= 8; 2077 if ( expDiff < 0 ) { 2078 if ( expDiff < -1 ) return a; 2079 aSig >>= 1; 2080 } 2081 q = ( bSig <= aSig ); 2082 if ( q ) aSig -= bSig; 2083 if ( 0 < expDiff ) { 2084 q = ( ( (bits64) aSig )<<32 ) / bSig; 2085 q >>= 32 - expDiff; 2086 bSig >>= 2; 2087 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2088 } 2089 else { 2090 aSig >>= 2; 2091 bSig >>= 2; 2092 } 2093 } 2094 else { 2095 if ( bSig <= aSig ) aSig -= bSig; 2096 aSig64 = ( (bits64) aSig )<<40; 2097 bSig64 = ( (bits64) bSig )<<40; 2098 expDiff -= 64; 2099 while ( 0 < expDiff ) { 2100 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2101 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2102 aSig64 = - ( ( bSig * q64 )<<38 ); 2103 expDiff -= 62; 2104 } 2105 expDiff += 64; 2106 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2107 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2108 q = q64>>( 64 - expDiff ); 2109 bSig <<= 6; 2110 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2111 } 2112 do { 2113 alternateASig = aSig; 2114 ++q; 2115 aSig -= bSig; 2116 } while ( 0 <= (sbits32) aSig ); 2117 sigMean = aSig + alternateASig; 2118 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2119 aSig = alternateASig; 2120 } 2121 zSign = ( (sbits32) aSig < 0 ); 2122 if ( zSign ) aSig = - aSig; 2123 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2124 2125 } 2126 2127 /*---------------------------------------------------------------------------- 2128 | Returns the square root of the single-precision floating-point value `a'. 2129 | The operation is performed according to the IEC/IEEE Standard for Binary 2130 | Floating-Point Arithmetic. 2131 *----------------------------------------------------------------------------*/ 2132 2133 float32 float32_sqrt( float32 a STATUS_PARAM ) 2134 { 2135 flag aSign; 2136 int16 aExp, zExp; 2137 bits32 aSig, zSig; 2138 bits64 rem, term; 2139 a = float32_squash_input_denormal(a STATUS_VAR); 2140 2141 aSig = extractFloat32Frac( a ); 2142 aExp = extractFloat32Exp( a ); 2143 aSign = extractFloat32Sign( a ); 2144 if ( aExp == 0xFF ) { 2145 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2146 if ( ! aSign ) return a; 2147 float_raise( float_flag_invalid STATUS_VAR); 2148 return float32_default_nan; 2149 } 2150 if ( aSign ) { 2151 if ( ( aExp | aSig ) == 0 ) return a; 2152 float_raise( float_flag_invalid STATUS_VAR); 2153 return float32_default_nan; 2154 } 2155 if ( aExp == 0 ) { 2156 if ( aSig == 0 ) return float32_zero; 2157 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2158 } 2159 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2160 aSig = ( aSig | 0x00800000 )<<8; 2161 zSig = estimateSqrt32( aExp, aSig ) + 2; 2162 if ( ( zSig & 0x7F ) <= 5 ) { 2163 if ( zSig < 2 ) { 2164 zSig = 0x7FFFFFFF; 2165 goto roundAndPack; 2166 } 2167 aSig >>= aExp & 1; 2168 term = ( (bits64) zSig ) * zSig; 2169 rem = ( ( (bits64) aSig )<<32 ) - term; 2170 while ( (sbits64) rem < 0 ) { 2171 --zSig; 2172 rem += ( ( (bits64) zSig )<<1 ) | 1; 2173 } 2174 zSig |= ( rem != 0 ); 2175 } 2176 shift32RightJamming( zSig, 1, &zSig ); 2177 roundAndPack: 2178 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2179 2180 } 2181 2182 /*---------------------------------------------------------------------------- 2183 | Returns the binary exponential of the single-precision floating-point value 2184 | `a'. The operation is performed according to the IEC/IEEE Standard for 2185 | Binary Floating-Point Arithmetic. 2186 | 2187 | Uses the following identities: 2188 | 2189 | 1. ------------------------------------------------------------------------- 2190 | x x*ln(2) 2191 | 2 = e 2192 | 2193 | 2. ------------------------------------------------------------------------- 2194 | 2 3 4 5 n 2195 | x x x x x x x 2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2197 | 1! 2! 3! 4! 5! n! 2198 *----------------------------------------------------------------------------*/ 2199 2200 static const float64 float32_exp2_coefficients[15] = 2201 { 2202 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2203 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2204 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2205 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2206 const_float64( 0x3f81111111111111ll ), /* 5 */ 2207 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2208 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2209 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2210 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2211 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2212 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2213 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2214 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2215 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2216 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2217 }; 2218 2219 float32 float32_exp2( float32 a STATUS_PARAM ) 2220 { 2221 flag aSign; 2222 int16 aExp; 2223 bits32 aSig; 2224 float64 r, x, xn; 2225 int i; 2226 a = float32_squash_input_denormal(a STATUS_VAR); 2227 2228 aSig = extractFloat32Frac( a ); 2229 aExp = extractFloat32Exp( a ); 2230 aSign = extractFloat32Sign( a ); 2231 2232 if ( aExp == 0xFF) { 2233 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2234 return (aSign) ? float32_zero : a; 2235 } 2236 if (aExp == 0) { 2237 if (aSig == 0) return float32_one; 2238 } 2239 2240 float_raise( float_flag_inexact STATUS_VAR); 2241 2242 /* ******************************* */ 2243 /* using float64 for approximation */ 2244 /* ******************************* */ 2245 x = float32_to_float64(a STATUS_VAR); 2246 x = float64_mul(x, float64_ln2 STATUS_VAR); 2247 2248 xn = x; 2249 r = float64_one; 2250 for (i = 0 ; i < 15 ; i++) { 2251 float64 f; 2252 2253 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); 2254 r = float64_add(r, f STATUS_VAR); 2255 2256 xn = float64_mul(xn, x STATUS_VAR); 2257 } 2258 2259 return float64_to_float32(r, status); 2260 } 2261 2262 /*---------------------------------------------------------------------------- 2263 | Returns the binary log of the single-precision floating-point value `a'. 2264 | The operation is performed according to the IEC/IEEE Standard for Binary 2265 | Floating-Point Arithmetic. 2266 *----------------------------------------------------------------------------*/ 2267 float32 float32_log2( float32 a STATUS_PARAM ) 2268 { 2269 flag aSign, zSign; 2270 int16 aExp; 2271 bits32 aSig, zSig, i; 2272 2273 a = float32_squash_input_denormal(a STATUS_VAR); 2274 aSig = extractFloat32Frac( a ); 2275 aExp = extractFloat32Exp( a ); 2276 aSign = extractFloat32Sign( a ); 2277 2278 if ( aExp == 0 ) { 2279 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2280 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2281 } 2282 if ( aSign ) { 2283 float_raise( float_flag_invalid STATUS_VAR); 2284 return float32_default_nan; 2285 } 2286 if ( aExp == 0xFF ) { 2287 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2288 return a; 2289 } 2290 2291 aExp -= 0x7F; 2292 aSig |= 0x00800000; 2293 zSign = aExp < 0; 2294 zSig = aExp << 23; 2295 2296 for (i = 1 << 22; i > 0; i >>= 1) { 2297 aSig = ( (bits64)aSig * aSig ) >> 23; 2298 if ( aSig & 0x01000000 ) { 2299 aSig >>= 1; 2300 zSig |= i; 2301 } 2302 } 2303 2304 if ( zSign ) 2305 zSig = -zSig; 2306 2307 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); 2308 } 2309 2310 /*---------------------------------------------------------------------------- 2311 | Returns 1 if the single-precision floating-point value `a' is equal to 2312 | the corresponding value `b', and 0 otherwise. The comparison is performed 2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2314 *----------------------------------------------------------------------------*/ 2315 2316 int float32_eq( float32 a, float32 b STATUS_PARAM ) 2317 { 2318 a = float32_squash_input_denormal(a STATUS_VAR); 2319 b = float32_squash_input_denormal(b STATUS_VAR); 2320 2321 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2322 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2323 ) { 2324 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2325 float_raise( float_flag_invalid STATUS_VAR); 2326 } 2327 return 0; 2328 } 2329 return ( float32_val(a) == float32_val(b) ) || 2330 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2331 2332 } 2333 2334 /*---------------------------------------------------------------------------- 2335 | Returns 1 if the single-precision floating-point value `a' is less than 2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison 2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2338 | Arithmetic. 2339 *----------------------------------------------------------------------------*/ 2340 2341 int float32_le( float32 a, float32 b STATUS_PARAM ) 2342 { 2343 flag aSign, bSign; 2344 bits32 av, bv; 2345 a = float32_squash_input_denormal(a STATUS_VAR); 2346 b = float32_squash_input_denormal(b STATUS_VAR); 2347 2348 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2349 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2350 ) { 2351 float_raise( float_flag_invalid STATUS_VAR); 2352 return 0; 2353 } 2354 aSign = extractFloat32Sign( a ); 2355 bSign = extractFloat32Sign( b ); 2356 av = float32_val(a); 2357 bv = float32_val(b); 2358 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2359 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2360 2361 } 2362 2363 /*---------------------------------------------------------------------------- 2364 | Returns 1 if the single-precision floating-point value `a' is less than 2365 | the corresponding value `b', and 0 otherwise. The comparison is performed 2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2367 *----------------------------------------------------------------------------*/ 2368 2369 int float32_lt( float32 a, float32 b STATUS_PARAM ) 2370 { 2371 flag aSign, bSign; 2372 bits32 av, bv; 2373 a = float32_squash_input_denormal(a STATUS_VAR); 2374 b = float32_squash_input_denormal(b STATUS_VAR); 2375 2376 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2377 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2378 ) { 2379 float_raise( float_flag_invalid STATUS_VAR); 2380 return 0; 2381 } 2382 aSign = extractFloat32Sign( a ); 2383 bSign = extractFloat32Sign( b ); 2384 av = float32_val(a); 2385 bv = float32_val(b); 2386 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2387 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2388 2389 } 2390 2391 /*---------------------------------------------------------------------------- 2392 | Returns 1 if the single-precision floating-point value `a' is equal to 2393 | the corresponding value `b', and 0 otherwise. The invalid exception is 2394 | raised if either operand is a NaN. Otherwise, the comparison is performed 2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2396 *----------------------------------------------------------------------------*/ 2397 2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM ) 2399 { 2400 bits32 av, bv; 2401 a = float32_squash_input_denormal(a STATUS_VAR); 2402 b = float32_squash_input_denormal(b STATUS_VAR); 2403 2404 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2405 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2406 ) { 2407 float_raise( float_flag_invalid STATUS_VAR); 2408 return 0; 2409 } 2410 av = float32_val(a); 2411 bv = float32_val(b); 2412 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2413 2414 } 2415 2416 /*---------------------------------------------------------------------------- 2417 | Returns 1 if the single-precision floating-point value `a' is less than or 2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2419 | cause an exception. Otherwise, the comparison is performed according to the 2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2421 *----------------------------------------------------------------------------*/ 2422 2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2424 { 2425 flag aSign, bSign; 2426 bits32 av, bv; 2427 a = float32_squash_input_denormal(a STATUS_VAR); 2428 b = float32_squash_input_denormal(b STATUS_VAR); 2429 2430 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2431 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2432 ) { 2433 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2434 float_raise( float_flag_invalid STATUS_VAR); 2435 } 2436 return 0; 2437 } 2438 aSign = extractFloat32Sign( a ); 2439 bSign = extractFloat32Sign( b ); 2440 av = float32_val(a); 2441 bv = float32_val(b); 2442 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); 2443 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2444 2445 } 2446 2447 /*---------------------------------------------------------------------------- 2448 | Returns 1 if the single-precision floating-point value `a' is less than 2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2451 | Standard for Binary Floating-Point Arithmetic. 2452 *----------------------------------------------------------------------------*/ 2453 2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2455 { 2456 flag aSign, bSign; 2457 bits32 av, bv; 2458 a = float32_squash_input_denormal(a STATUS_VAR); 2459 b = float32_squash_input_denormal(b STATUS_VAR); 2460 2461 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2462 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2463 ) { 2464 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2465 float_raise( float_flag_invalid STATUS_VAR); 2466 } 2467 return 0; 2468 } 2469 aSign = extractFloat32Sign( a ); 2470 bSign = extractFloat32Sign( b ); 2471 av = float32_val(a); 2472 bv = float32_val(b); 2473 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); 2474 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2475 2476 } 2477 2478 /*---------------------------------------------------------------------------- 2479 | Returns the result of converting the double-precision floating-point value 2480 | `a' to the 32-bit two's complement integer format. The conversion is 2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2482 | Arithmetic---which means in particular that the conversion is rounded 2483 | according to the current rounding mode. If `a' is a NaN, the largest 2484 | positive integer is returned. Otherwise, if the conversion overflows, the 2485 | largest integer with the same sign as `a' is returned. 2486 *----------------------------------------------------------------------------*/ 2487 2488 int32 float64_to_int32( float64 a STATUS_PARAM ) 2489 { 2490 flag aSign; 2491 int16 aExp, shiftCount; 2492 bits64 aSig; 2493 a = float64_squash_input_denormal(a STATUS_VAR); 2494 2495 aSig = extractFloat64Frac( a ); 2496 aExp = extractFloat64Exp( a ); 2497 aSign = extractFloat64Sign( a ); 2498 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2499 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2500 shiftCount = 0x42C - aExp; 2501 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2502 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2503 2504 } 2505 2506 /*---------------------------------------------------------------------------- 2507 | Returns the result of converting the double-precision floating-point value 2508 | `a' to the 32-bit two's complement integer format. The conversion is 2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2510 | Arithmetic, except that the conversion is always rounded toward zero. 2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2512 | the conversion overflows, the largest integer with the same sign as `a' is 2513 | returned. 2514 *----------------------------------------------------------------------------*/ 2515 2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2517 { 2518 flag aSign; 2519 int16 aExp, shiftCount; 2520 bits64 aSig, savedASig; 2521 int32 z; 2522 a = float64_squash_input_denormal(a STATUS_VAR); 2523 2524 aSig = extractFloat64Frac( a ); 2525 aExp = extractFloat64Exp( a ); 2526 aSign = extractFloat64Sign( a ); 2527 if ( 0x41E < aExp ) { 2528 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2529 goto invalid; 2530 } 2531 else if ( aExp < 0x3FF ) { 2532 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2533 return 0; 2534 } 2535 aSig |= LIT64( 0x0010000000000000 ); 2536 shiftCount = 0x433 - aExp; 2537 savedASig = aSig; 2538 aSig >>= shiftCount; 2539 z = aSig; 2540 if ( aSign ) z = - z; 2541 if ( ( z < 0 ) ^ aSign ) { 2542 invalid: 2543 float_raise( float_flag_invalid STATUS_VAR); 2544 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 2545 } 2546 if ( ( aSig<<shiftCount ) != savedASig ) { 2547 STATUS(float_exception_flags) |= float_flag_inexact; 2548 } 2549 return z; 2550 2551 } 2552 2553 /*---------------------------------------------------------------------------- 2554 | Returns the result of converting the double-precision floating-point value 2555 | `a' to the 16-bit two's complement integer format. The conversion is 2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2557 | Arithmetic, except that the conversion is always rounded toward zero. 2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2559 | the conversion overflows, the largest integer with the same sign as `a' is 2560 | returned. 2561 *----------------------------------------------------------------------------*/ 2562 2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM ) 2564 { 2565 flag aSign; 2566 int16 aExp, shiftCount; 2567 bits64 aSig, savedASig; 2568 int32 z; 2569 2570 aSig = extractFloat64Frac( a ); 2571 aExp = extractFloat64Exp( a ); 2572 aSign = extractFloat64Sign( a ); 2573 if ( 0x40E < aExp ) { 2574 if ( ( aExp == 0x7FF ) && aSig ) { 2575 aSign = 0; 2576 } 2577 goto invalid; 2578 } 2579 else if ( aExp < 0x3FF ) { 2580 if ( aExp || aSig ) { 2581 STATUS(float_exception_flags) |= float_flag_inexact; 2582 } 2583 return 0; 2584 } 2585 aSig |= LIT64( 0x0010000000000000 ); 2586 shiftCount = 0x433 - aExp; 2587 savedASig = aSig; 2588 aSig >>= shiftCount; 2589 z = aSig; 2590 if ( aSign ) { 2591 z = - z; 2592 } 2593 if ( ( (int16_t)z < 0 ) ^ aSign ) { 2594 invalid: 2595 float_raise( float_flag_invalid STATUS_VAR); 2596 return aSign ? (sbits32) 0xffff8000 : 0x7FFF; 2597 } 2598 if ( ( aSig<<shiftCount ) != savedASig ) { 2599 STATUS(float_exception_flags) |= float_flag_inexact; 2600 } 2601 return z; 2602 } 2603 2604 /*---------------------------------------------------------------------------- 2605 | Returns the result of converting the double-precision floating-point value 2606 | `a' to the 64-bit two's complement integer format. The conversion is 2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2608 | Arithmetic---which means in particular that the conversion is rounded 2609 | according to the current rounding mode. If `a' is a NaN, the largest 2610 | positive integer is returned. Otherwise, if the conversion overflows, the 2611 | largest integer with the same sign as `a' is returned. 2612 *----------------------------------------------------------------------------*/ 2613 2614 int64 float64_to_int64( float64 a STATUS_PARAM ) 2615 { 2616 flag aSign; 2617 int16 aExp, shiftCount; 2618 bits64 aSig, aSigExtra; 2619 a = float64_squash_input_denormal(a STATUS_VAR); 2620 2621 aSig = extractFloat64Frac( a ); 2622 aExp = extractFloat64Exp( a ); 2623 aSign = extractFloat64Sign( a ); 2624 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2625 shiftCount = 0x433 - aExp; 2626 if ( shiftCount <= 0 ) { 2627 if ( 0x43E < aExp ) { 2628 float_raise( float_flag_invalid STATUS_VAR); 2629 if ( ! aSign 2630 || ( ( aExp == 0x7FF ) 2631 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2632 ) { 2633 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2634 } 2635 return (sbits64) LIT64( 0x8000000000000000 ); 2636 } 2637 aSigExtra = 0; 2638 aSig <<= - shiftCount; 2639 } 2640 else { 2641 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2642 } 2643 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 2644 2645 } 2646 2647 /*---------------------------------------------------------------------------- 2648 | Returns the result of converting the double-precision floating-point value 2649 | `a' to the 64-bit two's complement integer format. The conversion is 2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2651 | Arithmetic, except that the conversion is always rounded toward zero. 2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2653 | the conversion overflows, the largest integer with the same sign as `a' is 2654 | returned. 2655 *----------------------------------------------------------------------------*/ 2656 2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 2658 { 2659 flag aSign; 2660 int16 aExp, shiftCount; 2661 bits64 aSig; 2662 int64 z; 2663 a = float64_squash_input_denormal(a STATUS_VAR); 2664 2665 aSig = extractFloat64Frac( a ); 2666 aExp = extractFloat64Exp( a ); 2667 aSign = extractFloat64Sign( a ); 2668 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2669 shiftCount = aExp - 0x433; 2670 if ( 0 <= shiftCount ) { 2671 if ( 0x43E <= aExp ) { 2672 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 2673 float_raise( float_flag_invalid STATUS_VAR); 2674 if ( ! aSign 2675 || ( ( aExp == 0x7FF ) 2676 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2677 ) { 2678 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2679 } 2680 } 2681 return (sbits64) LIT64( 0x8000000000000000 ); 2682 } 2683 z = aSig<<shiftCount; 2684 } 2685 else { 2686 if ( aExp < 0x3FE ) { 2687 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2688 return 0; 2689 } 2690 z = aSig>>( - shiftCount ); 2691 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 2692 STATUS(float_exception_flags) |= float_flag_inexact; 2693 } 2694 } 2695 if ( aSign ) z = - z; 2696 return z; 2697 2698 } 2699 2700 /*---------------------------------------------------------------------------- 2701 | Returns the result of converting the double-precision floating-point value 2702 | `a' to the single-precision floating-point format. The conversion is 2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2704 | Arithmetic. 2705 *----------------------------------------------------------------------------*/ 2706 2707 float32 float64_to_float32( float64 a STATUS_PARAM ) 2708 { 2709 flag aSign; 2710 int16 aExp; 2711 bits64 aSig; 2712 bits32 zSig; 2713 a = float64_squash_input_denormal(a STATUS_VAR); 2714 2715 aSig = extractFloat64Frac( a ); 2716 aExp = extractFloat64Exp( a ); 2717 aSign = extractFloat64Sign( a ); 2718 if ( aExp == 0x7FF ) { 2719 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2720 return packFloat32( aSign, 0xFF, 0 ); 2721 } 2722 shift64RightJamming( aSig, 22, &aSig ); 2723 zSig = aSig; 2724 if ( aExp || zSig ) { 2725 zSig |= 0x40000000; 2726 aExp -= 0x381; 2727 } 2728 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 2729 2730 } 2731 2732 2733 /*---------------------------------------------------------------------------- 2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2735 | half-precision floating-point value, returning the result. After being 2736 | shifted into the proper positions, the three fields are simply added 2737 | together to form the result. This means that any integer portion of `zSig' 2738 | will be added into the exponent. Since a properly normalized significand 2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2740 | than the desired result exponent whenever `zSig' is a complete, normalized 2741 | significand. 2742 *----------------------------------------------------------------------------*/ 2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig) 2744 { 2745 return make_float16( 2746 (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig); 2747 } 2748 2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 2751 2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM) 2753 { 2754 flag aSign; 2755 int16 aExp; 2756 bits32 aSig; 2757 2758 aSign = extractFloat16Sign(a); 2759 aExp = extractFloat16Exp(a); 2760 aSig = extractFloat16Frac(a); 2761 2762 if (aExp == 0x1f && ieee) { 2763 if (aSig) { 2764 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR); 2765 } 2766 return packFloat32(aSign, 0xff, aSig << 13); 2767 } 2768 if (aExp == 0) { 2769 int8 shiftCount; 2770 2771 if (aSig == 0) { 2772 return packFloat32(aSign, 0, 0); 2773 } 2774 2775 shiftCount = countLeadingZeros32( aSig ) - 21; 2776 aSig = aSig << shiftCount; 2777 aExp = -shiftCount; 2778 } 2779 return packFloat32( aSign, aExp + 0x70, aSig << 13); 2780 } 2781 2782 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM) 2783 { 2784 flag aSign; 2785 int16 aExp; 2786 bits32 aSig; 2787 bits32 mask; 2788 bits32 increment; 2789 int8 roundingMode; 2790 a = float32_squash_input_denormal(a STATUS_VAR); 2791 2792 aSig = extractFloat32Frac( a ); 2793 aExp = extractFloat32Exp( a ); 2794 aSign = extractFloat32Sign( a ); 2795 if ( aExp == 0xFF ) { 2796 if (aSig) { 2797 /* Input is a NaN */ 2798 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2799 if (!ieee) { 2800 return packFloat16(aSign, 0, 0); 2801 } 2802 return r; 2803 } 2804 /* Infinity */ 2805 if (!ieee) { 2806 float_raise(float_flag_invalid STATUS_VAR); 2807 return packFloat16(aSign, 0x1f, 0x3ff); 2808 } 2809 return packFloat16(aSign, 0x1f, 0); 2810 } 2811 if (aExp == 0 && aSig == 0) { 2812 return packFloat16(aSign, 0, 0); 2813 } 2814 /* Decimal point between bits 22 and 23. */ 2815 aSig |= 0x00800000; 2816 aExp -= 0x7f; 2817 if (aExp < -14) { 2818 mask = 0x00ffffff; 2819 if (aExp >= -24) { 2820 mask >>= 25 + aExp; 2821 } 2822 } else { 2823 mask = 0x00001fff; 2824 } 2825 if (aSig & mask) { 2826 float_raise( float_flag_underflow STATUS_VAR ); 2827 roundingMode = STATUS(float_rounding_mode); 2828 switch (roundingMode) { 2829 case float_round_nearest_even: 2830 increment = (mask + 1) >> 1; 2831 if ((aSig & mask) == increment) { 2832 increment = aSig & (increment << 1); 2833 } 2834 break; 2835 case float_round_up: 2836 increment = aSign ? 0 : mask; 2837 break; 2838 case float_round_down: 2839 increment = aSign ? mask : 0; 2840 break; 2841 default: /* round_to_zero */ 2842 increment = 0; 2843 break; 2844 } 2845 aSig += increment; 2846 if (aSig >= 0x01000000) { 2847 aSig >>= 1; 2848 aExp++; 2849 } 2850 } else if (aExp < -14 2851 && STATUS(float_detect_tininess) == float_tininess_before_rounding) { 2852 float_raise( float_flag_underflow STATUS_VAR); 2853 } 2854 2855 if (ieee) { 2856 if (aExp > 15) { 2857 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 2858 return packFloat16(aSign, 0x1f, 0); 2859 } 2860 } else { 2861 if (aExp > 16) { 2862 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR); 2863 return packFloat16(aSign, 0x1f, 0x3ff); 2864 } 2865 } 2866 if (aExp < -24) { 2867 return packFloat16(aSign, 0, 0); 2868 } 2869 if (aExp < -14) { 2870 aSig >>= -14 - aExp; 2871 aExp = -14; 2872 } 2873 return packFloat16(aSign, aExp + 14, aSig >> 13); 2874 } 2875 2876 #ifdef FLOATX80 2877 2878 /*---------------------------------------------------------------------------- 2879 | Returns the result of converting the double-precision floating-point value 2880 | `a' to the extended double-precision floating-point format. The conversion 2881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2882 | Arithmetic. 2883 *----------------------------------------------------------------------------*/ 2884 2885 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 2886 { 2887 flag aSign; 2888 int16 aExp; 2889 bits64 aSig; 2890 2891 a = float64_squash_input_denormal(a STATUS_VAR); 2892 aSig = extractFloat64Frac( a ); 2893 aExp = extractFloat64Exp( a ); 2894 aSign = extractFloat64Sign( a ); 2895 if ( aExp == 0x7FF ) { 2896 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2897 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2898 } 2899 if ( aExp == 0 ) { 2900 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2901 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2902 } 2903 return 2904 packFloatx80( 2905 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2906 2907 } 2908 2909 #endif 2910 2911 #ifdef FLOAT128 2912 2913 /*---------------------------------------------------------------------------- 2914 | Returns the result of converting the double-precision floating-point value 2915 | `a' to the quadruple-precision floating-point format. The conversion is 2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2917 | Arithmetic. 2918 *----------------------------------------------------------------------------*/ 2919 2920 float128 float64_to_float128( float64 a STATUS_PARAM ) 2921 { 2922 flag aSign; 2923 int16 aExp; 2924 bits64 aSig, zSig0, zSig1; 2925 2926 a = float64_squash_input_denormal(a STATUS_VAR); 2927 aSig = extractFloat64Frac( a ); 2928 aExp = extractFloat64Exp( a ); 2929 aSign = extractFloat64Sign( a ); 2930 if ( aExp == 0x7FF ) { 2931 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2932 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2933 } 2934 if ( aExp == 0 ) { 2935 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2936 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2937 --aExp; 2938 } 2939 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2940 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2941 2942 } 2943 2944 #endif 2945 2946 /*---------------------------------------------------------------------------- 2947 | Rounds the double-precision floating-point value `a' to an integer, and 2948 | returns the result as a double-precision floating-point value. The 2949 | operation is performed according to the IEC/IEEE Standard for Binary 2950 | Floating-Point Arithmetic. 2951 *----------------------------------------------------------------------------*/ 2952 2953 float64 float64_round_to_int( float64 a STATUS_PARAM ) 2954 { 2955 flag aSign; 2956 int16 aExp; 2957 bits64 lastBitMask, roundBitsMask; 2958 int8 roundingMode; 2959 bits64 z; 2960 a = float64_squash_input_denormal(a STATUS_VAR); 2961 2962 aExp = extractFloat64Exp( a ); 2963 if ( 0x433 <= aExp ) { 2964 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2965 return propagateFloat64NaN( a, a STATUS_VAR ); 2966 } 2967 return a; 2968 } 2969 if ( aExp < 0x3FF ) { 2970 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a; 2971 STATUS(float_exception_flags) |= float_flag_inexact; 2972 aSign = extractFloat64Sign( a ); 2973 switch ( STATUS(float_rounding_mode) ) { 2974 case float_round_nearest_even: 2975 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 2976 return packFloat64( aSign, 0x3FF, 0 ); 2977 } 2978 break; 2979 case float_round_down: 2980 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 2981 case float_round_up: 2982 return make_float64( 2983 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 2984 } 2985 return packFloat64( aSign, 0, 0 ); 2986 } 2987 lastBitMask = 1; 2988 lastBitMask <<= 0x433 - aExp; 2989 roundBitsMask = lastBitMask - 1; 2990 z = float64_val(a); 2991 roundingMode = STATUS(float_rounding_mode); 2992 if ( roundingMode == float_round_nearest_even ) { 2993 z += lastBitMask>>1; 2994 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 2995 } 2996 else if ( roundingMode != float_round_to_zero ) { 2997 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { 2998 z += roundBitsMask; 2999 } 3000 } 3001 z &= ~ roundBitsMask; 3002 if ( z != float64_val(a) ) 3003 STATUS(float_exception_flags) |= float_flag_inexact; 3004 return make_float64(z); 3005 3006 } 3007 3008 float64 float64_trunc_to_int( float64 a STATUS_PARAM) 3009 { 3010 int oldmode; 3011 float64 res; 3012 oldmode = STATUS(float_rounding_mode); 3013 STATUS(float_rounding_mode) = float_round_to_zero; 3014 res = float64_round_to_int(a STATUS_VAR); 3015 STATUS(float_rounding_mode) = oldmode; 3016 return res; 3017 } 3018 3019 /*---------------------------------------------------------------------------- 3020 | Returns the result of adding the absolute values of the double-precision 3021 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3022 | before being returned. `zSign' is ignored if the result is a NaN. 3023 | The addition is performed according to the IEC/IEEE Standard for Binary 3024 | Floating-Point Arithmetic. 3025 *----------------------------------------------------------------------------*/ 3026 3027 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3028 { 3029 int16 aExp, bExp, zExp; 3030 bits64 aSig, bSig, zSig; 3031 int16 expDiff; 3032 3033 aSig = extractFloat64Frac( a ); 3034 aExp = extractFloat64Exp( a ); 3035 bSig = extractFloat64Frac( b ); 3036 bExp = extractFloat64Exp( b ); 3037 expDiff = aExp - bExp; 3038 aSig <<= 9; 3039 bSig <<= 9; 3040 if ( 0 < expDiff ) { 3041 if ( aExp == 0x7FF ) { 3042 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3043 return a; 3044 } 3045 if ( bExp == 0 ) { 3046 --expDiff; 3047 } 3048 else { 3049 bSig |= LIT64( 0x2000000000000000 ); 3050 } 3051 shift64RightJamming( bSig, expDiff, &bSig ); 3052 zExp = aExp; 3053 } 3054 else if ( expDiff < 0 ) { 3055 if ( bExp == 0x7FF ) { 3056 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3057 return packFloat64( zSign, 0x7FF, 0 ); 3058 } 3059 if ( aExp == 0 ) { 3060 ++expDiff; 3061 } 3062 else { 3063 aSig |= LIT64( 0x2000000000000000 ); 3064 } 3065 shift64RightJamming( aSig, - expDiff, &aSig ); 3066 zExp = bExp; 3067 } 3068 else { 3069 if ( aExp == 0x7FF ) { 3070 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3071 return a; 3072 } 3073 if ( aExp == 0 ) { 3074 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); 3075 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3076 } 3077 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3078 zExp = aExp; 3079 goto roundAndPack; 3080 } 3081 aSig |= LIT64( 0x2000000000000000 ); 3082 zSig = ( aSig + bSig )<<1; 3083 --zExp; 3084 if ( (sbits64) zSig < 0 ) { 3085 zSig = aSig + bSig; 3086 ++zExp; 3087 } 3088 roundAndPack: 3089 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3090 3091 } 3092 3093 /*---------------------------------------------------------------------------- 3094 | Returns the result of subtracting the absolute values of the double- 3095 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3096 | difference is negated before being returned. `zSign' is ignored if the 3097 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3098 | Standard for Binary Floating-Point Arithmetic. 3099 *----------------------------------------------------------------------------*/ 3100 3101 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3102 { 3103 int16 aExp, bExp, zExp; 3104 bits64 aSig, bSig, zSig; 3105 int16 expDiff; 3106 3107 aSig = extractFloat64Frac( a ); 3108 aExp = extractFloat64Exp( a ); 3109 bSig = extractFloat64Frac( b ); 3110 bExp = extractFloat64Exp( b ); 3111 expDiff = aExp - bExp; 3112 aSig <<= 10; 3113 bSig <<= 10; 3114 if ( 0 < expDiff ) goto aExpBigger; 3115 if ( expDiff < 0 ) goto bExpBigger; 3116 if ( aExp == 0x7FF ) { 3117 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3118 float_raise( float_flag_invalid STATUS_VAR); 3119 return float64_default_nan; 3120 } 3121 if ( aExp == 0 ) { 3122 aExp = 1; 3123 bExp = 1; 3124 } 3125 if ( bSig < aSig ) goto aBigger; 3126 if ( aSig < bSig ) goto bBigger; 3127 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3128 bExpBigger: 3129 if ( bExp == 0x7FF ) { 3130 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3131 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3132 } 3133 if ( aExp == 0 ) { 3134 ++expDiff; 3135 } 3136 else { 3137 aSig |= LIT64( 0x4000000000000000 ); 3138 } 3139 shift64RightJamming( aSig, - expDiff, &aSig ); 3140 bSig |= LIT64( 0x4000000000000000 ); 3141 bBigger: 3142 zSig = bSig - aSig; 3143 zExp = bExp; 3144 zSign ^= 1; 3145 goto normalizeRoundAndPack; 3146 aExpBigger: 3147 if ( aExp == 0x7FF ) { 3148 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3149 return a; 3150 } 3151 if ( bExp == 0 ) { 3152 --expDiff; 3153 } 3154 else { 3155 bSig |= LIT64( 0x4000000000000000 ); 3156 } 3157 shift64RightJamming( bSig, expDiff, &bSig ); 3158 aSig |= LIT64( 0x4000000000000000 ); 3159 aBigger: 3160 zSig = aSig - bSig; 3161 zExp = aExp; 3162 normalizeRoundAndPack: 3163 --zExp; 3164 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3165 3166 } 3167 3168 /*---------------------------------------------------------------------------- 3169 | Returns the result of adding the double-precision floating-point values `a' 3170 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3171 | Binary Floating-Point Arithmetic. 3172 *----------------------------------------------------------------------------*/ 3173 3174 float64 float64_add( float64 a, float64 b STATUS_PARAM ) 3175 { 3176 flag aSign, bSign; 3177 a = float64_squash_input_denormal(a STATUS_VAR); 3178 b = float64_squash_input_denormal(b STATUS_VAR); 3179 3180 aSign = extractFloat64Sign( a ); 3181 bSign = extractFloat64Sign( b ); 3182 if ( aSign == bSign ) { 3183 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3184 } 3185 else { 3186 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3187 } 3188 3189 } 3190 3191 /*---------------------------------------------------------------------------- 3192 | Returns the result of subtracting the double-precision floating-point values 3193 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3194 | for Binary Floating-Point Arithmetic. 3195 *----------------------------------------------------------------------------*/ 3196 3197 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 3198 { 3199 flag aSign, bSign; 3200 a = float64_squash_input_denormal(a STATUS_VAR); 3201 b = float64_squash_input_denormal(b STATUS_VAR); 3202 3203 aSign = extractFloat64Sign( a ); 3204 bSign = extractFloat64Sign( b ); 3205 if ( aSign == bSign ) { 3206 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3207 } 3208 else { 3209 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3210 } 3211 3212 } 3213 3214 /*---------------------------------------------------------------------------- 3215 | Returns the result of multiplying the double-precision floating-point values 3216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3217 | for Binary Floating-Point Arithmetic. 3218 *----------------------------------------------------------------------------*/ 3219 3220 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 3221 { 3222 flag aSign, bSign, zSign; 3223 int16 aExp, bExp, zExp; 3224 bits64 aSig, bSig, zSig0, zSig1; 3225 3226 a = float64_squash_input_denormal(a STATUS_VAR); 3227 b = float64_squash_input_denormal(b STATUS_VAR); 3228 3229 aSig = extractFloat64Frac( a ); 3230 aExp = extractFloat64Exp( a ); 3231 aSign = extractFloat64Sign( a ); 3232 bSig = extractFloat64Frac( b ); 3233 bExp = extractFloat64Exp( b ); 3234 bSign = extractFloat64Sign( b ); 3235 zSign = aSign ^ bSign; 3236 if ( aExp == 0x7FF ) { 3237 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3238 return propagateFloat64NaN( a, b STATUS_VAR ); 3239 } 3240 if ( ( bExp | bSig ) == 0 ) { 3241 float_raise( float_flag_invalid STATUS_VAR); 3242 return float64_default_nan; 3243 } 3244 return packFloat64( zSign, 0x7FF, 0 ); 3245 } 3246 if ( bExp == 0x7FF ) { 3247 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3248 if ( ( aExp | aSig ) == 0 ) { 3249 float_raise( float_flag_invalid STATUS_VAR); 3250 return float64_default_nan; 3251 } 3252 return packFloat64( zSign, 0x7FF, 0 ); 3253 } 3254 if ( aExp == 0 ) { 3255 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3256 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3257 } 3258 if ( bExp == 0 ) { 3259 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 3260 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3261 } 3262 zExp = aExp + bExp - 0x3FF; 3263 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3264 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3265 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3266 zSig0 |= ( zSig1 != 0 ); 3267 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { 3268 zSig0 <<= 1; 3269 --zExp; 3270 } 3271 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 3272 3273 } 3274 3275 /*---------------------------------------------------------------------------- 3276 | Returns the result of dividing the double-precision floating-point value `a' 3277 | by the corresponding value `b'. The operation is performed according to 3278 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3279 *----------------------------------------------------------------------------*/ 3280 3281 float64 float64_div( float64 a, float64 b STATUS_PARAM ) 3282 { 3283 flag aSign, bSign, zSign; 3284 int16 aExp, bExp, zExp; 3285 bits64 aSig, bSig, zSig; 3286 bits64 rem0, rem1; 3287 bits64 term0, term1; 3288 a = float64_squash_input_denormal(a STATUS_VAR); 3289 b = float64_squash_input_denormal(b STATUS_VAR); 3290 3291 aSig = extractFloat64Frac( a ); 3292 aExp = extractFloat64Exp( a ); 3293 aSign = extractFloat64Sign( a ); 3294 bSig = extractFloat64Frac( b ); 3295 bExp = extractFloat64Exp( b ); 3296 bSign = extractFloat64Sign( b ); 3297 zSign = aSign ^ bSign; 3298 if ( aExp == 0x7FF ) { 3299 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3300 if ( bExp == 0x7FF ) { 3301 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3302 float_raise( float_flag_invalid STATUS_VAR); 3303 return float64_default_nan; 3304 } 3305 return packFloat64( zSign, 0x7FF, 0 ); 3306 } 3307 if ( bExp == 0x7FF ) { 3308 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3309 return packFloat64( zSign, 0, 0 ); 3310 } 3311 if ( bExp == 0 ) { 3312 if ( bSig == 0 ) { 3313 if ( ( aExp | aSig ) == 0 ) { 3314 float_raise( float_flag_invalid STATUS_VAR); 3315 return float64_default_nan; 3316 } 3317 float_raise( float_flag_divbyzero STATUS_VAR); 3318 return packFloat64( zSign, 0x7FF, 0 ); 3319 } 3320 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3321 } 3322 if ( aExp == 0 ) { 3323 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3324 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3325 } 3326 zExp = aExp - bExp + 0x3FD; 3327 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3328 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3329 if ( bSig <= ( aSig + aSig ) ) { 3330 aSig >>= 1; 3331 ++zExp; 3332 } 3333 zSig = estimateDiv128To64( aSig, 0, bSig ); 3334 if ( ( zSig & 0x1FF ) <= 2 ) { 3335 mul64To128( bSig, zSig, &term0, &term1 ); 3336 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3337 while ( (sbits64) rem0 < 0 ) { 3338 --zSig; 3339 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3340 } 3341 zSig |= ( rem1 != 0 ); 3342 } 3343 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3344 3345 } 3346 3347 /*---------------------------------------------------------------------------- 3348 | Returns the remainder of the double-precision floating-point value `a' 3349 | with respect to the corresponding value `b'. The operation is performed 3350 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3351 *----------------------------------------------------------------------------*/ 3352 3353 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 3354 { 3355 flag aSign, zSign; 3356 int16 aExp, bExp, expDiff; 3357 bits64 aSig, bSig; 3358 bits64 q, alternateASig; 3359 sbits64 sigMean; 3360 3361 a = float64_squash_input_denormal(a STATUS_VAR); 3362 b = float64_squash_input_denormal(b STATUS_VAR); 3363 aSig = extractFloat64Frac( a ); 3364 aExp = extractFloat64Exp( a ); 3365 aSign = extractFloat64Sign( a ); 3366 bSig = extractFloat64Frac( b ); 3367 bExp = extractFloat64Exp( b ); 3368 if ( aExp == 0x7FF ) { 3369 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3370 return propagateFloat64NaN( a, b STATUS_VAR ); 3371 } 3372 float_raise( float_flag_invalid STATUS_VAR); 3373 return float64_default_nan; 3374 } 3375 if ( bExp == 0x7FF ) { 3376 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3377 return a; 3378 } 3379 if ( bExp == 0 ) { 3380 if ( bSig == 0 ) { 3381 float_raise( float_flag_invalid STATUS_VAR); 3382 return float64_default_nan; 3383 } 3384 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3385 } 3386 if ( aExp == 0 ) { 3387 if ( aSig == 0 ) return a; 3388 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3389 } 3390 expDiff = aExp - bExp; 3391 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3392 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3393 if ( expDiff < 0 ) { 3394 if ( expDiff < -1 ) return a; 3395 aSig >>= 1; 3396 } 3397 q = ( bSig <= aSig ); 3398 if ( q ) aSig -= bSig; 3399 expDiff -= 64; 3400 while ( 0 < expDiff ) { 3401 q = estimateDiv128To64( aSig, 0, bSig ); 3402 q = ( 2 < q ) ? q - 2 : 0; 3403 aSig = - ( ( bSig>>2 ) * q ); 3404 expDiff -= 62; 3405 } 3406 expDiff += 64; 3407 if ( 0 < expDiff ) { 3408 q = estimateDiv128To64( aSig, 0, bSig ); 3409 q = ( 2 < q ) ? q - 2 : 0; 3410 q >>= 64 - expDiff; 3411 bSig >>= 2; 3412 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3413 } 3414 else { 3415 aSig >>= 2; 3416 bSig >>= 2; 3417 } 3418 do { 3419 alternateASig = aSig; 3420 ++q; 3421 aSig -= bSig; 3422 } while ( 0 <= (sbits64) aSig ); 3423 sigMean = aSig + alternateASig; 3424 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3425 aSig = alternateASig; 3426 } 3427 zSign = ( (sbits64) aSig < 0 ); 3428 if ( zSign ) aSig = - aSig; 3429 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 3430 3431 } 3432 3433 /*---------------------------------------------------------------------------- 3434 | Returns the square root of the double-precision floating-point value `a'. 3435 | The operation is performed according to the IEC/IEEE Standard for Binary 3436 | Floating-Point Arithmetic. 3437 *----------------------------------------------------------------------------*/ 3438 3439 float64 float64_sqrt( float64 a STATUS_PARAM ) 3440 { 3441 flag aSign; 3442 int16 aExp, zExp; 3443 bits64 aSig, zSig, doubleZSig; 3444 bits64 rem0, rem1, term0, term1; 3445 a = float64_squash_input_denormal(a STATUS_VAR); 3446 3447 aSig = extractFloat64Frac( a ); 3448 aExp = extractFloat64Exp( a ); 3449 aSign = extractFloat64Sign( a ); 3450 if ( aExp == 0x7FF ) { 3451 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 3452 if ( ! aSign ) return a; 3453 float_raise( float_flag_invalid STATUS_VAR); 3454 return float64_default_nan; 3455 } 3456 if ( aSign ) { 3457 if ( ( aExp | aSig ) == 0 ) return a; 3458 float_raise( float_flag_invalid STATUS_VAR); 3459 return float64_default_nan; 3460 } 3461 if ( aExp == 0 ) { 3462 if ( aSig == 0 ) return float64_zero; 3463 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3464 } 3465 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3466 aSig |= LIT64( 0x0010000000000000 ); 3467 zSig = estimateSqrt32( aExp, aSig>>21 ); 3468 aSig <<= 9 - ( aExp & 1 ); 3469 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3470 if ( ( zSig & 0x1FF ) <= 5 ) { 3471 doubleZSig = zSig<<1; 3472 mul64To128( zSig, zSig, &term0, &term1 ); 3473 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3474 while ( (sbits64) rem0 < 0 ) { 3475 --zSig; 3476 doubleZSig -= 2; 3477 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3478 } 3479 zSig |= ( ( rem0 | rem1 ) != 0 ); 3480 } 3481 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 3482 3483 } 3484 3485 /*---------------------------------------------------------------------------- 3486 | Returns the binary log of the double-precision floating-point value `a'. 3487 | The operation is performed according to the IEC/IEEE Standard for Binary 3488 | Floating-Point Arithmetic. 3489 *----------------------------------------------------------------------------*/ 3490 float64 float64_log2( float64 a STATUS_PARAM ) 3491 { 3492 flag aSign, zSign; 3493 int16 aExp; 3494 bits64 aSig, aSig0, aSig1, zSig, i; 3495 a = float64_squash_input_denormal(a STATUS_VAR); 3496 3497 aSig = extractFloat64Frac( a ); 3498 aExp = extractFloat64Exp( a ); 3499 aSign = extractFloat64Sign( a ); 3500 3501 if ( aExp == 0 ) { 3502 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 3503 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3504 } 3505 if ( aSign ) { 3506 float_raise( float_flag_invalid STATUS_VAR); 3507 return float64_default_nan; 3508 } 3509 if ( aExp == 0x7FF ) { 3510 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); 3511 return a; 3512 } 3513 3514 aExp -= 0x3FF; 3515 aSig |= LIT64( 0x0010000000000000 ); 3516 zSign = aExp < 0; 3517 zSig = (bits64)aExp << 52; 3518 for (i = 1LL << 51; i > 0; i >>= 1) { 3519 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 3520 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 3521 if ( aSig & LIT64( 0x0020000000000000 ) ) { 3522 aSig >>= 1; 3523 zSig |= i; 3524 } 3525 } 3526 3527 if ( zSign ) 3528 zSig = -zSig; 3529 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); 3530 } 3531 3532 /*---------------------------------------------------------------------------- 3533 | Returns 1 if the double-precision floating-point value `a' is equal to the 3534 | corresponding value `b', and 0 otherwise. The comparison is performed 3535 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3536 *----------------------------------------------------------------------------*/ 3537 3538 int float64_eq( float64 a, float64 b STATUS_PARAM ) 3539 { 3540 bits64 av, bv; 3541 a = float64_squash_input_denormal(a STATUS_VAR); 3542 b = float64_squash_input_denormal(b STATUS_VAR); 3543 3544 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3545 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3546 ) { 3547 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3548 float_raise( float_flag_invalid STATUS_VAR); 3549 } 3550 return 0; 3551 } 3552 av = float64_val(a); 3553 bv = float64_val(b); 3554 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3555 3556 } 3557 3558 /*---------------------------------------------------------------------------- 3559 | Returns 1 if the double-precision floating-point value `a' is less than or 3560 | equal to the corresponding value `b', and 0 otherwise. The comparison is 3561 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3562 | Arithmetic. 3563 *----------------------------------------------------------------------------*/ 3564 3565 int float64_le( float64 a, float64 b STATUS_PARAM ) 3566 { 3567 flag aSign, bSign; 3568 bits64 av, bv; 3569 a = float64_squash_input_denormal(a STATUS_VAR); 3570 b = float64_squash_input_denormal(b STATUS_VAR); 3571 3572 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3573 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3574 ) { 3575 float_raise( float_flag_invalid STATUS_VAR); 3576 return 0; 3577 } 3578 aSign = extractFloat64Sign( a ); 3579 bSign = extractFloat64Sign( b ); 3580 av = float64_val(a); 3581 bv = float64_val(b); 3582 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3583 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3584 3585 } 3586 3587 /*---------------------------------------------------------------------------- 3588 | Returns 1 if the double-precision floating-point value `a' is less than 3589 | the corresponding value `b', and 0 otherwise. The comparison is performed 3590 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3591 *----------------------------------------------------------------------------*/ 3592 3593 int float64_lt( float64 a, float64 b STATUS_PARAM ) 3594 { 3595 flag aSign, bSign; 3596 bits64 av, bv; 3597 3598 a = float64_squash_input_denormal(a STATUS_VAR); 3599 b = float64_squash_input_denormal(b STATUS_VAR); 3600 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3601 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3602 ) { 3603 float_raise( float_flag_invalid STATUS_VAR); 3604 return 0; 3605 } 3606 aSign = extractFloat64Sign( a ); 3607 bSign = extractFloat64Sign( b ); 3608 av = float64_val(a); 3609 bv = float64_val(b); 3610 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3611 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3612 3613 } 3614 3615 /*---------------------------------------------------------------------------- 3616 | Returns 1 if the double-precision floating-point value `a' is equal to the 3617 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3618 | if either operand is a NaN. Otherwise, the comparison is performed 3619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3620 *----------------------------------------------------------------------------*/ 3621 3622 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM ) 3623 { 3624 bits64 av, bv; 3625 a = float64_squash_input_denormal(a STATUS_VAR); 3626 b = float64_squash_input_denormal(b STATUS_VAR); 3627 3628 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3629 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3630 ) { 3631 float_raise( float_flag_invalid STATUS_VAR); 3632 return 0; 3633 } 3634 av = float64_val(a); 3635 bv = float64_val(b); 3636 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3637 3638 } 3639 3640 /*---------------------------------------------------------------------------- 3641 | Returns 1 if the double-precision floating-point value `a' is less than or 3642 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3643 | cause an exception. Otherwise, the comparison is performed according to the 3644 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3645 *----------------------------------------------------------------------------*/ 3646 3647 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 3648 { 3649 flag aSign, bSign; 3650 bits64 av, bv; 3651 a = float64_squash_input_denormal(a STATUS_VAR); 3652 b = float64_squash_input_denormal(b STATUS_VAR); 3653 3654 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3655 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3656 ) { 3657 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3658 float_raise( float_flag_invalid STATUS_VAR); 3659 } 3660 return 0; 3661 } 3662 aSign = extractFloat64Sign( a ); 3663 bSign = extractFloat64Sign( b ); 3664 av = float64_val(a); 3665 bv = float64_val(b); 3666 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); 3667 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3668 3669 } 3670 3671 /*---------------------------------------------------------------------------- 3672 | Returns 1 if the double-precision floating-point value `a' is less than 3673 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3674 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3675 | Standard for Binary Floating-Point Arithmetic. 3676 *----------------------------------------------------------------------------*/ 3677 3678 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 3679 { 3680 flag aSign, bSign; 3681 bits64 av, bv; 3682 a = float64_squash_input_denormal(a STATUS_VAR); 3683 b = float64_squash_input_denormal(b STATUS_VAR); 3684 3685 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3686 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3687 ) { 3688 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3689 float_raise( float_flag_invalid STATUS_VAR); 3690 } 3691 return 0; 3692 } 3693 aSign = extractFloat64Sign( a ); 3694 bSign = extractFloat64Sign( b ); 3695 av = float64_val(a); 3696 bv = float64_val(b); 3697 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); 3698 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3699 3700 } 3701 3702 #ifdef FLOATX80 3703 3704 /*---------------------------------------------------------------------------- 3705 | Returns the result of converting the extended double-precision floating- 3706 | point value `a' to the 32-bit two's complement integer format. The 3707 | conversion is performed according to the IEC/IEEE Standard for Binary 3708 | Floating-Point Arithmetic---which means in particular that the conversion 3709 | is rounded according to the current rounding mode. If `a' is a NaN, the 3710 | largest positive integer is returned. Otherwise, if the conversion 3711 | overflows, the largest integer with the same sign as `a' is returned. 3712 *----------------------------------------------------------------------------*/ 3713 3714 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 3715 { 3716 flag aSign; 3717 int32 aExp, shiftCount; 3718 bits64 aSig; 3719 3720 aSig = extractFloatx80Frac( a ); 3721 aExp = extractFloatx80Exp( a ); 3722 aSign = extractFloatx80Sign( a ); 3723 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3724 shiftCount = 0x4037 - aExp; 3725 if ( shiftCount <= 0 ) shiftCount = 1; 3726 shift64RightJamming( aSig, shiftCount, &aSig ); 3727 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 3728 3729 } 3730 3731 /*---------------------------------------------------------------------------- 3732 | Returns the result of converting the extended double-precision floating- 3733 | point value `a' to the 32-bit two's complement integer format. The 3734 | conversion is performed according to the IEC/IEEE Standard for Binary 3735 | Floating-Point Arithmetic, except that the conversion is always rounded 3736 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3737 | Otherwise, if the conversion overflows, the largest integer with the same 3738 | sign as `a' is returned. 3739 *----------------------------------------------------------------------------*/ 3740 3741 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 3742 { 3743 flag aSign; 3744 int32 aExp, shiftCount; 3745 bits64 aSig, savedASig; 3746 int32 z; 3747 3748 aSig = extractFloatx80Frac( a ); 3749 aExp = extractFloatx80Exp( a ); 3750 aSign = extractFloatx80Sign( a ); 3751 if ( 0x401E < aExp ) { 3752 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; 3753 goto invalid; 3754 } 3755 else if ( aExp < 0x3FFF ) { 3756 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3757 return 0; 3758 } 3759 shiftCount = 0x403E - aExp; 3760 savedASig = aSig; 3761 aSig >>= shiftCount; 3762 z = aSig; 3763 if ( aSign ) z = - z; 3764 if ( ( z < 0 ) ^ aSign ) { 3765 invalid: 3766 float_raise( float_flag_invalid STATUS_VAR); 3767 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 3768 } 3769 if ( ( aSig<<shiftCount ) != savedASig ) { 3770 STATUS(float_exception_flags) |= float_flag_inexact; 3771 } 3772 return z; 3773 3774 } 3775 3776 /*---------------------------------------------------------------------------- 3777 | Returns the result of converting the extended double-precision floating- 3778 | point value `a' to the 64-bit two's complement integer format. The 3779 | conversion is performed according to the IEC/IEEE Standard for Binary 3780 | Floating-Point Arithmetic---which means in particular that the conversion 3781 | is rounded according to the current rounding mode. If `a' is a NaN, 3782 | the largest positive integer is returned. Otherwise, if the conversion 3783 | overflows, the largest integer with the same sign as `a' is returned. 3784 *----------------------------------------------------------------------------*/ 3785 3786 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 3787 { 3788 flag aSign; 3789 int32 aExp, shiftCount; 3790 bits64 aSig, aSigExtra; 3791 3792 aSig = extractFloatx80Frac( a ); 3793 aExp = extractFloatx80Exp( a ); 3794 aSign = extractFloatx80Sign( a ); 3795 shiftCount = 0x403E - aExp; 3796 if ( shiftCount <= 0 ) { 3797 if ( shiftCount ) { 3798 float_raise( float_flag_invalid STATUS_VAR); 3799 if ( ! aSign 3800 || ( ( aExp == 0x7FFF ) 3801 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3802 ) { 3803 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3804 } 3805 return (sbits64) LIT64( 0x8000000000000000 ); 3806 } 3807 aSigExtra = 0; 3808 } 3809 else { 3810 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3811 } 3812 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3813 3814 } 3815 3816 /*---------------------------------------------------------------------------- 3817 | Returns the result of converting the extended double-precision floating- 3818 | point value `a' to the 64-bit two's complement integer format. The 3819 | conversion is performed according to the IEC/IEEE Standard for Binary 3820 | Floating-Point Arithmetic, except that the conversion is always rounded 3821 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3822 | Otherwise, if the conversion overflows, the largest integer with the same 3823 | sign as `a' is returned. 3824 *----------------------------------------------------------------------------*/ 3825 3826 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 3827 { 3828 flag aSign; 3829 int32 aExp, shiftCount; 3830 bits64 aSig; 3831 int64 z; 3832 3833 aSig = extractFloatx80Frac( a ); 3834 aExp = extractFloatx80Exp( a ); 3835 aSign = extractFloatx80Sign( a ); 3836 shiftCount = aExp - 0x403E; 3837 if ( 0 <= shiftCount ) { 3838 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3839 if ( ( a.high != 0xC03E ) || aSig ) { 3840 float_raise( float_flag_invalid STATUS_VAR); 3841 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3842 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3843 } 3844 } 3845 return (sbits64) LIT64( 0x8000000000000000 ); 3846 } 3847 else if ( aExp < 0x3FFF ) { 3848 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3849 return 0; 3850 } 3851 z = aSig>>( - shiftCount ); 3852 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { 3853 STATUS(float_exception_flags) |= float_flag_inexact; 3854 } 3855 if ( aSign ) z = - z; 3856 return z; 3857 3858 } 3859 3860 /*---------------------------------------------------------------------------- 3861 | Returns the result of converting the extended double-precision floating- 3862 | point value `a' to the single-precision floating-point format. The 3863 | conversion is performed according to the IEC/IEEE Standard for Binary 3864 | Floating-Point Arithmetic. 3865 *----------------------------------------------------------------------------*/ 3866 3867 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 3868 { 3869 flag aSign; 3870 int32 aExp; 3871 bits64 aSig; 3872 3873 aSig = extractFloatx80Frac( a ); 3874 aExp = extractFloatx80Exp( a ); 3875 aSign = extractFloatx80Sign( a ); 3876 if ( aExp == 0x7FFF ) { 3877 if ( (bits64) ( aSig<<1 ) ) { 3878 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3879 } 3880 return packFloat32( aSign, 0xFF, 0 ); 3881 } 3882 shift64RightJamming( aSig, 33, &aSig ); 3883 if ( aExp || aSig ) aExp -= 0x3F81; 3884 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 3885 3886 } 3887 3888 /*---------------------------------------------------------------------------- 3889 | Returns the result of converting the extended double-precision floating- 3890 | point value `a' to the double-precision floating-point format. The 3891 | conversion is performed according to the IEC/IEEE Standard for Binary 3892 | Floating-Point Arithmetic. 3893 *----------------------------------------------------------------------------*/ 3894 3895 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 3896 { 3897 flag aSign; 3898 int32 aExp; 3899 bits64 aSig, zSig; 3900 3901 aSig = extractFloatx80Frac( a ); 3902 aExp = extractFloatx80Exp( a ); 3903 aSign = extractFloatx80Sign( a ); 3904 if ( aExp == 0x7FFF ) { 3905 if ( (bits64) ( aSig<<1 ) ) { 3906 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3907 } 3908 return packFloat64( aSign, 0x7FF, 0 ); 3909 } 3910 shift64RightJamming( aSig, 1, &zSig ); 3911 if ( aExp || aSig ) aExp -= 0x3C01; 3912 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 3913 3914 } 3915 3916 #ifdef FLOAT128 3917 3918 /*---------------------------------------------------------------------------- 3919 | Returns the result of converting the extended double-precision floating- 3920 | point value `a' to the quadruple-precision floating-point format. The 3921 | conversion is performed according to the IEC/IEEE Standard for Binary 3922 | Floating-Point Arithmetic. 3923 *----------------------------------------------------------------------------*/ 3924 3925 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 3926 { 3927 flag aSign; 3928 int16 aExp; 3929 bits64 aSig, zSig0, zSig1; 3930 3931 aSig = extractFloatx80Frac( a ); 3932 aExp = extractFloatx80Exp( a ); 3933 aSign = extractFloatx80Sign( a ); 3934 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { 3935 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3936 } 3937 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 3938 return packFloat128( aSign, aExp, zSig0, zSig1 ); 3939 3940 } 3941 3942 #endif 3943 3944 /*---------------------------------------------------------------------------- 3945 | Rounds the extended double-precision floating-point value `a' to an integer, 3946 | and returns the result as an extended quadruple-precision floating-point 3947 | value. The operation is performed according to the IEC/IEEE Standard for 3948 | Binary Floating-Point Arithmetic. 3949 *----------------------------------------------------------------------------*/ 3950 3951 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 3952 { 3953 flag aSign; 3954 int32 aExp; 3955 bits64 lastBitMask, roundBitsMask; 3956 int8 roundingMode; 3957 floatx80 z; 3958 3959 aExp = extractFloatx80Exp( a ); 3960 if ( 0x403E <= aExp ) { 3961 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { 3962 return propagateFloatx80NaN( a, a STATUS_VAR ); 3963 } 3964 return a; 3965 } 3966 if ( aExp < 0x3FFF ) { 3967 if ( ( aExp == 0 ) 3968 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 3969 return a; 3970 } 3971 STATUS(float_exception_flags) |= float_flag_inexact; 3972 aSign = extractFloatx80Sign( a ); 3973 switch ( STATUS(float_rounding_mode) ) { 3974 case float_round_nearest_even: 3975 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) 3976 ) { 3977 return 3978 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3979 } 3980 break; 3981 case float_round_down: 3982 return 3983 aSign ? 3984 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 3985 : packFloatx80( 0, 0, 0 ); 3986 case float_round_up: 3987 return 3988 aSign ? packFloatx80( 1, 0, 0 ) 3989 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 3990 } 3991 return packFloatx80( aSign, 0, 0 ); 3992 } 3993 lastBitMask = 1; 3994 lastBitMask <<= 0x403E - aExp; 3995 roundBitsMask = lastBitMask - 1; 3996 z = a; 3997 roundingMode = STATUS(float_rounding_mode); 3998 if ( roundingMode == float_round_nearest_even ) { 3999 z.low += lastBitMask>>1; 4000 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4001 } 4002 else if ( roundingMode != float_round_to_zero ) { 4003 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 4004 z.low += roundBitsMask; 4005 } 4006 } 4007 z.low &= ~ roundBitsMask; 4008 if ( z.low == 0 ) { 4009 ++z.high; 4010 z.low = LIT64( 0x8000000000000000 ); 4011 } 4012 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 4013 return z; 4014 4015 } 4016 4017 /*---------------------------------------------------------------------------- 4018 | Returns the result of adding the absolute values of the extended double- 4019 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4020 | negated before being returned. `zSign' is ignored if the result is a NaN. 4021 | The addition is performed according to the IEC/IEEE Standard for Binary 4022 | Floating-Point Arithmetic. 4023 *----------------------------------------------------------------------------*/ 4024 4025 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 4026 { 4027 int32 aExp, bExp, zExp; 4028 bits64 aSig, bSig, zSig0, zSig1; 4029 int32 expDiff; 4030 4031 aSig = extractFloatx80Frac( a ); 4032 aExp = extractFloatx80Exp( a ); 4033 bSig = extractFloatx80Frac( b ); 4034 bExp = extractFloatx80Exp( b ); 4035 expDiff = aExp - bExp; 4036 if ( 0 < expDiff ) { 4037 if ( aExp == 0x7FFF ) { 4038 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4039 return a; 4040 } 4041 if ( bExp == 0 ) --expDiff; 4042 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4043 zExp = aExp; 4044 } 4045 else if ( expDiff < 0 ) { 4046 if ( bExp == 0x7FFF ) { 4047 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4048 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4049 } 4050 if ( aExp == 0 ) ++expDiff; 4051 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4052 zExp = bExp; 4053 } 4054 else { 4055 if ( aExp == 0x7FFF ) { 4056 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 4057 return propagateFloatx80NaN( a, b STATUS_VAR ); 4058 } 4059 return a; 4060 } 4061 zSig1 = 0; 4062 zSig0 = aSig + bSig; 4063 if ( aExp == 0 ) { 4064 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4065 goto roundAndPack; 4066 } 4067 zExp = aExp; 4068 goto shiftRight1; 4069 } 4070 zSig0 = aSig + bSig; 4071 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 4072 shiftRight1: 4073 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4074 zSig0 |= LIT64( 0x8000000000000000 ); 4075 ++zExp; 4076 roundAndPack: 4077 return 4078 roundAndPackFloatx80( 4079 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4080 4081 } 4082 4083 /*---------------------------------------------------------------------------- 4084 | Returns the result of subtracting the absolute values of the extended 4085 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4086 | difference is negated before being returned. `zSign' is ignored if the 4087 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4088 | Standard for Binary Floating-Point Arithmetic. 4089 *----------------------------------------------------------------------------*/ 4090 4091 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 4092 { 4093 int32 aExp, bExp, zExp; 4094 bits64 aSig, bSig, zSig0, zSig1; 4095 int32 expDiff; 4096 floatx80 z; 4097 4098 aSig = extractFloatx80Frac( a ); 4099 aExp = extractFloatx80Exp( a ); 4100 bSig = extractFloatx80Frac( b ); 4101 bExp = extractFloatx80Exp( b ); 4102 expDiff = aExp - bExp; 4103 if ( 0 < expDiff ) goto aExpBigger; 4104 if ( expDiff < 0 ) goto bExpBigger; 4105 if ( aExp == 0x7FFF ) { 4106 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { 4107 return propagateFloatx80NaN( a, b STATUS_VAR ); 4108 } 4109 float_raise( float_flag_invalid STATUS_VAR); 4110 z.low = floatx80_default_nan_low; 4111 z.high = floatx80_default_nan_high; 4112 return z; 4113 } 4114 if ( aExp == 0 ) { 4115 aExp = 1; 4116 bExp = 1; 4117 } 4118 zSig1 = 0; 4119 if ( bSig < aSig ) goto aBigger; 4120 if ( aSig < bSig ) goto bBigger; 4121 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 4122 bExpBigger: 4123 if ( bExp == 0x7FFF ) { 4124 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4125 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4126 } 4127 if ( aExp == 0 ) ++expDiff; 4128 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4129 bBigger: 4130 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4131 zExp = bExp; 4132 zSign ^= 1; 4133 goto normalizeRoundAndPack; 4134 aExpBigger: 4135 if ( aExp == 0x7FFF ) { 4136 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4137 return a; 4138 } 4139 if ( bExp == 0 ) --expDiff; 4140 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4141 aBigger: 4142 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4143 zExp = aExp; 4144 normalizeRoundAndPack: 4145 return 4146 normalizeRoundAndPackFloatx80( 4147 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4148 4149 } 4150 4151 /*---------------------------------------------------------------------------- 4152 | Returns the result of adding the extended double-precision floating-point 4153 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4154 | Standard for Binary Floating-Point Arithmetic. 4155 *----------------------------------------------------------------------------*/ 4156 4157 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 4158 { 4159 flag aSign, bSign; 4160 4161 aSign = extractFloatx80Sign( a ); 4162 bSign = extractFloatx80Sign( b ); 4163 if ( aSign == bSign ) { 4164 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4165 } 4166 else { 4167 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4168 } 4169 4170 } 4171 4172 /*---------------------------------------------------------------------------- 4173 | Returns the result of subtracting the extended double-precision floating- 4174 | point values `a' and `b'. The operation is performed according to the 4175 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4176 *----------------------------------------------------------------------------*/ 4177 4178 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 4179 { 4180 flag aSign, bSign; 4181 4182 aSign = extractFloatx80Sign( a ); 4183 bSign = extractFloatx80Sign( b ); 4184 if ( aSign == bSign ) { 4185 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4186 } 4187 else { 4188 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4189 } 4190 4191 } 4192 4193 /*---------------------------------------------------------------------------- 4194 | Returns the result of multiplying the extended double-precision floating- 4195 | point values `a' and `b'. The operation is performed according to the 4196 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4197 *----------------------------------------------------------------------------*/ 4198 4199 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 4200 { 4201 flag aSign, bSign, zSign; 4202 int32 aExp, bExp, zExp; 4203 bits64 aSig, bSig, zSig0, zSig1; 4204 floatx80 z; 4205 4206 aSig = extractFloatx80Frac( a ); 4207 aExp = extractFloatx80Exp( a ); 4208 aSign = extractFloatx80Sign( a ); 4209 bSig = extractFloatx80Frac( b ); 4210 bExp = extractFloatx80Exp( b ); 4211 bSign = extractFloatx80Sign( b ); 4212 zSign = aSign ^ bSign; 4213 if ( aExp == 0x7FFF ) { 4214 if ( (bits64) ( aSig<<1 ) 4215 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 4216 return propagateFloatx80NaN( a, b STATUS_VAR ); 4217 } 4218 if ( ( bExp | bSig ) == 0 ) goto invalid; 4219 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4220 } 4221 if ( bExp == 0x7FFF ) { 4222 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4223 if ( ( aExp | aSig ) == 0 ) { 4224 invalid: 4225 float_raise( float_flag_invalid STATUS_VAR); 4226 z.low = floatx80_default_nan_low; 4227 z.high = floatx80_default_nan_high; 4228 return z; 4229 } 4230 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4231 } 4232 if ( aExp == 0 ) { 4233 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4234 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4235 } 4236 if ( bExp == 0 ) { 4237 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4238 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4239 } 4240 zExp = aExp + bExp - 0x3FFE; 4241 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4242 if ( 0 < (sbits64) zSig0 ) { 4243 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4244 --zExp; 4245 } 4246 return 4247 roundAndPackFloatx80( 4248 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4249 4250 } 4251 4252 /*---------------------------------------------------------------------------- 4253 | Returns the result of dividing the extended double-precision floating-point 4254 | value `a' by the corresponding value `b'. The operation is performed 4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4256 *----------------------------------------------------------------------------*/ 4257 4258 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 4259 { 4260 flag aSign, bSign, zSign; 4261 int32 aExp, bExp, zExp; 4262 bits64 aSig, bSig, zSig0, zSig1; 4263 bits64 rem0, rem1, rem2, term0, term1, term2; 4264 floatx80 z; 4265 4266 aSig = extractFloatx80Frac( a ); 4267 aExp = extractFloatx80Exp( a ); 4268 aSign = extractFloatx80Sign( a ); 4269 bSig = extractFloatx80Frac( b ); 4270 bExp = extractFloatx80Exp( b ); 4271 bSign = extractFloatx80Sign( b ); 4272 zSign = aSign ^ bSign; 4273 if ( aExp == 0x7FFF ) { 4274 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4275 if ( bExp == 0x7FFF ) { 4276 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4277 goto invalid; 4278 } 4279 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4280 } 4281 if ( bExp == 0x7FFF ) { 4282 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4283 return packFloatx80( zSign, 0, 0 ); 4284 } 4285 if ( bExp == 0 ) { 4286 if ( bSig == 0 ) { 4287 if ( ( aExp | aSig ) == 0 ) { 4288 invalid: 4289 float_raise( float_flag_invalid STATUS_VAR); 4290 z.low = floatx80_default_nan_low; 4291 z.high = floatx80_default_nan_high; 4292 return z; 4293 } 4294 float_raise( float_flag_divbyzero STATUS_VAR); 4295 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4296 } 4297 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4298 } 4299 if ( aExp == 0 ) { 4300 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4301 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4302 } 4303 zExp = aExp - bExp + 0x3FFE; 4304 rem1 = 0; 4305 if ( bSig <= aSig ) { 4306 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 4307 ++zExp; 4308 } 4309 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 4310 mul64To128( bSig, zSig0, &term0, &term1 ); 4311 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 4312 while ( (sbits64) rem0 < 0 ) { 4313 --zSig0; 4314 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4315 } 4316 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 4317 if ( (bits64) ( zSig1<<1 ) <= 8 ) { 4318 mul64To128( bSig, zSig1, &term1, &term2 ); 4319 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4320 while ( (sbits64) rem1 < 0 ) { 4321 --zSig1; 4322 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 4323 } 4324 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 4325 } 4326 return 4327 roundAndPackFloatx80( 4328 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4329 4330 } 4331 4332 /*---------------------------------------------------------------------------- 4333 | Returns the remainder of the extended double-precision floating-point value 4334 | `a' with respect to the corresponding value `b'. The operation is performed 4335 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4336 *----------------------------------------------------------------------------*/ 4337 4338 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 4339 { 4340 flag aSign, zSign; 4341 int32 aExp, bExp, expDiff; 4342 bits64 aSig0, aSig1, bSig; 4343 bits64 q, term0, term1, alternateASig0, alternateASig1; 4344 floatx80 z; 4345 4346 aSig0 = extractFloatx80Frac( a ); 4347 aExp = extractFloatx80Exp( a ); 4348 aSign = extractFloatx80Sign( a ); 4349 bSig = extractFloatx80Frac( b ); 4350 bExp = extractFloatx80Exp( b ); 4351 if ( aExp == 0x7FFF ) { 4352 if ( (bits64) ( aSig0<<1 ) 4353 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { 4354 return propagateFloatx80NaN( a, b STATUS_VAR ); 4355 } 4356 goto invalid; 4357 } 4358 if ( bExp == 0x7FFF ) { 4359 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4360 return a; 4361 } 4362 if ( bExp == 0 ) { 4363 if ( bSig == 0 ) { 4364 invalid: 4365 float_raise( float_flag_invalid STATUS_VAR); 4366 z.low = floatx80_default_nan_low; 4367 z.high = floatx80_default_nan_high; 4368 return z; 4369 } 4370 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4371 } 4372 if ( aExp == 0 ) { 4373 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; 4374 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4375 } 4376 bSig |= LIT64( 0x8000000000000000 ); 4377 zSign = aSign; 4378 expDiff = aExp - bExp; 4379 aSig1 = 0; 4380 if ( expDiff < 0 ) { 4381 if ( expDiff < -1 ) return a; 4382 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4383 expDiff = 0; 4384 } 4385 q = ( bSig <= aSig0 ); 4386 if ( q ) aSig0 -= bSig; 4387 expDiff -= 64; 4388 while ( 0 < expDiff ) { 4389 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4390 q = ( 2 < q ) ? q - 2 : 0; 4391 mul64To128( bSig, q, &term0, &term1 ); 4392 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4393 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4394 expDiff -= 62; 4395 } 4396 expDiff += 64; 4397 if ( 0 < expDiff ) { 4398 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4399 q = ( 2 < q ) ? q - 2 : 0; 4400 q >>= 64 - expDiff; 4401 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4402 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4403 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4404 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4405 ++q; 4406 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4407 } 4408 } 4409 else { 4410 term1 = 0; 4411 term0 = bSig; 4412 } 4413 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4414 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4415 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4416 && ( q & 1 ) ) 4417 ) { 4418 aSig0 = alternateASig0; 4419 aSig1 = alternateASig1; 4420 zSign = ! zSign; 4421 } 4422 return 4423 normalizeRoundAndPackFloatx80( 4424 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 4425 4426 } 4427 4428 /*---------------------------------------------------------------------------- 4429 | Returns the square root of the extended double-precision floating-point 4430 | value `a'. The operation is performed according to the IEC/IEEE Standard 4431 | for Binary Floating-Point Arithmetic. 4432 *----------------------------------------------------------------------------*/ 4433 4434 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 4435 { 4436 flag aSign; 4437 int32 aExp, zExp; 4438 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4439 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4440 floatx80 z; 4441 4442 aSig0 = extractFloatx80Frac( a ); 4443 aExp = extractFloatx80Exp( a ); 4444 aSign = extractFloatx80Sign( a ); 4445 if ( aExp == 0x7FFF ) { 4446 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 4447 if ( ! aSign ) return a; 4448 goto invalid; 4449 } 4450 if ( aSign ) { 4451 if ( ( aExp | aSig0 ) == 0 ) return a; 4452 invalid: 4453 float_raise( float_flag_invalid STATUS_VAR); 4454 z.low = floatx80_default_nan_low; 4455 z.high = floatx80_default_nan_high; 4456 return z; 4457 } 4458 if ( aExp == 0 ) { 4459 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4460 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4461 } 4462 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4463 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4464 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4465 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4466 doubleZSig0 = zSig0<<1; 4467 mul64To128( zSig0, zSig0, &term0, &term1 ); 4468 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4469 while ( (sbits64) rem0 < 0 ) { 4470 --zSig0; 4471 doubleZSig0 -= 2; 4472 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4473 } 4474 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4475 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4476 if ( zSig1 == 0 ) zSig1 = 1; 4477 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4478 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4479 mul64To128( zSig1, zSig1, &term2, &term3 ); 4480 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4481 while ( (sbits64) rem1 < 0 ) { 4482 --zSig1; 4483 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4484 term3 |= 1; 4485 term2 |= doubleZSig0; 4486 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4487 } 4488 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4489 } 4490 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4491 zSig0 |= doubleZSig0; 4492 return 4493 roundAndPackFloatx80( 4494 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 4495 4496 } 4497 4498 /*---------------------------------------------------------------------------- 4499 | Returns 1 if the extended double-precision floating-point value `a' is 4500 | equal to the corresponding value `b', and 0 otherwise. The comparison is 4501 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4502 | Arithmetic. 4503 *----------------------------------------------------------------------------*/ 4504 4505 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 4506 { 4507 4508 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4509 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4510 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4511 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4512 ) { 4513 if ( floatx80_is_signaling_nan( a ) 4514 || floatx80_is_signaling_nan( b ) ) { 4515 float_raise( float_flag_invalid STATUS_VAR); 4516 } 4517 return 0; 4518 } 4519 return 4520 ( a.low == b.low ) 4521 && ( ( a.high == b.high ) 4522 || ( ( a.low == 0 ) 4523 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4524 ); 4525 4526 } 4527 4528 /*---------------------------------------------------------------------------- 4529 | Returns 1 if the extended double-precision floating-point value `a' is 4530 | less than or equal to the corresponding value `b', and 0 otherwise. The 4531 | comparison is performed according to the IEC/IEEE Standard for Binary 4532 | Floating-Point Arithmetic. 4533 *----------------------------------------------------------------------------*/ 4534 4535 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 4536 { 4537 flag aSign, bSign; 4538 4539 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4540 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4541 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4542 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4543 ) { 4544 float_raise( float_flag_invalid STATUS_VAR); 4545 return 0; 4546 } 4547 aSign = extractFloatx80Sign( a ); 4548 bSign = extractFloatx80Sign( b ); 4549 if ( aSign != bSign ) { 4550 return 4551 aSign 4552 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4553 == 0 ); 4554 } 4555 return 4556 aSign ? le128( b.high, b.low, a.high, a.low ) 4557 : le128( a.high, a.low, b.high, b.low ); 4558 4559 } 4560 4561 /*---------------------------------------------------------------------------- 4562 | Returns 1 if the extended double-precision floating-point value `a' is 4563 | less than the corresponding value `b', and 0 otherwise. The comparison 4564 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4565 | Arithmetic. 4566 *----------------------------------------------------------------------------*/ 4567 4568 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 4569 { 4570 flag aSign, bSign; 4571 4572 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4573 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4574 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4575 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4576 ) { 4577 float_raise( float_flag_invalid STATUS_VAR); 4578 return 0; 4579 } 4580 aSign = extractFloatx80Sign( a ); 4581 bSign = extractFloatx80Sign( b ); 4582 if ( aSign != bSign ) { 4583 return 4584 aSign 4585 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4586 != 0 ); 4587 } 4588 return 4589 aSign ? lt128( b.high, b.low, a.high, a.low ) 4590 : lt128( a.high, a.low, b.high, b.low ); 4591 4592 } 4593 4594 /*---------------------------------------------------------------------------- 4595 | Returns 1 if the extended double-precision floating-point value `a' is equal 4596 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4597 | raised if either operand is a NaN. Otherwise, the comparison is performed 4598 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4599 *----------------------------------------------------------------------------*/ 4600 4601 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM ) 4602 { 4603 4604 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4605 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4606 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4607 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4608 ) { 4609 float_raise( float_flag_invalid STATUS_VAR); 4610 return 0; 4611 } 4612 return 4613 ( a.low == b.low ) 4614 && ( ( a.high == b.high ) 4615 || ( ( a.low == 0 ) 4616 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) 4617 ); 4618 4619 } 4620 4621 /*---------------------------------------------------------------------------- 4622 | Returns 1 if the extended double-precision floating-point value `a' is less 4623 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4624 | do not cause an exception. Otherwise, the comparison is performed according 4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4626 *----------------------------------------------------------------------------*/ 4627 4628 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4629 { 4630 flag aSign, bSign; 4631 4632 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4633 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4634 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4635 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4636 ) { 4637 if ( floatx80_is_signaling_nan( a ) 4638 || floatx80_is_signaling_nan( b ) ) { 4639 float_raise( float_flag_invalid STATUS_VAR); 4640 } 4641 return 0; 4642 } 4643 aSign = extractFloatx80Sign( a ); 4644 bSign = extractFloatx80Sign( b ); 4645 if ( aSign != bSign ) { 4646 return 4647 aSign 4648 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4649 == 0 ); 4650 } 4651 return 4652 aSign ? le128( b.high, b.low, a.high, a.low ) 4653 : le128( a.high, a.low, b.high, b.low ); 4654 4655 } 4656 4657 /*---------------------------------------------------------------------------- 4658 | Returns 1 if the extended double-precision floating-point value `a' is less 4659 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4660 | an exception. Otherwise, the comparison is performed according to the 4661 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4662 *----------------------------------------------------------------------------*/ 4663 4664 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4665 { 4666 flag aSign, bSign; 4667 4668 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4669 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) 4670 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4671 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) 4672 ) { 4673 if ( floatx80_is_signaling_nan( a ) 4674 || floatx80_is_signaling_nan( b ) ) { 4675 float_raise( float_flag_invalid STATUS_VAR); 4676 } 4677 return 0; 4678 } 4679 aSign = extractFloatx80Sign( a ); 4680 bSign = extractFloatx80Sign( b ); 4681 if ( aSign != bSign ) { 4682 return 4683 aSign 4684 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4685 != 0 ); 4686 } 4687 return 4688 aSign ? lt128( b.high, b.low, a.high, a.low ) 4689 : lt128( a.high, a.low, b.high, b.low ); 4690 4691 } 4692 4693 #endif 4694 4695 #ifdef FLOAT128 4696 4697 /*---------------------------------------------------------------------------- 4698 | Returns the result of converting the quadruple-precision floating-point 4699 | value `a' to the 32-bit two's complement integer format. The conversion 4700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4701 | Arithmetic---which means in particular that the conversion is rounded 4702 | according to the current rounding mode. If `a' is a NaN, the largest 4703 | positive integer is returned. Otherwise, if the conversion overflows, the 4704 | largest integer with the same sign as `a' is returned. 4705 *----------------------------------------------------------------------------*/ 4706 4707 int32 float128_to_int32( float128 a STATUS_PARAM ) 4708 { 4709 flag aSign; 4710 int32 aExp, shiftCount; 4711 bits64 aSig0, aSig1; 4712 4713 aSig1 = extractFloat128Frac1( a ); 4714 aSig0 = extractFloat128Frac0( a ); 4715 aExp = extractFloat128Exp( a ); 4716 aSign = extractFloat128Sign( a ); 4717 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4718 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4719 aSig0 |= ( aSig1 != 0 ); 4720 shiftCount = 0x4028 - aExp; 4721 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4722 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 4723 4724 } 4725 4726 /*---------------------------------------------------------------------------- 4727 | Returns the result of converting the quadruple-precision floating-point 4728 | value `a' to the 32-bit two's complement integer format. The conversion 4729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4730 | Arithmetic, except that the conversion is always rounded toward zero. If 4731 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4732 | conversion overflows, the largest integer with the same sign as `a' is 4733 | returned. 4734 *----------------------------------------------------------------------------*/ 4735 4736 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 4737 { 4738 flag aSign; 4739 int32 aExp, shiftCount; 4740 bits64 aSig0, aSig1, savedASig; 4741 int32 z; 4742 4743 aSig1 = extractFloat128Frac1( a ); 4744 aSig0 = extractFloat128Frac0( a ); 4745 aExp = extractFloat128Exp( a ); 4746 aSign = extractFloat128Sign( a ); 4747 aSig0 |= ( aSig1 != 0 ); 4748 if ( 0x401E < aExp ) { 4749 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4750 goto invalid; 4751 } 4752 else if ( aExp < 0x3FFF ) { 4753 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 4754 return 0; 4755 } 4756 aSig0 |= LIT64( 0x0001000000000000 ); 4757 shiftCount = 0x402F - aExp; 4758 savedASig = aSig0; 4759 aSig0 >>= shiftCount; 4760 z = aSig0; 4761 if ( aSign ) z = - z; 4762 if ( ( z < 0 ) ^ aSign ) { 4763 invalid: 4764 float_raise( float_flag_invalid STATUS_VAR); 4765 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; 4766 } 4767 if ( ( aSig0<<shiftCount ) != savedASig ) { 4768 STATUS(float_exception_flags) |= float_flag_inexact; 4769 } 4770 return z; 4771 4772 } 4773 4774 /*---------------------------------------------------------------------------- 4775 | Returns the result of converting the quadruple-precision floating-point 4776 | value `a' to the 64-bit two's complement integer format. The conversion 4777 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4778 | Arithmetic---which means in particular that the conversion is rounded 4779 | according to the current rounding mode. If `a' is a NaN, the largest 4780 | positive integer is returned. Otherwise, if the conversion overflows, the 4781 | largest integer with the same sign as `a' is returned. 4782 *----------------------------------------------------------------------------*/ 4783 4784 int64 float128_to_int64( float128 a STATUS_PARAM ) 4785 { 4786 flag aSign; 4787 int32 aExp, shiftCount; 4788 bits64 aSig0, aSig1; 4789 4790 aSig1 = extractFloat128Frac1( a ); 4791 aSig0 = extractFloat128Frac0( a ); 4792 aExp = extractFloat128Exp( a ); 4793 aSign = extractFloat128Sign( a ); 4794 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4795 shiftCount = 0x402F - aExp; 4796 if ( shiftCount <= 0 ) { 4797 if ( 0x403E < aExp ) { 4798 float_raise( float_flag_invalid STATUS_VAR); 4799 if ( ! aSign 4800 || ( ( aExp == 0x7FFF ) 4801 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4802 ) 4803 ) { 4804 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4805 } 4806 return (sbits64) LIT64( 0x8000000000000000 ); 4807 } 4808 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4809 } 4810 else { 4811 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4812 } 4813 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 4814 4815 } 4816 4817 /*---------------------------------------------------------------------------- 4818 | Returns the result of converting the quadruple-precision floating-point 4819 | value `a' to the 64-bit two's complement integer format. The conversion 4820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4821 | Arithmetic, except that the conversion is always rounded toward zero. 4822 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4823 | the conversion overflows, the largest integer with the same sign as `a' is 4824 | returned. 4825 *----------------------------------------------------------------------------*/ 4826 4827 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 4828 { 4829 flag aSign; 4830 int32 aExp, shiftCount; 4831 bits64 aSig0, aSig1; 4832 int64 z; 4833 4834 aSig1 = extractFloat128Frac1( a ); 4835 aSig0 = extractFloat128Frac0( a ); 4836 aExp = extractFloat128Exp( a ); 4837 aSign = extractFloat128Sign( a ); 4838 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4839 shiftCount = aExp - 0x402F; 4840 if ( 0 < shiftCount ) { 4841 if ( 0x403E <= aExp ) { 4842 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4843 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4844 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4845 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 4846 } 4847 else { 4848 float_raise( float_flag_invalid STATUS_VAR); 4849 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4850 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4851 } 4852 } 4853 return (sbits64) LIT64( 0x8000000000000000 ); 4854 } 4855 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4856 if ( (bits64) ( aSig1<<shiftCount ) ) { 4857 STATUS(float_exception_flags) |= float_flag_inexact; 4858 } 4859 } 4860 else { 4861 if ( aExp < 0x3FFF ) { 4862 if ( aExp | aSig0 | aSig1 ) { 4863 STATUS(float_exception_flags) |= float_flag_inexact; 4864 } 4865 return 0; 4866 } 4867 z = aSig0>>( - shiftCount ); 4868 if ( aSig1 4869 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4870 STATUS(float_exception_flags) |= float_flag_inexact; 4871 } 4872 } 4873 if ( aSign ) z = - z; 4874 return z; 4875 4876 } 4877 4878 /*---------------------------------------------------------------------------- 4879 | Returns the result of converting the quadruple-precision floating-point 4880 | value `a' to the single-precision floating-point format. The conversion 4881 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4882 | Arithmetic. 4883 *----------------------------------------------------------------------------*/ 4884 4885 float32 float128_to_float32( float128 a STATUS_PARAM ) 4886 { 4887 flag aSign; 4888 int32 aExp; 4889 bits64 aSig0, aSig1; 4890 bits32 zSig; 4891 4892 aSig1 = extractFloat128Frac1( a ); 4893 aSig0 = extractFloat128Frac0( a ); 4894 aExp = extractFloat128Exp( a ); 4895 aSign = extractFloat128Sign( a ); 4896 if ( aExp == 0x7FFF ) { 4897 if ( aSig0 | aSig1 ) { 4898 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4899 } 4900 return packFloat32( aSign, 0xFF, 0 ); 4901 } 4902 aSig0 |= ( aSig1 != 0 ); 4903 shift64RightJamming( aSig0, 18, &aSig0 ); 4904 zSig = aSig0; 4905 if ( aExp || zSig ) { 4906 zSig |= 0x40000000; 4907 aExp -= 0x3F81; 4908 } 4909 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 4910 4911 } 4912 4913 /*---------------------------------------------------------------------------- 4914 | Returns the result of converting the quadruple-precision floating-point 4915 | value `a' to the double-precision floating-point format. The conversion 4916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4917 | Arithmetic. 4918 *----------------------------------------------------------------------------*/ 4919 4920 float64 float128_to_float64( float128 a STATUS_PARAM ) 4921 { 4922 flag aSign; 4923 int32 aExp; 4924 bits64 aSig0, aSig1; 4925 4926 aSig1 = extractFloat128Frac1( a ); 4927 aSig0 = extractFloat128Frac0( a ); 4928 aExp = extractFloat128Exp( a ); 4929 aSign = extractFloat128Sign( a ); 4930 if ( aExp == 0x7FFF ) { 4931 if ( aSig0 | aSig1 ) { 4932 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4933 } 4934 return packFloat64( aSign, 0x7FF, 0 ); 4935 } 4936 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 4937 aSig0 |= ( aSig1 != 0 ); 4938 if ( aExp || aSig0 ) { 4939 aSig0 |= LIT64( 0x4000000000000000 ); 4940 aExp -= 0x3C01; 4941 } 4942 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 4943 4944 } 4945 4946 #ifdef FLOATX80 4947 4948 /*---------------------------------------------------------------------------- 4949 | Returns the result of converting the quadruple-precision floating-point 4950 | value `a' to the extended double-precision floating-point format. The 4951 | conversion is performed according to the IEC/IEEE Standard for Binary 4952 | Floating-Point Arithmetic. 4953 *----------------------------------------------------------------------------*/ 4954 4955 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 4956 { 4957 flag aSign; 4958 int32 aExp; 4959 bits64 aSig0, aSig1; 4960 4961 aSig1 = extractFloat128Frac1( a ); 4962 aSig0 = extractFloat128Frac0( a ); 4963 aExp = extractFloat128Exp( a ); 4964 aSign = extractFloat128Sign( a ); 4965 if ( aExp == 0x7FFF ) { 4966 if ( aSig0 | aSig1 ) { 4967 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4968 } 4969 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4970 } 4971 if ( aExp == 0 ) { 4972 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 4973 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 4974 } 4975 else { 4976 aSig0 |= LIT64( 0x0001000000000000 ); 4977 } 4978 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 4979 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 4980 4981 } 4982 4983 #endif 4984 4985 /*---------------------------------------------------------------------------- 4986 | Rounds the quadruple-precision floating-point value `a' to an integer, and 4987 | returns the result as a quadruple-precision floating-point value. The 4988 | operation is performed according to the IEC/IEEE Standard for Binary 4989 | Floating-Point Arithmetic. 4990 *----------------------------------------------------------------------------*/ 4991 4992 float128 float128_round_to_int( float128 a STATUS_PARAM ) 4993 { 4994 flag aSign; 4995 int32 aExp; 4996 bits64 lastBitMask, roundBitsMask; 4997 int8 roundingMode; 4998 float128 z; 4999 5000 aExp = extractFloat128Exp( a ); 5001 if ( 0x402F <= aExp ) { 5002 if ( 0x406F <= aExp ) { 5003 if ( ( aExp == 0x7FFF ) 5004 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5005 ) { 5006 return propagateFloat128NaN( a, a STATUS_VAR ); 5007 } 5008 return a; 5009 } 5010 lastBitMask = 1; 5011 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5012 roundBitsMask = lastBitMask - 1; 5013 z = a; 5014 roundingMode = STATUS(float_rounding_mode); 5015 if ( roundingMode == float_round_nearest_even ) { 5016 if ( lastBitMask ) { 5017 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5018 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5019 } 5020 else { 5021 if ( (sbits64) z.low < 0 ) { 5022 ++z.high; 5023 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; 5024 } 5025 } 5026 } 5027 else if ( roundingMode != float_round_to_zero ) { 5028 if ( extractFloat128Sign( z ) 5029 ^ ( roundingMode == float_round_up ) ) { 5030 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 5031 } 5032 } 5033 z.low &= ~ roundBitsMask; 5034 } 5035 else { 5036 if ( aExp < 0x3FFF ) { 5037 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 5038 STATUS(float_exception_flags) |= float_flag_inexact; 5039 aSign = extractFloat128Sign( a ); 5040 switch ( STATUS(float_rounding_mode) ) { 5041 case float_round_nearest_even: 5042 if ( ( aExp == 0x3FFE ) 5043 && ( extractFloat128Frac0( a ) 5044 | extractFloat128Frac1( a ) ) 5045 ) { 5046 return packFloat128( aSign, 0x3FFF, 0, 0 ); 5047 } 5048 break; 5049 case float_round_down: 5050 return 5051 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 5052 : packFloat128( 0, 0, 0, 0 ); 5053 case float_round_up: 5054 return 5055 aSign ? packFloat128( 1, 0, 0, 0 ) 5056 : packFloat128( 0, 0x3FFF, 0, 0 ); 5057 } 5058 return packFloat128( aSign, 0, 0, 0 ); 5059 } 5060 lastBitMask = 1; 5061 lastBitMask <<= 0x402F - aExp; 5062 roundBitsMask = lastBitMask - 1; 5063 z.low = 0; 5064 z.high = a.high; 5065 roundingMode = STATUS(float_rounding_mode); 5066 if ( roundingMode == float_round_nearest_even ) { 5067 z.high += lastBitMask>>1; 5068 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 5069 z.high &= ~ lastBitMask; 5070 } 5071 } 5072 else if ( roundingMode != float_round_to_zero ) { 5073 if ( extractFloat128Sign( z ) 5074 ^ ( roundingMode == float_round_up ) ) { 5075 z.high |= ( a.low != 0 ); 5076 z.high += roundBitsMask; 5077 } 5078 } 5079 z.high &= ~ roundBitsMask; 5080 } 5081 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 5082 STATUS(float_exception_flags) |= float_flag_inexact; 5083 } 5084 return z; 5085 5086 } 5087 5088 /*---------------------------------------------------------------------------- 5089 | Returns the result of adding the absolute values of the quadruple-precision 5090 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 5091 | before being returned. `zSign' is ignored if the result is a NaN. 5092 | The addition is performed according to the IEC/IEEE Standard for Binary 5093 | Floating-Point Arithmetic. 5094 *----------------------------------------------------------------------------*/ 5095 5096 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5097 { 5098 int32 aExp, bExp, zExp; 5099 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5100 int32 expDiff; 5101 5102 aSig1 = extractFloat128Frac1( a ); 5103 aSig0 = extractFloat128Frac0( a ); 5104 aExp = extractFloat128Exp( a ); 5105 bSig1 = extractFloat128Frac1( b ); 5106 bSig0 = extractFloat128Frac0( b ); 5107 bExp = extractFloat128Exp( b ); 5108 expDiff = aExp - bExp; 5109 if ( 0 < expDiff ) { 5110 if ( aExp == 0x7FFF ) { 5111 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5112 return a; 5113 } 5114 if ( bExp == 0 ) { 5115 --expDiff; 5116 } 5117 else { 5118 bSig0 |= LIT64( 0x0001000000000000 ); 5119 } 5120 shift128ExtraRightJamming( 5121 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 5122 zExp = aExp; 5123 } 5124 else if ( expDiff < 0 ) { 5125 if ( bExp == 0x7FFF ) { 5126 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5127 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5128 } 5129 if ( aExp == 0 ) { 5130 ++expDiff; 5131 } 5132 else { 5133 aSig0 |= LIT64( 0x0001000000000000 ); 5134 } 5135 shift128ExtraRightJamming( 5136 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 5137 zExp = bExp; 5138 } 5139 else { 5140 if ( aExp == 0x7FFF ) { 5141 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5142 return propagateFloat128NaN( a, b STATUS_VAR ); 5143 } 5144 return a; 5145 } 5146 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5147 if ( aExp == 0 ) { 5148 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); 5149 return packFloat128( zSign, 0, zSig0, zSig1 ); 5150 } 5151 zSig2 = 0; 5152 zSig0 |= LIT64( 0x0002000000000000 ); 5153 zExp = aExp; 5154 goto shiftRight1; 5155 } 5156 aSig0 |= LIT64( 0x0001000000000000 ); 5157 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5158 --zExp; 5159 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 5160 ++zExp; 5161 shiftRight1: 5162 shift128ExtraRightJamming( 5163 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5164 roundAndPack: 5165 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5166 5167 } 5168 5169 /*---------------------------------------------------------------------------- 5170 | Returns the result of subtracting the absolute values of the quadruple- 5171 | precision floating-point values `a' and `b'. If `zSign' is 1, the 5172 | difference is negated before being returned. `zSign' is ignored if the 5173 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5174 | Standard for Binary Floating-Point Arithmetic. 5175 *----------------------------------------------------------------------------*/ 5176 5177 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5178 { 5179 int32 aExp, bExp, zExp; 5180 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 5181 int32 expDiff; 5182 float128 z; 5183 5184 aSig1 = extractFloat128Frac1( a ); 5185 aSig0 = extractFloat128Frac0( a ); 5186 aExp = extractFloat128Exp( a ); 5187 bSig1 = extractFloat128Frac1( b ); 5188 bSig0 = extractFloat128Frac0( b ); 5189 bExp = extractFloat128Exp( b ); 5190 expDiff = aExp - bExp; 5191 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5192 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 5193 if ( 0 < expDiff ) goto aExpBigger; 5194 if ( expDiff < 0 ) goto bExpBigger; 5195 if ( aExp == 0x7FFF ) { 5196 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5197 return propagateFloat128NaN( a, b STATUS_VAR ); 5198 } 5199 float_raise( float_flag_invalid STATUS_VAR); 5200 z.low = float128_default_nan_low; 5201 z.high = float128_default_nan_high; 5202 return z; 5203 } 5204 if ( aExp == 0 ) { 5205 aExp = 1; 5206 bExp = 1; 5207 } 5208 if ( bSig0 < aSig0 ) goto aBigger; 5209 if ( aSig0 < bSig0 ) goto bBigger; 5210 if ( bSig1 < aSig1 ) goto aBigger; 5211 if ( aSig1 < bSig1 ) goto bBigger; 5212 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 5213 bExpBigger: 5214 if ( bExp == 0x7FFF ) { 5215 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5216 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 5217 } 5218 if ( aExp == 0 ) { 5219 ++expDiff; 5220 } 5221 else { 5222 aSig0 |= LIT64( 0x4000000000000000 ); 5223 } 5224 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5225 bSig0 |= LIT64( 0x4000000000000000 ); 5226 bBigger: 5227 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5228 zExp = bExp; 5229 zSign ^= 1; 5230 goto normalizeRoundAndPack; 5231 aExpBigger: 5232 if ( aExp == 0x7FFF ) { 5233 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5234 return a; 5235 } 5236 if ( bExp == 0 ) { 5237 --expDiff; 5238 } 5239 else { 5240 bSig0 |= LIT64( 0x4000000000000000 ); 5241 } 5242 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 5243 aSig0 |= LIT64( 0x4000000000000000 ); 5244 aBigger: 5245 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5246 zExp = aExp; 5247 normalizeRoundAndPack: 5248 --zExp; 5249 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 5250 5251 } 5252 5253 /*---------------------------------------------------------------------------- 5254 | Returns the result of adding the quadruple-precision floating-point values 5255 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 5256 | for Binary Floating-Point Arithmetic. 5257 *----------------------------------------------------------------------------*/ 5258 5259 float128 float128_add( float128 a, float128 b STATUS_PARAM ) 5260 { 5261 flag aSign, bSign; 5262 5263 aSign = extractFloat128Sign( a ); 5264 bSign = extractFloat128Sign( b ); 5265 if ( aSign == bSign ) { 5266 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5267 } 5268 else { 5269 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5270 } 5271 5272 } 5273 5274 /*---------------------------------------------------------------------------- 5275 | Returns the result of subtracting the quadruple-precision floating-point 5276 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5277 | Standard for Binary Floating-Point Arithmetic. 5278 *----------------------------------------------------------------------------*/ 5279 5280 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 5281 { 5282 flag aSign, bSign; 5283 5284 aSign = extractFloat128Sign( a ); 5285 bSign = extractFloat128Sign( b ); 5286 if ( aSign == bSign ) { 5287 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5288 } 5289 else { 5290 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5291 } 5292 5293 } 5294 5295 /*---------------------------------------------------------------------------- 5296 | Returns the result of multiplying the quadruple-precision floating-point 5297 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5298 | Standard for Binary Floating-Point Arithmetic. 5299 *----------------------------------------------------------------------------*/ 5300 5301 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 5302 { 5303 flag aSign, bSign, zSign; 5304 int32 aExp, bExp, zExp; 5305 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 5306 float128 z; 5307 5308 aSig1 = extractFloat128Frac1( a ); 5309 aSig0 = extractFloat128Frac0( a ); 5310 aExp = extractFloat128Exp( a ); 5311 aSign = extractFloat128Sign( a ); 5312 bSig1 = extractFloat128Frac1( b ); 5313 bSig0 = extractFloat128Frac0( b ); 5314 bExp = extractFloat128Exp( b ); 5315 bSign = extractFloat128Sign( b ); 5316 zSign = aSign ^ bSign; 5317 if ( aExp == 0x7FFF ) { 5318 if ( ( aSig0 | aSig1 ) 5319 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5320 return propagateFloat128NaN( a, b STATUS_VAR ); 5321 } 5322 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 5323 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5324 } 5325 if ( bExp == 0x7FFF ) { 5326 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5327 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5328 invalid: 5329 float_raise( float_flag_invalid STATUS_VAR); 5330 z.low = float128_default_nan_low; 5331 z.high = float128_default_nan_high; 5332 return z; 5333 } 5334 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5335 } 5336 if ( aExp == 0 ) { 5337 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5338 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5339 } 5340 if ( bExp == 0 ) { 5341 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5342 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5343 } 5344 zExp = aExp + bExp - 0x4000; 5345 aSig0 |= LIT64( 0x0001000000000000 ); 5346 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5347 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5348 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5349 zSig2 |= ( zSig3 != 0 ); 5350 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5351 shift128ExtraRightJamming( 5352 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5353 ++zExp; 5354 } 5355 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5356 5357 } 5358 5359 /*---------------------------------------------------------------------------- 5360 | Returns the result of dividing the quadruple-precision floating-point value 5361 | `a' by the corresponding value `b'. The operation is performed according to 5362 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5363 *----------------------------------------------------------------------------*/ 5364 5365 float128 float128_div( float128 a, float128 b STATUS_PARAM ) 5366 { 5367 flag aSign, bSign, zSign; 5368 int32 aExp, bExp, zExp; 5369 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5370 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5371 float128 z; 5372 5373 aSig1 = extractFloat128Frac1( a ); 5374 aSig0 = extractFloat128Frac0( a ); 5375 aExp = extractFloat128Exp( a ); 5376 aSign = extractFloat128Sign( a ); 5377 bSig1 = extractFloat128Frac1( b ); 5378 bSig0 = extractFloat128Frac0( b ); 5379 bExp = extractFloat128Exp( b ); 5380 bSign = extractFloat128Sign( b ); 5381 zSign = aSign ^ bSign; 5382 if ( aExp == 0x7FFF ) { 5383 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5384 if ( bExp == 0x7FFF ) { 5385 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5386 goto invalid; 5387 } 5388 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5389 } 5390 if ( bExp == 0x7FFF ) { 5391 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5392 return packFloat128( zSign, 0, 0, 0 ); 5393 } 5394 if ( bExp == 0 ) { 5395 if ( ( bSig0 | bSig1 ) == 0 ) { 5396 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5397 invalid: 5398 float_raise( float_flag_invalid STATUS_VAR); 5399 z.low = float128_default_nan_low; 5400 z.high = float128_default_nan_high; 5401 return z; 5402 } 5403 float_raise( float_flag_divbyzero STATUS_VAR); 5404 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5405 } 5406 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5407 } 5408 if ( aExp == 0 ) { 5409 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5410 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5411 } 5412 zExp = aExp - bExp + 0x3FFD; 5413 shortShift128Left( 5414 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5415 shortShift128Left( 5416 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5417 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5418 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5419 ++zExp; 5420 } 5421 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5422 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5423 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5424 while ( (sbits64) rem0 < 0 ) { 5425 --zSig0; 5426 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5427 } 5428 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5429 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5430 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5431 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5432 while ( (sbits64) rem1 < 0 ) { 5433 --zSig1; 5434 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5435 } 5436 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5437 } 5438 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5439 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5440 5441 } 5442 5443 /*---------------------------------------------------------------------------- 5444 | Returns the remainder of the quadruple-precision floating-point value `a' 5445 | with respect to the corresponding value `b'. The operation is performed 5446 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5447 *----------------------------------------------------------------------------*/ 5448 5449 float128 float128_rem( float128 a, float128 b STATUS_PARAM ) 5450 { 5451 flag aSign, zSign; 5452 int32 aExp, bExp, expDiff; 5453 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5454 bits64 allZero, alternateASig0, alternateASig1, sigMean1; 5455 sbits64 sigMean0; 5456 float128 z; 5457 5458 aSig1 = extractFloat128Frac1( a ); 5459 aSig0 = extractFloat128Frac0( a ); 5460 aExp = extractFloat128Exp( a ); 5461 aSign = extractFloat128Sign( a ); 5462 bSig1 = extractFloat128Frac1( b ); 5463 bSig0 = extractFloat128Frac0( b ); 5464 bExp = extractFloat128Exp( b ); 5465 if ( aExp == 0x7FFF ) { 5466 if ( ( aSig0 | aSig1 ) 5467 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5468 return propagateFloat128NaN( a, b STATUS_VAR ); 5469 } 5470 goto invalid; 5471 } 5472 if ( bExp == 0x7FFF ) { 5473 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5474 return a; 5475 } 5476 if ( bExp == 0 ) { 5477 if ( ( bSig0 | bSig1 ) == 0 ) { 5478 invalid: 5479 float_raise( float_flag_invalid STATUS_VAR); 5480 z.low = float128_default_nan_low; 5481 z.high = float128_default_nan_high; 5482 return z; 5483 } 5484 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5485 } 5486 if ( aExp == 0 ) { 5487 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5488 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5489 } 5490 expDiff = aExp - bExp; 5491 if ( expDiff < -1 ) return a; 5492 shortShift128Left( 5493 aSig0 | LIT64( 0x0001000000000000 ), 5494 aSig1, 5495 15 - ( expDiff < 0 ), 5496 &aSig0, 5497 &aSig1 5498 ); 5499 shortShift128Left( 5500 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5501 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5502 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5503 expDiff -= 64; 5504 while ( 0 < expDiff ) { 5505 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5506 q = ( 4 < q ) ? q - 4 : 0; 5507 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5508 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5509 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5510 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5511 expDiff -= 61; 5512 } 5513 if ( -64 < expDiff ) { 5514 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5515 q = ( 4 < q ) ? q - 4 : 0; 5516 q >>= - expDiff; 5517 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5518 expDiff += 52; 5519 if ( expDiff < 0 ) { 5520 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5521 } 5522 else { 5523 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5524 } 5525 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5526 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5527 } 5528 else { 5529 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5530 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5531 } 5532 do { 5533 alternateASig0 = aSig0; 5534 alternateASig1 = aSig1; 5535 ++q; 5536 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5537 } while ( 0 <= (sbits64) aSig0 ); 5538 add128( 5539 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); 5540 if ( ( sigMean0 < 0 ) 5541 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5542 aSig0 = alternateASig0; 5543 aSig1 = alternateASig1; 5544 } 5545 zSign = ( (sbits64) aSig0 < 0 ); 5546 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5547 return 5548 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); 5549 5550 } 5551 5552 /*---------------------------------------------------------------------------- 5553 | Returns the square root of the quadruple-precision floating-point value `a'. 5554 | The operation is performed according to the IEC/IEEE Standard for Binary 5555 | Floating-Point Arithmetic. 5556 *----------------------------------------------------------------------------*/ 5557 5558 float128 float128_sqrt( float128 a STATUS_PARAM ) 5559 { 5560 flag aSign; 5561 int32 aExp, zExp; 5562 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5563 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5564 float128 z; 5565 5566 aSig1 = extractFloat128Frac1( a ); 5567 aSig0 = extractFloat128Frac0( a ); 5568 aExp = extractFloat128Exp( a ); 5569 aSign = extractFloat128Sign( a ); 5570 if ( aExp == 0x7FFF ) { 5571 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); 5572 if ( ! aSign ) return a; 5573 goto invalid; 5574 } 5575 if ( aSign ) { 5576 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5577 invalid: 5578 float_raise( float_flag_invalid STATUS_VAR); 5579 z.low = float128_default_nan_low; 5580 z.high = float128_default_nan_high; 5581 return z; 5582 } 5583 if ( aExp == 0 ) { 5584 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5585 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5586 } 5587 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5588 aSig0 |= LIT64( 0x0001000000000000 ); 5589 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5590 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5591 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5592 doubleZSig0 = zSig0<<1; 5593 mul64To128( zSig0, zSig0, &term0, &term1 ); 5594 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5595 while ( (sbits64) rem0 < 0 ) { 5596 --zSig0; 5597 doubleZSig0 -= 2; 5598 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5599 } 5600 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5601 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5602 if ( zSig1 == 0 ) zSig1 = 1; 5603 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5604 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5605 mul64To128( zSig1, zSig1, &term2, &term3 ); 5606 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5607 while ( (sbits64) rem1 < 0 ) { 5608 --zSig1; 5609 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5610 term3 |= 1; 5611 term2 |= doubleZSig0; 5612 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5613 } 5614 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5615 } 5616 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5617 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5618 5619 } 5620 5621 /*---------------------------------------------------------------------------- 5622 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5623 | the corresponding value `b', and 0 otherwise. The comparison is performed 5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5625 *----------------------------------------------------------------------------*/ 5626 5627 int float128_eq( float128 a, float128 b STATUS_PARAM ) 5628 { 5629 5630 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5631 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5632 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5633 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5634 ) { 5635 if ( float128_is_signaling_nan( a ) 5636 || float128_is_signaling_nan( b ) ) { 5637 float_raise( float_flag_invalid STATUS_VAR); 5638 } 5639 return 0; 5640 } 5641 return 5642 ( a.low == b.low ) 5643 && ( ( a.high == b.high ) 5644 || ( ( a.low == 0 ) 5645 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5646 ); 5647 5648 } 5649 5650 /*---------------------------------------------------------------------------- 5651 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5652 | or equal to the corresponding value `b', and 0 otherwise. The comparison 5653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5654 | Arithmetic. 5655 *----------------------------------------------------------------------------*/ 5656 5657 int float128_le( float128 a, float128 b STATUS_PARAM ) 5658 { 5659 flag aSign, bSign; 5660 5661 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5662 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5663 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5664 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5665 ) { 5666 float_raise( float_flag_invalid STATUS_VAR); 5667 return 0; 5668 } 5669 aSign = extractFloat128Sign( a ); 5670 bSign = extractFloat128Sign( b ); 5671 if ( aSign != bSign ) { 5672 return 5673 aSign 5674 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5675 == 0 ); 5676 } 5677 return 5678 aSign ? le128( b.high, b.low, a.high, a.low ) 5679 : le128( a.high, a.low, b.high, b.low ); 5680 5681 } 5682 5683 /*---------------------------------------------------------------------------- 5684 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5685 | the corresponding value `b', and 0 otherwise. The comparison is performed 5686 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5687 *----------------------------------------------------------------------------*/ 5688 5689 int float128_lt( float128 a, float128 b STATUS_PARAM ) 5690 { 5691 flag aSign, bSign; 5692 5693 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5694 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5695 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5696 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5697 ) { 5698 float_raise( float_flag_invalid STATUS_VAR); 5699 return 0; 5700 } 5701 aSign = extractFloat128Sign( a ); 5702 bSign = extractFloat128Sign( b ); 5703 if ( aSign != bSign ) { 5704 return 5705 aSign 5706 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5707 != 0 ); 5708 } 5709 return 5710 aSign ? lt128( b.high, b.low, a.high, a.low ) 5711 : lt128( a.high, a.low, b.high, b.low ); 5712 5713 } 5714 5715 /*---------------------------------------------------------------------------- 5716 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5717 | the corresponding value `b', and 0 otherwise. The invalid exception is 5718 | raised if either operand is a NaN. Otherwise, the comparison is performed 5719 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5720 *----------------------------------------------------------------------------*/ 5721 5722 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM ) 5723 { 5724 5725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5727 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5729 ) { 5730 float_raise( float_flag_invalid STATUS_VAR); 5731 return 0; 5732 } 5733 return 5734 ( a.low == b.low ) 5735 && ( ( a.high == b.high ) 5736 || ( ( a.low == 0 ) 5737 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) 5738 ); 5739 5740 } 5741 5742 /*---------------------------------------------------------------------------- 5743 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5744 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5745 | cause an exception. Otherwise, the comparison is performed according to the 5746 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5747 *----------------------------------------------------------------------------*/ 5748 5749 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 5750 { 5751 flag aSign, bSign; 5752 5753 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5754 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5755 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5756 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5757 ) { 5758 if ( float128_is_signaling_nan( a ) 5759 || float128_is_signaling_nan( b ) ) { 5760 float_raise( float_flag_invalid STATUS_VAR); 5761 } 5762 return 0; 5763 } 5764 aSign = extractFloat128Sign( a ); 5765 bSign = extractFloat128Sign( b ); 5766 if ( aSign != bSign ) { 5767 return 5768 aSign 5769 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5770 == 0 ); 5771 } 5772 return 5773 aSign ? le128( b.high, b.low, a.high, a.low ) 5774 : le128( a.high, a.low, b.high, b.low ); 5775 5776 } 5777 5778 /*---------------------------------------------------------------------------- 5779 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5780 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5781 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5782 | Standard for Binary Floating-Point Arithmetic. 5783 *----------------------------------------------------------------------------*/ 5784 5785 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 5786 { 5787 flag aSign, bSign; 5788 5789 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5790 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5791 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5792 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5793 ) { 5794 if ( float128_is_signaling_nan( a ) 5795 || float128_is_signaling_nan( b ) ) { 5796 float_raise( float_flag_invalid STATUS_VAR); 5797 } 5798 return 0; 5799 } 5800 aSign = extractFloat128Sign( a ); 5801 bSign = extractFloat128Sign( b ); 5802 if ( aSign != bSign ) { 5803 return 5804 aSign 5805 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5806 != 0 ); 5807 } 5808 return 5809 aSign ? lt128( b.high, b.low, a.high, a.low ) 5810 : lt128( a.high, a.low, b.high, b.low ); 5811 5812 } 5813 5814 #endif 5815 5816 /* misc functions */ 5817 float32 uint32_to_float32( unsigned int a STATUS_PARAM ) 5818 { 5819 return int64_to_float32(a STATUS_VAR); 5820 } 5821 5822 float64 uint32_to_float64( unsigned int a STATUS_PARAM ) 5823 { 5824 return int64_to_float64(a STATUS_VAR); 5825 } 5826 5827 unsigned int float32_to_uint32( float32 a STATUS_PARAM ) 5828 { 5829 int64_t v; 5830 unsigned int res; 5831 5832 v = float32_to_int64(a STATUS_VAR); 5833 if (v < 0) { 5834 res = 0; 5835 float_raise( float_flag_invalid STATUS_VAR); 5836 } else if (v > 0xffffffff) { 5837 res = 0xffffffff; 5838 float_raise( float_flag_invalid STATUS_VAR); 5839 } else { 5840 res = v; 5841 } 5842 return res; 5843 } 5844 5845 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 5846 { 5847 int64_t v; 5848 unsigned int res; 5849 5850 v = float32_to_int64_round_to_zero(a STATUS_VAR); 5851 if (v < 0) { 5852 res = 0; 5853 float_raise( float_flag_invalid STATUS_VAR); 5854 } else if (v > 0xffffffff) { 5855 res = 0xffffffff; 5856 float_raise( float_flag_invalid STATUS_VAR); 5857 } else { 5858 res = v; 5859 } 5860 return res; 5861 } 5862 5863 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM ) 5864 { 5865 int64_t v; 5866 unsigned int res; 5867 5868 v = float32_to_int64_round_to_zero(a STATUS_VAR); 5869 if (v < 0) { 5870 res = 0; 5871 float_raise( float_flag_invalid STATUS_VAR); 5872 } else if (v > 0xffff) { 5873 res = 0xffff; 5874 float_raise( float_flag_invalid STATUS_VAR); 5875 } else { 5876 res = v; 5877 } 5878 return res; 5879 } 5880 5881 unsigned int float64_to_uint32( float64 a STATUS_PARAM ) 5882 { 5883 int64_t v; 5884 unsigned int res; 5885 5886 v = float64_to_int64(a STATUS_VAR); 5887 if (v < 0) { 5888 res = 0; 5889 float_raise( float_flag_invalid STATUS_VAR); 5890 } else if (v > 0xffffffff) { 5891 res = 0xffffffff; 5892 float_raise( float_flag_invalid STATUS_VAR); 5893 } else { 5894 res = v; 5895 } 5896 return res; 5897 } 5898 5899 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 5900 { 5901 int64_t v; 5902 unsigned int res; 5903 5904 v = float64_to_int64_round_to_zero(a STATUS_VAR); 5905 if (v < 0) { 5906 res = 0; 5907 float_raise( float_flag_invalid STATUS_VAR); 5908 } else if (v > 0xffffffff) { 5909 res = 0xffffffff; 5910 float_raise( float_flag_invalid STATUS_VAR); 5911 } else { 5912 res = v; 5913 } 5914 return res; 5915 } 5916 5917 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM ) 5918 { 5919 int64_t v; 5920 unsigned int res; 5921 5922 v = float64_to_int64_round_to_zero(a STATUS_VAR); 5923 if (v < 0) { 5924 res = 0; 5925 float_raise( float_flag_invalid STATUS_VAR); 5926 } else if (v > 0xffff) { 5927 res = 0xffff; 5928 float_raise( float_flag_invalid STATUS_VAR); 5929 } else { 5930 res = v; 5931 } 5932 return res; 5933 } 5934 5935 /* FIXME: This looks broken. */ 5936 uint64_t float64_to_uint64 (float64 a STATUS_PARAM) 5937 { 5938 int64_t v; 5939 5940 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5941 v += float64_val(a); 5942 v = float64_to_int64(make_float64(v) STATUS_VAR); 5943 5944 return v - INT64_MIN; 5945 } 5946 5947 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 5948 { 5949 int64_t v; 5950 5951 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 5952 v += float64_val(a); 5953 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); 5954 5955 return v - INT64_MIN; 5956 } 5957 5958 #define COMPARE(s, nan_exp) \ 5959 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ 5960 int is_quiet STATUS_PARAM ) \ 5961 { \ 5962 flag aSign, bSign; \ 5963 bits ## s av, bv; \ 5964 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 5965 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 5966 \ 5967 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 5968 extractFloat ## s ## Frac( a ) ) || \ 5969 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 5970 extractFloat ## s ## Frac( b ) )) { \ 5971 if (!is_quiet || \ 5972 float ## s ## _is_signaling_nan( a ) || \ 5973 float ## s ## _is_signaling_nan( b ) ) { \ 5974 float_raise( float_flag_invalid STATUS_VAR); \ 5975 } \ 5976 return float_relation_unordered; \ 5977 } \ 5978 aSign = extractFloat ## s ## Sign( a ); \ 5979 bSign = extractFloat ## s ## Sign( b ); \ 5980 av = float ## s ## _val(a); \ 5981 bv = float ## s ## _val(b); \ 5982 if ( aSign != bSign ) { \ 5983 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \ 5984 /* zero case */ \ 5985 return float_relation_equal; \ 5986 } else { \ 5987 return 1 - (2 * aSign); \ 5988 } \ 5989 } else { \ 5990 if (av == bv) { \ 5991 return float_relation_equal; \ 5992 } else { \ 5993 return 1 - 2 * (aSign ^ ( av < bv )); \ 5994 } \ 5995 } \ 5996 } \ 5997 \ 5998 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 5999 { \ 6000 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 6001 } \ 6002 \ 6003 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 6004 { \ 6005 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 6006 } 6007 6008 COMPARE(32, 0xff) 6009 COMPARE(64, 0x7ff) 6010 6011 INLINE int float128_compare_internal( float128 a, float128 b, 6012 int is_quiet STATUS_PARAM ) 6013 { 6014 flag aSign, bSign; 6015 6016 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6017 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6018 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6019 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6020 if (!is_quiet || 6021 float128_is_signaling_nan( a ) || 6022 float128_is_signaling_nan( b ) ) { 6023 float_raise( float_flag_invalid STATUS_VAR); 6024 } 6025 return float_relation_unordered; 6026 } 6027 aSign = extractFloat128Sign( a ); 6028 bSign = extractFloat128Sign( b ); 6029 if ( aSign != bSign ) { 6030 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6031 /* zero case */ 6032 return float_relation_equal; 6033 } else { 6034 return 1 - (2 * aSign); 6035 } 6036 } else { 6037 if (a.low == b.low && a.high == b.high) { 6038 return float_relation_equal; 6039 } else { 6040 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6041 } 6042 } 6043 } 6044 6045 int float128_compare( float128 a, float128 b STATUS_PARAM ) 6046 { 6047 return float128_compare_internal(a, b, 0 STATUS_VAR); 6048 } 6049 6050 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 6051 { 6052 return float128_compare_internal(a, b, 1 STATUS_VAR); 6053 } 6054 6055 /* Multiply A by 2 raised to the power N. */ 6056 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 6057 { 6058 flag aSign; 6059 int16 aExp; 6060 bits32 aSig; 6061 6062 a = float32_squash_input_denormal(a STATUS_VAR); 6063 aSig = extractFloat32Frac( a ); 6064 aExp = extractFloat32Exp( a ); 6065 aSign = extractFloat32Sign( a ); 6066 6067 if ( aExp == 0xFF ) { 6068 return a; 6069 } 6070 if ( aExp != 0 ) 6071 aSig |= 0x00800000; 6072 else if ( aSig == 0 ) 6073 return a; 6074 6075 aExp += n - 1; 6076 aSig <<= 7; 6077 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 6078 } 6079 6080 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 6081 { 6082 flag aSign; 6083 int16 aExp; 6084 bits64 aSig; 6085 6086 a = float64_squash_input_denormal(a STATUS_VAR); 6087 aSig = extractFloat64Frac( a ); 6088 aExp = extractFloat64Exp( a ); 6089 aSign = extractFloat64Sign( a ); 6090 6091 if ( aExp == 0x7FF ) { 6092 return a; 6093 } 6094 if ( aExp != 0 ) 6095 aSig |= LIT64( 0x0010000000000000 ); 6096 else if ( aSig == 0 ) 6097 return a; 6098 6099 aExp += n - 1; 6100 aSig <<= 10; 6101 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 6102 } 6103 6104 #ifdef FLOATX80 6105 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 6106 { 6107 flag aSign; 6108 int16 aExp; 6109 bits64 aSig; 6110 6111 aSig = extractFloatx80Frac( a ); 6112 aExp = extractFloatx80Exp( a ); 6113 aSign = extractFloatx80Sign( a ); 6114 6115 if ( aExp == 0x7FF ) { 6116 return a; 6117 } 6118 if (aExp == 0 && aSig == 0) 6119 return a; 6120 6121 aExp += n; 6122 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), 6123 aSign, aExp, aSig, 0 STATUS_VAR ); 6124 } 6125 #endif 6126 6127 #ifdef FLOAT128 6128 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 6129 { 6130 flag aSign; 6131 int32 aExp; 6132 bits64 aSig0, aSig1; 6133 6134 aSig1 = extractFloat128Frac1( a ); 6135 aSig0 = extractFloat128Frac0( a ); 6136 aExp = extractFloat128Exp( a ); 6137 aSign = extractFloat128Sign( a ); 6138 if ( aExp == 0x7FFF ) { 6139 return a; 6140 } 6141 if ( aExp != 0 ) 6142 aSig0 |= LIT64( 0x0001000000000000 ); 6143 else if ( aSig0 == 0 && aSig1 == 0 ) 6144 return a; 6145 6146 aExp += n - 1; 6147 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 6148 STATUS_VAR ); 6149 6150 } 6151 #endif 6152