1 /* 2 * QEMU float support 3 * 4 * Derived from SoftFloat. 5 */ 6 7 /*============================================================================ 8 9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic 10 Package, Release 2b. 11 12 Written by John R. Hauser. This work was made possible in part by the 13 International Computer Science Institute, located at Suite 600, 1947 Center 14 Street, Berkeley, California 94704. Funding was partially provided by the 15 National Science Foundation under grant MIP-9311980. The original version 16 of this code was written as part of a project to build a fixed-point vector 17 processor in collaboration with the University of California at Berkeley, 18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ 20 arithmetic/SoftFloat.html'. 21 22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has 23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES 24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS 25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, 26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE 27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE 28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR 29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. 30 31 Derivative works are acceptable, even for commercial purposes, so long as 32 (1) the source code for the derivative work includes prominent notice that 33 the work is derivative, and (2) the source code includes prominent notice with 34 these four paragraphs for those parts of this code that are retained. 35 36 =============================================================================*/ 37 38 /* softfloat (and in particular the code in softfloat-specialize.h) is 39 * target-dependent and needs the TARGET_* macros. 40 */ 41 #include "config.h" 42 43 #include "softfloat.h" 44 45 /*---------------------------------------------------------------------------- 46 | Primitive arithmetic functions, including multi-word arithmetic, and 47 | division and square root approximations. (Can be specialized to target if 48 | desired.) 49 *----------------------------------------------------------------------------*/ 50 #include "softfloat-macros.h" 51 52 /*---------------------------------------------------------------------------- 53 | Functions and definitions to determine: (1) whether tininess for underflow 54 | is detected before or after rounding by default, (2) what (if anything) 55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 57 | are propagated from function inputs to output. These details are target- 58 | specific. 59 *----------------------------------------------------------------------------*/ 60 #include "softfloat-specialize.h" 61 62 void set_float_rounding_mode(int val STATUS_PARAM) 63 { 64 STATUS(float_rounding_mode) = val; 65 } 66 67 void set_float_exception_flags(int val STATUS_PARAM) 68 { 69 STATUS(float_exception_flags) = val; 70 } 71 72 void set_floatx80_rounding_precision(int val STATUS_PARAM) 73 { 74 STATUS(floatx80_rounding_precision) = val; 75 } 76 77 /*---------------------------------------------------------------------------- 78 | Returns the fraction bits of the half-precision floating-point value `a'. 79 *----------------------------------------------------------------------------*/ 80 81 INLINE uint32_t extractFloat16Frac(float16 a) 82 { 83 return float16_val(a) & 0x3ff; 84 } 85 86 /*---------------------------------------------------------------------------- 87 | Returns the exponent bits of the half-precision floating-point value `a'. 88 *----------------------------------------------------------------------------*/ 89 90 INLINE int16 extractFloat16Exp(float16 a) 91 { 92 return (float16_val(a) >> 10) & 0x1f; 93 } 94 95 /*---------------------------------------------------------------------------- 96 | Returns the sign bit of the single-precision floating-point value `a'. 97 *----------------------------------------------------------------------------*/ 98 99 INLINE flag extractFloat16Sign(float16 a) 100 { 101 return float16_val(a)>>15; 102 } 103 104 /*---------------------------------------------------------------------------- 105 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 106 | and 7, and returns the properly rounded 32-bit integer corresponding to the 107 | input. If `zSign' is 1, the input is negated before being converted to an 108 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 109 | is simply rounded to an integer, with the inexact exception raised if the 110 | input cannot be represented exactly as an integer. However, if the fixed- 111 | point input is too large, the invalid exception is raised and the largest 112 | positive or negative integer is returned. 113 *----------------------------------------------------------------------------*/ 114 115 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM) 116 { 117 int8 roundingMode; 118 flag roundNearestEven; 119 int8 roundIncrement, roundBits; 120 int32 z; 121 122 roundingMode = STATUS(float_rounding_mode); 123 roundNearestEven = ( roundingMode == float_round_nearest_even ); 124 roundIncrement = 0x40; 125 if ( ! roundNearestEven ) { 126 if ( roundingMode == float_round_to_zero ) { 127 roundIncrement = 0; 128 } 129 else { 130 roundIncrement = 0x7F; 131 if ( zSign ) { 132 if ( roundingMode == float_round_up ) roundIncrement = 0; 133 } 134 else { 135 if ( roundingMode == float_round_down ) roundIncrement = 0; 136 } 137 } 138 } 139 roundBits = absZ & 0x7F; 140 absZ = ( absZ + roundIncrement )>>7; 141 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 142 z = absZ; 143 if ( zSign ) z = - z; 144 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 145 float_raise( float_flag_invalid STATUS_VAR); 146 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 147 } 148 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 149 return z; 150 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 155 | `absZ1', with binary point between bits 63 and 64 (between the input words), 156 | and returns the properly rounded 64-bit integer corresponding to the input. 157 | If `zSign' is 1, the input is negated before being converted to an integer. 158 | Ordinarily, the fixed-point input is simply rounded to an integer, with 159 | the inexact exception raised if the input cannot be represented exactly as 160 | an integer. However, if the fixed-point input is too large, the invalid 161 | exception is raised and the largest positive or negative integer is 162 | returned. 163 *----------------------------------------------------------------------------*/ 164 165 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM) 166 { 167 int8 roundingMode; 168 flag roundNearestEven, increment; 169 int64 z; 170 171 roundingMode = STATUS(float_rounding_mode); 172 roundNearestEven = ( roundingMode == float_round_nearest_even ); 173 increment = ( (int64_t) absZ1 < 0 ); 174 if ( ! roundNearestEven ) { 175 if ( roundingMode == float_round_to_zero ) { 176 increment = 0; 177 } 178 else { 179 if ( zSign ) { 180 increment = ( roundingMode == float_round_down ) && absZ1; 181 } 182 else { 183 increment = ( roundingMode == float_round_up ) && absZ1; 184 } 185 } 186 } 187 if ( increment ) { 188 ++absZ0; 189 if ( absZ0 == 0 ) goto overflow; 190 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 191 } 192 z = absZ0; 193 if ( zSign ) z = - z; 194 if ( z && ( ( z < 0 ) ^ zSign ) ) { 195 overflow: 196 float_raise( float_flag_invalid STATUS_VAR); 197 return 198 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 199 : LIT64( 0x7FFFFFFFFFFFFFFF ); 200 } 201 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 202 return z; 203 204 } 205 206 /*---------------------------------------------------------------------------- 207 | Returns the fraction bits of the single-precision floating-point value `a'. 208 *----------------------------------------------------------------------------*/ 209 210 INLINE uint32_t extractFloat32Frac( float32 a ) 211 { 212 213 return float32_val(a) & 0x007FFFFF; 214 215 } 216 217 /*---------------------------------------------------------------------------- 218 | Returns the exponent bits of the single-precision floating-point value `a'. 219 *----------------------------------------------------------------------------*/ 220 221 INLINE int16 extractFloat32Exp( float32 a ) 222 { 223 224 return ( float32_val(a)>>23 ) & 0xFF; 225 226 } 227 228 /*---------------------------------------------------------------------------- 229 | Returns the sign bit of the single-precision floating-point value `a'. 230 *----------------------------------------------------------------------------*/ 231 232 INLINE flag extractFloat32Sign( float32 a ) 233 { 234 235 return float32_val(a)>>31; 236 237 } 238 239 /*---------------------------------------------------------------------------- 240 | If `a' is denormal and we are in flush-to-zero mode then set the 241 | input-denormal exception and return zero. Otherwise just return the value. 242 *----------------------------------------------------------------------------*/ 243 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM) 244 { 245 if (STATUS(flush_inputs_to_zero)) { 246 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 247 float_raise(float_flag_input_denormal STATUS_VAR); 248 return make_float32(float32_val(a) & 0x80000000); 249 } 250 } 251 return a; 252 } 253 254 /*---------------------------------------------------------------------------- 255 | Normalizes the subnormal single-precision floating-point value represented 256 | by the denormalized significand `aSig'. The normalized exponent and 257 | significand are stored at the locations pointed to by `zExpPtr' and 258 | `zSigPtr', respectively. 259 *----------------------------------------------------------------------------*/ 260 261 static void 262 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr ) 263 { 264 int8 shiftCount; 265 266 shiftCount = countLeadingZeros32( aSig ) - 8; 267 *zSigPtr = aSig<<shiftCount; 268 *zExpPtr = 1 - shiftCount; 269 270 } 271 272 /*---------------------------------------------------------------------------- 273 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 274 | single-precision floating-point value, returning the result. After being 275 | shifted into the proper positions, the three fields are simply added 276 | together to form the result. This means that any integer portion of `zSig' 277 | will be added into the exponent. Since a properly normalized significand 278 | will have an integer portion equal to 1, the `zExp' input should be 1 less 279 | than the desired result exponent whenever `zSig' is a complete, normalized 280 | significand. 281 *----------------------------------------------------------------------------*/ 282 283 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig ) 284 { 285 286 return make_float32( 287 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 288 289 } 290 291 /*---------------------------------------------------------------------------- 292 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 293 | and significand `zSig', and returns the proper single-precision floating- 294 | point value corresponding to the abstract input. Ordinarily, the abstract 295 | value is simply rounded and packed into the single-precision format, with 296 | the inexact exception raised if the abstract input cannot be represented 297 | exactly. However, if the abstract value is too large, the overflow and 298 | inexact exceptions are raised and an infinity or maximal finite value is 299 | returned. If the abstract value is too small, the input value is rounded to 300 | a subnormal number, and the underflow and inexact exceptions are raised if 301 | the abstract input cannot be represented exactly as a subnormal single- 302 | precision floating-point number. 303 | The input significand `zSig' has its binary point between bits 30 304 | and 29, which is 7 bits to the left of the usual location. This shifted 305 | significand must be normalized or smaller. If `zSig' is not normalized, 306 | `zExp' must be 0; in that case, the result returned is a subnormal number, 307 | and it must not require rounding. In the usual case that `zSig' is 308 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 309 | The handling of underflow and overflow follows the IEC/IEEE Standard for 310 | Binary Floating-Point Arithmetic. 311 *----------------------------------------------------------------------------*/ 312 313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM) 314 { 315 int8 roundingMode; 316 flag roundNearestEven; 317 int8 roundIncrement, roundBits; 318 flag isTiny; 319 320 roundingMode = STATUS(float_rounding_mode); 321 roundNearestEven = ( roundingMode == float_round_nearest_even ); 322 roundIncrement = 0x40; 323 if ( ! roundNearestEven ) { 324 if ( roundingMode == float_round_to_zero ) { 325 roundIncrement = 0; 326 } 327 else { 328 roundIncrement = 0x7F; 329 if ( zSign ) { 330 if ( roundingMode == float_round_up ) roundIncrement = 0; 331 } 332 else { 333 if ( roundingMode == float_round_down ) roundIncrement = 0; 334 } 335 } 336 } 337 roundBits = zSig & 0x7F; 338 if ( 0xFD <= (uint16_t) zExp ) { 339 if ( ( 0xFD < zExp ) 340 || ( ( zExp == 0xFD ) 341 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 342 ) { 343 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 344 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 345 } 346 if ( zExp < 0 ) { 347 if (STATUS(flush_to_zero)) { 348 float_raise(float_flag_output_denormal STATUS_VAR); 349 return packFloat32(zSign, 0, 0); 350 } 351 isTiny = 352 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 353 || ( zExp < -1 ) 354 || ( zSig + roundIncrement < 0x80000000 ); 355 shift32RightJamming( zSig, - zExp, &zSig ); 356 zExp = 0; 357 roundBits = zSig & 0x7F; 358 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 359 } 360 } 361 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 362 zSig = ( zSig + roundIncrement )>>7; 363 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 364 if ( zSig == 0 ) zExp = 0; 365 return packFloat32( zSign, zExp, zSig ); 366 367 } 368 369 /*---------------------------------------------------------------------------- 370 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 371 | and significand `zSig', and returns the proper single-precision floating- 372 | point value corresponding to the abstract input. This routine is just like 373 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 374 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 375 | floating-point exponent. 376 *----------------------------------------------------------------------------*/ 377 378 static float32 379 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM) 380 { 381 int8 shiftCount; 382 383 shiftCount = countLeadingZeros32( zSig ) - 1; 384 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 385 386 } 387 388 /*---------------------------------------------------------------------------- 389 | Returns the fraction bits of the double-precision floating-point value `a'. 390 *----------------------------------------------------------------------------*/ 391 392 INLINE uint64_t extractFloat64Frac( float64 a ) 393 { 394 395 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 396 397 } 398 399 /*---------------------------------------------------------------------------- 400 | Returns the exponent bits of the double-precision floating-point value `a'. 401 *----------------------------------------------------------------------------*/ 402 403 INLINE int16 extractFloat64Exp( float64 a ) 404 { 405 406 return ( float64_val(a)>>52 ) & 0x7FF; 407 408 } 409 410 /*---------------------------------------------------------------------------- 411 | Returns the sign bit of the double-precision floating-point value `a'. 412 *----------------------------------------------------------------------------*/ 413 414 INLINE flag extractFloat64Sign( float64 a ) 415 { 416 417 return float64_val(a)>>63; 418 419 } 420 421 /*---------------------------------------------------------------------------- 422 | If `a' is denormal and we are in flush-to-zero mode then set the 423 | input-denormal exception and return zero. Otherwise just return the value. 424 *----------------------------------------------------------------------------*/ 425 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM) 426 { 427 if (STATUS(flush_inputs_to_zero)) { 428 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 429 float_raise(float_flag_input_denormal STATUS_VAR); 430 return make_float64(float64_val(a) & (1ULL << 63)); 431 } 432 } 433 return a; 434 } 435 436 /*---------------------------------------------------------------------------- 437 | Normalizes the subnormal double-precision floating-point value represented 438 | by the denormalized significand `aSig'. The normalized exponent and 439 | significand are stored at the locations pointed to by `zExpPtr' and 440 | `zSigPtr', respectively. 441 *----------------------------------------------------------------------------*/ 442 443 static void 444 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr ) 445 { 446 int8 shiftCount; 447 448 shiftCount = countLeadingZeros64( aSig ) - 11; 449 *zSigPtr = aSig<<shiftCount; 450 *zExpPtr = 1 - shiftCount; 451 452 } 453 454 /*---------------------------------------------------------------------------- 455 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 456 | double-precision floating-point value, returning the result. After being 457 | shifted into the proper positions, the three fields are simply added 458 | together to form the result. This means that any integer portion of `zSig' 459 | will be added into the exponent. Since a properly normalized significand 460 | will have an integer portion equal to 1, the `zExp' input should be 1 less 461 | than the desired result exponent whenever `zSig' is a complete, normalized 462 | significand. 463 *----------------------------------------------------------------------------*/ 464 465 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig ) 466 { 467 468 return make_float64( 469 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 470 471 } 472 473 /*---------------------------------------------------------------------------- 474 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 475 | and significand `zSig', and returns the proper double-precision floating- 476 | point value corresponding to the abstract input. Ordinarily, the abstract 477 | value is simply rounded and packed into the double-precision format, with 478 | the inexact exception raised if the abstract input cannot be represented 479 | exactly. However, if the abstract value is too large, the overflow and 480 | inexact exceptions are raised and an infinity or maximal finite value is 481 | returned. If the abstract value is too small, the input value is rounded 482 | to a subnormal number, and the underflow and inexact exceptions are raised 483 | if the abstract input cannot be represented exactly as a subnormal double- 484 | precision floating-point number. 485 | The input significand `zSig' has its binary point between bits 62 486 | and 61, which is 10 bits to the left of the usual location. This shifted 487 | significand must be normalized or smaller. If `zSig' is not normalized, 488 | `zExp' must be 0; in that case, the result returned is a subnormal number, 489 | and it must not require rounding. In the usual case that `zSig' is 490 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 491 | The handling of underflow and overflow follows the IEC/IEEE Standard for 492 | Binary Floating-Point Arithmetic. 493 *----------------------------------------------------------------------------*/ 494 495 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM) 496 { 497 int8 roundingMode; 498 flag roundNearestEven; 499 int16 roundIncrement, roundBits; 500 flag isTiny; 501 502 roundingMode = STATUS(float_rounding_mode); 503 roundNearestEven = ( roundingMode == float_round_nearest_even ); 504 roundIncrement = 0x200; 505 if ( ! roundNearestEven ) { 506 if ( roundingMode == float_round_to_zero ) { 507 roundIncrement = 0; 508 } 509 else { 510 roundIncrement = 0x3FF; 511 if ( zSign ) { 512 if ( roundingMode == float_round_up ) roundIncrement = 0; 513 } 514 else { 515 if ( roundingMode == float_round_down ) roundIncrement = 0; 516 } 517 } 518 } 519 roundBits = zSig & 0x3FF; 520 if ( 0x7FD <= (uint16_t) zExp ) { 521 if ( ( 0x7FD < zExp ) 522 || ( ( zExp == 0x7FD ) 523 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 524 ) { 525 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 526 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 527 } 528 if ( zExp < 0 ) { 529 if (STATUS(flush_to_zero)) { 530 float_raise(float_flag_output_denormal STATUS_VAR); 531 return packFloat64(zSign, 0, 0); 532 } 533 isTiny = 534 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 535 || ( zExp < -1 ) 536 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 537 shift64RightJamming( zSig, - zExp, &zSig ); 538 zExp = 0; 539 roundBits = zSig & 0x3FF; 540 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 541 } 542 } 543 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 544 zSig = ( zSig + roundIncrement )>>10; 545 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 546 if ( zSig == 0 ) zExp = 0; 547 return packFloat64( zSign, zExp, zSig ); 548 549 } 550 551 /*---------------------------------------------------------------------------- 552 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 553 | and significand `zSig', and returns the proper double-precision floating- 554 | point value corresponding to the abstract input. This routine is just like 555 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 556 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 557 | floating-point exponent. 558 *----------------------------------------------------------------------------*/ 559 560 static float64 561 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM) 562 { 563 int8 shiftCount; 564 565 shiftCount = countLeadingZeros64( zSig ) - 1; 566 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 567 568 } 569 570 /*---------------------------------------------------------------------------- 571 | Returns the fraction bits of the extended double-precision floating-point 572 | value `a'. 573 *----------------------------------------------------------------------------*/ 574 575 INLINE uint64_t extractFloatx80Frac( floatx80 a ) 576 { 577 578 return a.low; 579 580 } 581 582 /*---------------------------------------------------------------------------- 583 | Returns the exponent bits of the extended double-precision floating-point 584 | value `a'. 585 *----------------------------------------------------------------------------*/ 586 587 INLINE int32 extractFloatx80Exp( floatx80 a ) 588 { 589 590 return a.high & 0x7FFF; 591 592 } 593 594 /*---------------------------------------------------------------------------- 595 | Returns the sign bit of the extended double-precision floating-point value 596 | `a'. 597 *----------------------------------------------------------------------------*/ 598 599 INLINE flag extractFloatx80Sign( floatx80 a ) 600 { 601 602 return a.high>>15; 603 604 } 605 606 /*---------------------------------------------------------------------------- 607 | Normalizes the subnormal extended double-precision floating-point value 608 | represented by the denormalized significand `aSig'. The normalized exponent 609 | and significand are stored at the locations pointed to by `zExpPtr' and 610 | `zSigPtr', respectively. 611 *----------------------------------------------------------------------------*/ 612 613 static void 614 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr ) 615 { 616 int8 shiftCount; 617 618 shiftCount = countLeadingZeros64( aSig ); 619 *zSigPtr = aSig<<shiftCount; 620 *zExpPtr = 1 - shiftCount; 621 622 } 623 624 /*---------------------------------------------------------------------------- 625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 626 | extended double-precision floating-point value, returning the result. 627 *----------------------------------------------------------------------------*/ 628 629 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig ) 630 { 631 floatx80 z; 632 633 z.low = zSig; 634 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 635 return z; 636 637 } 638 639 /*---------------------------------------------------------------------------- 640 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 641 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 642 | and returns the proper extended double-precision floating-point value 643 | corresponding to the abstract input. Ordinarily, the abstract value is 644 | rounded and packed into the extended double-precision format, with the 645 | inexact exception raised if the abstract input cannot be represented 646 | exactly. However, if the abstract value is too large, the overflow and 647 | inexact exceptions are raised and an infinity or maximal finite value is 648 | returned. If the abstract value is too small, the input value is rounded to 649 | a subnormal number, and the underflow and inexact exceptions are raised if 650 | the abstract input cannot be represented exactly as a subnormal extended 651 | double-precision floating-point number. 652 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 653 | number of bits as single or double precision, respectively. Otherwise, the 654 | result is rounded to the full precision of the extended double-precision 655 | format. 656 | The input significand must be normalized or smaller. If the input 657 | significand is not normalized, `zExp' must be 0; in that case, the result 658 | returned is a subnormal number, and it must not require rounding. The 659 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 660 | Floating-Point Arithmetic. 661 *----------------------------------------------------------------------------*/ 662 663 static floatx80 664 roundAndPackFloatx80( 665 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 666 STATUS_PARAM) 667 { 668 int8 roundingMode; 669 flag roundNearestEven, increment, isTiny; 670 int64 roundIncrement, roundMask, roundBits; 671 672 roundingMode = STATUS(float_rounding_mode); 673 roundNearestEven = ( roundingMode == float_round_nearest_even ); 674 if ( roundingPrecision == 80 ) goto precision80; 675 if ( roundingPrecision == 64 ) { 676 roundIncrement = LIT64( 0x0000000000000400 ); 677 roundMask = LIT64( 0x00000000000007FF ); 678 } 679 else if ( roundingPrecision == 32 ) { 680 roundIncrement = LIT64( 0x0000008000000000 ); 681 roundMask = LIT64( 0x000000FFFFFFFFFF ); 682 } 683 else { 684 goto precision80; 685 } 686 zSig0 |= ( zSig1 != 0 ); 687 if ( ! roundNearestEven ) { 688 if ( roundingMode == float_round_to_zero ) { 689 roundIncrement = 0; 690 } 691 else { 692 roundIncrement = roundMask; 693 if ( zSign ) { 694 if ( roundingMode == float_round_up ) roundIncrement = 0; 695 } 696 else { 697 if ( roundingMode == float_round_down ) roundIncrement = 0; 698 } 699 } 700 } 701 roundBits = zSig0 & roundMask; 702 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 703 if ( ( 0x7FFE < zExp ) 704 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 705 ) { 706 goto overflow; 707 } 708 if ( zExp <= 0 ) { 709 if (STATUS(flush_to_zero)) { 710 float_raise(float_flag_output_denormal STATUS_VAR); 711 return packFloatx80(zSign, 0, 0); 712 } 713 isTiny = 714 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 715 || ( zExp < 0 ) 716 || ( zSig0 <= zSig0 + roundIncrement ); 717 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 718 zExp = 0; 719 roundBits = zSig0 & roundMask; 720 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 721 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 722 zSig0 += roundIncrement; 723 if ( (int64_t) zSig0 < 0 ) zExp = 1; 724 roundIncrement = roundMask + 1; 725 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 726 roundMask |= roundIncrement; 727 } 728 zSig0 &= ~ roundMask; 729 return packFloatx80( zSign, zExp, zSig0 ); 730 } 731 } 732 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 733 zSig0 += roundIncrement; 734 if ( zSig0 < roundIncrement ) { 735 ++zExp; 736 zSig0 = LIT64( 0x8000000000000000 ); 737 } 738 roundIncrement = roundMask + 1; 739 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 740 roundMask |= roundIncrement; 741 } 742 zSig0 &= ~ roundMask; 743 if ( zSig0 == 0 ) zExp = 0; 744 return packFloatx80( zSign, zExp, zSig0 ); 745 precision80: 746 increment = ( (int64_t) zSig1 < 0 ); 747 if ( ! roundNearestEven ) { 748 if ( roundingMode == float_round_to_zero ) { 749 increment = 0; 750 } 751 else { 752 if ( zSign ) { 753 increment = ( roundingMode == float_round_down ) && zSig1; 754 } 755 else { 756 increment = ( roundingMode == float_round_up ) && zSig1; 757 } 758 } 759 } 760 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 761 if ( ( 0x7FFE < zExp ) 762 || ( ( zExp == 0x7FFE ) 763 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 764 && increment 765 ) 766 ) { 767 roundMask = 0; 768 overflow: 769 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 770 if ( ( roundingMode == float_round_to_zero ) 771 || ( zSign && ( roundingMode == float_round_up ) ) 772 || ( ! zSign && ( roundingMode == float_round_down ) ) 773 ) { 774 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 775 } 776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 777 } 778 if ( zExp <= 0 ) { 779 isTiny = 780 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 781 || ( zExp < 0 ) 782 || ! increment 783 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 784 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 785 zExp = 0; 786 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 787 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 788 if ( roundNearestEven ) { 789 increment = ( (int64_t) zSig1 < 0 ); 790 } 791 else { 792 if ( zSign ) { 793 increment = ( roundingMode == float_round_down ) && zSig1; 794 } 795 else { 796 increment = ( roundingMode == float_round_up ) && zSig1; 797 } 798 } 799 if ( increment ) { 800 ++zSig0; 801 zSig0 &= 802 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 803 if ( (int64_t) zSig0 < 0 ) zExp = 1; 804 } 805 return packFloatx80( zSign, zExp, zSig0 ); 806 } 807 } 808 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 809 if ( increment ) { 810 ++zSig0; 811 if ( zSig0 == 0 ) { 812 ++zExp; 813 zSig0 = LIT64( 0x8000000000000000 ); 814 } 815 else { 816 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 817 } 818 } 819 else { 820 if ( zSig0 == 0 ) zExp = 0; 821 } 822 return packFloatx80( zSign, zExp, zSig0 ); 823 824 } 825 826 /*---------------------------------------------------------------------------- 827 | Takes an abstract floating-point value having sign `zSign', exponent 828 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 829 | and returns the proper extended double-precision floating-point value 830 | corresponding to the abstract input. This routine is just like 831 | `roundAndPackFloatx80' except that the input significand does not have to be 832 | normalized. 833 *----------------------------------------------------------------------------*/ 834 835 static floatx80 836 normalizeRoundAndPackFloatx80( 837 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 838 STATUS_PARAM) 839 { 840 int8 shiftCount; 841 842 if ( zSig0 == 0 ) { 843 zSig0 = zSig1; 844 zSig1 = 0; 845 zExp -= 64; 846 } 847 shiftCount = countLeadingZeros64( zSig0 ); 848 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 849 zExp -= shiftCount; 850 return 851 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 852 853 } 854 855 /*---------------------------------------------------------------------------- 856 | Returns the least-significant 64 fraction bits of the quadruple-precision 857 | floating-point value `a'. 858 *----------------------------------------------------------------------------*/ 859 860 INLINE uint64_t extractFloat128Frac1( float128 a ) 861 { 862 863 return a.low; 864 865 } 866 867 /*---------------------------------------------------------------------------- 868 | Returns the most-significant 48 fraction bits of the quadruple-precision 869 | floating-point value `a'. 870 *----------------------------------------------------------------------------*/ 871 872 INLINE uint64_t extractFloat128Frac0( float128 a ) 873 { 874 875 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 876 877 } 878 879 /*---------------------------------------------------------------------------- 880 | Returns the exponent bits of the quadruple-precision floating-point value 881 | `a'. 882 *----------------------------------------------------------------------------*/ 883 884 INLINE int32 extractFloat128Exp( float128 a ) 885 { 886 887 return ( a.high>>48 ) & 0x7FFF; 888 889 } 890 891 /*---------------------------------------------------------------------------- 892 | Returns the sign bit of the quadruple-precision floating-point value `a'. 893 *----------------------------------------------------------------------------*/ 894 895 INLINE flag extractFloat128Sign( float128 a ) 896 { 897 898 return a.high>>63; 899 900 } 901 902 /*---------------------------------------------------------------------------- 903 | Normalizes the subnormal quadruple-precision floating-point value 904 | represented by the denormalized significand formed by the concatenation of 905 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 906 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 907 | significand are stored at the location pointed to by `zSig0Ptr', and the 908 | least significant 64 bits of the normalized significand are stored at the 909 | location pointed to by `zSig1Ptr'. 910 *----------------------------------------------------------------------------*/ 911 912 static void 913 normalizeFloat128Subnormal( 914 uint64_t aSig0, 915 uint64_t aSig1, 916 int32 *zExpPtr, 917 uint64_t *zSig0Ptr, 918 uint64_t *zSig1Ptr 919 ) 920 { 921 int8 shiftCount; 922 923 if ( aSig0 == 0 ) { 924 shiftCount = countLeadingZeros64( aSig1 ) - 15; 925 if ( shiftCount < 0 ) { 926 *zSig0Ptr = aSig1>>( - shiftCount ); 927 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 928 } 929 else { 930 *zSig0Ptr = aSig1<<shiftCount; 931 *zSig1Ptr = 0; 932 } 933 *zExpPtr = - shiftCount - 63; 934 } 935 else { 936 shiftCount = countLeadingZeros64( aSig0 ) - 15; 937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 938 *zExpPtr = 1 - shiftCount; 939 } 940 941 } 942 943 /*---------------------------------------------------------------------------- 944 | Packs the sign `zSign', the exponent `zExp', and the significand formed 945 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 946 | floating-point value, returning the result. After being shifted into the 947 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 948 | added together to form the most significant 32 bits of the result. This 949 | means that any integer portion of `zSig0' will be added into the exponent. 950 | Since a properly normalized significand will have an integer portion equal 951 | to 1, the `zExp' input should be 1 less than the desired result exponent 952 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 953 | significand. 954 *----------------------------------------------------------------------------*/ 955 956 INLINE float128 957 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 ) 958 { 959 float128 z; 960 961 z.low = zSig1; 962 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 963 return z; 964 965 } 966 967 /*---------------------------------------------------------------------------- 968 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 969 | and extended significand formed by the concatenation of `zSig0', `zSig1', 970 | and `zSig2', and returns the proper quadruple-precision floating-point value 971 | corresponding to the abstract input. Ordinarily, the abstract value is 972 | simply rounded and packed into the quadruple-precision format, with the 973 | inexact exception raised if the abstract input cannot be represented 974 | exactly. However, if the abstract value is too large, the overflow and 975 | inexact exceptions are raised and an infinity or maximal finite value is 976 | returned. If the abstract value is too small, the input value is rounded to 977 | a subnormal number, and the underflow and inexact exceptions are raised if 978 | the abstract input cannot be represented exactly as a subnormal quadruple- 979 | precision floating-point number. 980 | The input significand must be normalized or smaller. If the input 981 | significand is not normalized, `zExp' must be 0; in that case, the result 982 | returned is a subnormal number, and it must not require rounding. In the 983 | usual case that the input significand is normalized, `zExp' must be 1 less 984 | than the ``true'' floating-point exponent. The handling of underflow and 985 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 986 *----------------------------------------------------------------------------*/ 987 988 static float128 989 roundAndPackFloat128( 990 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM) 991 { 992 int8 roundingMode; 993 flag roundNearestEven, increment, isTiny; 994 995 roundingMode = STATUS(float_rounding_mode); 996 roundNearestEven = ( roundingMode == float_round_nearest_even ); 997 increment = ( (int64_t) zSig2 < 0 ); 998 if ( ! roundNearestEven ) { 999 if ( roundingMode == float_round_to_zero ) { 1000 increment = 0; 1001 } 1002 else { 1003 if ( zSign ) { 1004 increment = ( roundingMode == float_round_down ) && zSig2; 1005 } 1006 else { 1007 increment = ( roundingMode == float_round_up ) && zSig2; 1008 } 1009 } 1010 } 1011 if ( 0x7FFD <= (uint32_t) zExp ) { 1012 if ( ( 0x7FFD < zExp ) 1013 || ( ( zExp == 0x7FFD ) 1014 && eq128( 1015 LIT64( 0x0001FFFFFFFFFFFF ), 1016 LIT64( 0xFFFFFFFFFFFFFFFF ), 1017 zSig0, 1018 zSig1 1019 ) 1020 && increment 1021 ) 1022 ) { 1023 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 1024 if ( ( roundingMode == float_round_to_zero ) 1025 || ( zSign && ( roundingMode == float_round_up ) ) 1026 || ( ! zSign && ( roundingMode == float_round_down ) ) 1027 ) { 1028 return 1029 packFloat128( 1030 zSign, 1031 0x7FFE, 1032 LIT64( 0x0000FFFFFFFFFFFF ), 1033 LIT64( 0xFFFFFFFFFFFFFFFF ) 1034 ); 1035 } 1036 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1037 } 1038 if ( zExp < 0 ) { 1039 if (STATUS(flush_to_zero)) { 1040 float_raise(float_flag_output_denormal STATUS_VAR); 1041 return packFloat128(zSign, 0, 0, 0); 1042 } 1043 isTiny = 1044 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 1045 || ( zExp < -1 ) 1046 || ! increment 1047 || lt128( 1048 zSig0, 1049 zSig1, 1050 LIT64( 0x0001FFFFFFFFFFFF ), 1051 LIT64( 0xFFFFFFFFFFFFFFFF ) 1052 ); 1053 shift128ExtraRightJamming( 1054 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1055 zExp = 0; 1056 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 1057 if ( roundNearestEven ) { 1058 increment = ( (int64_t) zSig2 < 0 ); 1059 } 1060 else { 1061 if ( zSign ) { 1062 increment = ( roundingMode == float_round_down ) && zSig2; 1063 } 1064 else { 1065 increment = ( roundingMode == float_round_up ) && zSig2; 1066 } 1067 } 1068 } 1069 } 1070 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 1071 if ( increment ) { 1072 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1073 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1074 } 1075 else { 1076 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1077 } 1078 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1079 1080 } 1081 1082 /*---------------------------------------------------------------------------- 1083 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1084 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1085 | returns the proper quadruple-precision floating-point value corresponding 1086 | to the abstract input. This routine is just like `roundAndPackFloat128' 1087 | except that the input significand has fewer bits and does not have to be 1088 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1089 | point exponent. 1090 *----------------------------------------------------------------------------*/ 1091 1092 static float128 1093 normalizeRoundAndPackFloat128( 1094 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM) 1095 { 1096 int8 shiftCount; 1097 uint64_t zSig2; 1098 1099 if ( zSig0 == 0 ) { 1100 zSig0 = zSig1; 1101 zSig1 = 0; 1102 zExp -= 64; 1103 } 1104 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1105 if ( 0 <= shiftCount ) { 1106 zSig2 = 0; 1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1108 } 1109 else { 1110 shift128ExtraRightJamming( 1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1112 } 1113 zExp -= shiftCount; 1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1115 1116 } 1117 1118 /*---------------------------------------------------------------------------- 1119 | Returns the result of converting the 32-bit two's complement integer `a' 1120 | to the single-precision floating-point format. The conversion is performed 1121 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1122 *----------------------------------------------------------------------------*/ 1123 1124 float32 int32_to_float32( int32 a STATUS_PARAM ) 1125 { 1126 flag zSign; 1127 1128 if ( a == 0 ) return float32_zero; 1129 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1130 zSign = ( a < 0 ); 1131 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1132 1133 } 1134 1135 /*---------------------------------------------------------------------------- 1136 | Returns the result of converting the 32-bit two's complement integer `a' 1137 | to the double-precision floating-point format. The conversion is performed 1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1139 *----------------------------------------------------------------------------*/ 1140 1141 float64 int32_to_float64( int32 a STATUS_PARAM ) 1142 { 1143 flag zSign; 1144 uint32 absA; 1145 int8 shiftCount; 1146 uint64_t zSig; 1147 1148 if ( a == 0 ) return float64_zero; 1149 zSign = ( a < 0 ); 1150 absA = zSign ? - a : a; 1151 shiftCount = countLeadingZeros32( absA ) + 21; 1152 zSig = absA; 1153 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1154 1155 } 1156 1157 /*---------------------------------------------------------------------------- 1158 | Returns the result of converting the 32-bit two's complement integer `a' 1159 | to the extended double-precision floating-point format. The conversion 1160 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1161 | Arithmetic. 1162 *----------------------------------------------------------------------------*/ 1163 1164 floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) 1165 { 1166 flag zSign; 1167 uint32 absA; 1168 int8 shiftCount; 1169 uint64_t zSig; 1170 1171 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1172 zSign = ( a < 0 ); 1173 absA = zSign ? - a : a; 1174 shiftCount = countLeadingZeros32( absA ) + 32; 1175 zSig = absA; 1176 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1177 1178 } 1179 1180 /*---------------------------------------------------------------------------- 1181 | Returns the result of converting the 32-bit two's complement integer `a' to 1182 | the quadruple-precision floating-point format. The conversion is performed 1183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1184 *----------------------------------------------------------------------------*/ 1185 1186 float128 int32_to_float128( int32 a STATUS_PARAM ) 1187 { 1188 flag zSign; 1189 uint32 absA; 1190 int8 shiftCount; 1191 uint64_t zSig0; 1192 1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1194 zSign = ( a < 0 ); 1195 absA = zSign ? - a : a; 1196 shiftCount = countLeadingZeros32( absA ) + 17; 1197 zSig0 = absA; 1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1199 1200 } 1201 1202 /*---------------------------------------------------------------------------- 1203 | Returns the result of converting the 64-bit two's complement integer `a' 1204 | to the single-precision floating-point format. The conversion is performed 1205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1206 *----------------------------------------------------------------------------*/ 1207 1208 float32 int64_to_float32( int64 a STATUS_PARAM ) 1209 { 1210 flag zSign; 1211 uint64 absA; 1212 int8 shiftCount; 1213 1214 if ( a == 0 ) return float32_zero; 1215 zSign = ( a < 0 ); 1216 absA = zSign ? - a : a; 1217 shiftCount = countLeadingZeros64( absA ) - 40; 1218 if ( 0 <= shiftCount ) { 1219 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1220 } 1221 else { 1222 shiftCount += 7; 1223 if ( shiftCount < 0 ) { 1224 shift64RightJamming( absA, - shiftCount, &absA ); 1225 } 1226 else { 1227 absA <<= shiftCount; 1228 } 1229 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1230 } 1231 1232 } 1233 1234 float32 uint64_to_float32( uint64 a STATUS_PARAM ) 1235 { 1236 int8 shiftCount; 1237 1238 if ( a == 0 ) return float32_zero; 1239 shiftCount = countLeadingZeros64( a ) - 40; 1240 if ( 0 <= shiftCount ) { 1241 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); 1242 } 1243 else { 1244 shiftCount += 7; 1245 if ( shiftCount < 0 ) { 1246 shift64RightJamming( a, - shiftCount, &a ); 1247 } 1248 else { 1249 a <<= shiftCount; 1250 } 1251 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); 1252 } 1253 } 1254 1255 /*---------------------------------------------------------------------------- 1256 | Returns the result of converting the 64-bit two's complement integer `a' 1257 | to the double-precision floating-point format. The conversion is performed 1258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1259 *----------------------------------------------------------------------------*/ 1260 1261 float64 int64_to_float64( int64 a STATUS_PARAM ) 1262 { 1263 flag zSign; 1264 1265 if ( a == 0 ) return float64_zero; 1266 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1267 return packFloat64( 1, 0x43E, 0 ); 1268 } 1269 zSign = ( a < 0 ); 1270 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1271 1272 } 1273 1274 float64 uint64_to_float64( uint64 a STATUS_PARAM ) 1275 { 1276 if ( a == 0 ) return float64_zero; 1277 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); 1278 1279 } 1280 1281 /*---------------------------------------------------------------------------- 1282 | Returns the result of converting the 64-bit two's complement integer `a' 1283 | to the extended double-precision floating-point format. The conversion 1284 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1285 | Arithmetic. 1286 *----------------------------------------------------------------------------*/ 1287 1288 floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) 1289 { 1290 flag zSign; 1291 uint64 absA; 1292 int8 shiftCount; 1293 1294 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1295 zSign = ( a < 0 ); 1296 absA = zSign ? - a : a; 1297 shiftCount = countLeadingZeros64( absA ); 1298 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1299 1300 } 1301 1302 /*---------------------------------------------------------------------------- 1303 | Returns the result of converting the 64-bit two's complement integer `a' to 1304 | the quadruple-precision floating-point format. The conversion is performed 1305 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1306 *----------------------------------------------------------------------------*/ 1307 1308 float128 int64_to_float128( int64 a STATUS_PARAM ) 1309 { 1310 flag zSign; 1311 uint64 absA; 1312 int8 shiftCount; 1313 int32 zExp; 1314 uint64_t zSig0, zSig1; 1315 1316 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1317 zSign = ( a < 0 ); 1318 absA = zSign ? - a : a; 1319 shiftCount = countLeadingZeros64( absA ) + 49; 1320 zExp = 0x406E - shiftCount; 1321 if ( 64 <= shiftCount ) { 1322 zSig1 = 0; 1323 zSig0 = absA; 1324 shiftCount -= 64; 1325 } 1326 else { 1327 zSig1 = absA; 1328 zSig0 = 0; 1329 } 1330 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1331 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1332 1333 } 1334 1335 /*---------------------------------------------------------------------------- 1336 | Returns the result of converting the single-precision floating-point value 1337 | `a' to the 32-bit two's complement integer format. The conversion is 1338 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1339 | Arithmetic---which means in particular that the conversion is rounded 1340 | according to the current rounding mode. If `a' is a NaN, the largest 1341 | positive integer is returned. Otherwise, if the conversion overflows, the 1342 | largest integer with the same sign as `a' is returned. 1343 *----------------------------------------------------------------------------*/ 1344 1345 int32 float32_to_int32( float32 a STATUS_PARAM ) 1346 { 1347 flag aSign; 1348 int16 aExp, shiftCount; 1349 uint32_t aSig; 1350 uint64_t aSig64; 1351 1352 a = float32_squash_input_denormal(a STATUS_VAR); 1353 aSig = extractFloat32Frac( a ); 1354 aExp = extractFloat32Exp( a ); 1355 aSign = extractFloat32Sign( a ); 1356 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1357 if ( aExp ) aSig |= 0x00800000; 1358 shiftCount = 0xAF - aExp; 1359 aSig64 = aSig; 1360 aSig64 <<= 32; 1361 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1362 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1363 1364 } 1365 1366 /*---------------------------------------------------------------------------- 1367 | Returns the result of converting the single-precision floating-point value 1368 | `a' to the 32-bit two's complement integer format. The conversion is 1369 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1370 | Arithmetic, except that the conversion is always rounded toward zero. 1371 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1372 | the conversion overflows, the largest integer with the same sign as `a' is 1373 | returned. 1374 *----------------------------------------------------------------------------*/ 1375 1376 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1377 { 1378 flag aSign; 1379 int16 aExp, shiftCount; 1380 uint32_t aSig; 1381 int32 z; 1382 a = float32_squash_input_denormal(a STATUS_VAR); 1383 1384 aSig = extractFloat32Frac( a ); 1385 aExp = extractFloat32Exp( a ); 1386 aSign = extractFloat32Sign( a ); 1387 shiftCount = aExp - 0x9E; 1388 if ( 0 <= shiftCount ) { 1389 if ( float32_val(a) != 0xCF000000 ) { 1390 float_raise( float_flag_invalid STATUS_VAR); 1391 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1392 } 1393 return (int32_t) 0x80000000; 1394 } 1395 else if ( aExp <= 0x7E ) { 1396 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1397 return 0; 1398 } 1399 aSig = ( aSig | 0x00800000 )<<8; 1400 z = aSig>>( - shiftCount ); 1401 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1402 STATUS(float_exception_flags) |= float_flag_inexact; 1403 } 1404 if ( aSign ) z = - z; 1405 return z; 1406 1407 } 1408 1409 /*---------------------------------------------------------------------------- 1410 | Returns the result of converting the single-precision floating-point value 1411 | `a' to the 16-bit two's complement integer format. The conversion is 1412 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1413 | Arithmetic, except that the conversion is always rounded toward zero. 1414 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1415 | the conversion overflows, the largest integer with the same sign as `a' is 1416 | returned. 1417 *----------------------------------------------------------------------------*/ 1418 1419 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM ) 1420 { 1421 flag aSign; 1422 int16 aExp, shiftCount; 1423 uint32_t aSig; 1424 int32 z; 1425 1426 aSig = extractFloat32Frac( a ); 1427 aExp = extractFloat32Exp( a ); 1428 aSign = extractFloat32Sign( a ); 1429 shiftCount = aExp - 0x8E; 1430 if ( 0 <= shiftCount ) { 1431 if ( float32_val(a) != 0xC7000000 ) { 1432 float_raise( float_flag_invalid STATUS_VAR); 1433 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1434 return 0x7FFF; 1435 } 1436 } 1437 return (int32_t) 0xffff8000; 1438 } 1439 else if ( aExp <= 0x7E ) { 1440 if ( aExp | aSig ) { 1441 STATUS(float_exception_flags) |= float_flag_inexact; 1442 } 1443 return 0; 1444 } 1445 shiftCount -= 0x10; 1446 aSig = ( aSig | 0x00800000 )<<8; 1447 z = aSig>>( - shiftCount ); 1448 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1449 STATUS(float_exception_flags) |= float_flag_inexact; 1450 } 1451 if ( aSign ) { 1452 z = - z; 1453 } 1454 return z; 1455 1456 } 1457 1458 /*---------------------------------------------------------------------------- 1459 | Returns the result of converting the single-precision floating-point value 1460 | `a' to the 64-bit two's complement integer format. The conversion is 1461 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1462 | Arithmetic---which means in particular that the conversion is rounded 1463 | according to the current rounding mode. If `a' is a NaN, the largest 1464 | positive integer is returned. Otherwise, if the conversion overflows, the 1465 | largest integer with the same sign as `a' is returned. 1466 *----------------------------------------------------------------------------*/ 1467 1468 int64 float32_to_int64( float32 a STATUS_PARAM ) 1469 { 1470 flag aSign; 1471 int16 aExp, shiftCount; 1472 uint32_t aSig; 1473 uint64_t aSig64, aSigExtra; 1474 a = float32_squash_input_denormal(a STATUS_VAR); 1475 1476 aSig = extractFloat32Frac( a ); 1477 aExp = extractFloat32Exp( a ); 1478 aSign = extractFloat32Sign( a ); 1479 shiftCount = 0xBE - aExp; 1480 if ( shiftCount < 0 ) { 1481 float_raise( float_flag_invalid STATUS_VAR); 1482 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1483 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1484 } 1485 return (int64_t) LIT64( 0x8000000000000000 ); 1486 } 1487 if ( aExp ) aSig |= 0x00800000; 1488 aSig64 = aSig; 1489 aSig64 <<= 40; 1490 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1491 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1492 1493 } 1494 1495 /*---------------------------------------------------------------------------- 1496 | Returns the result of converting the single-precision floating-point value 1497 | `a' to the 64-bit two's complement integer format. The conversion is 1498 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1499 | Arithmetic, except that the conversion is always rounded toward zero. If 1500 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1501 | conversion overflows, the largest integer with the same sign as `a' is 1502 | returned. 1503 *----------------------------------------------------------------------------*/ 1504 1505 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1506 { 1507 flag aSign; 1508 int16 aExp, shiftCount; 1509 uint32_t aSig; 1510 uint64_t aSig64; 1511 int64 z; 1512 a = float32_squash_input_denormal(a STATUS_VAR); 1513 1514 aSig = extractFloat32Frac( a ); 1515 aExp = extractFloat32Exp( a ); 1516 aSign = extractFloat32Sign( a ); 1517 shiftCount = aExp - 0xBE; 1518 if ( 0 <= shiftCount ) { 1519 if ( float32_val(a) != 0xDF000000 ) { 1520 float_raise( float_flag_invalid STATUS_VAR); 1521 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1522 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1523 } 1524 } 1525 return (int64_t) LIT64( 0x8000000000000000 ); 1526 } 1527 else if ( aExp <= 0x7E ) { 1528 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1529 return 0; 1530 } 1531 aSig64 = aSig | 0x00800000; 1532 aSig64 <<= 40; 1533 z = aSig64>>( - shiftCount ); 1534 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1535 STATUS(float_exception_flags) |= float_flag_inexact; 1536 } 1537 if ( aSign ) z = - z; 1538 return z; 1539 1540 } 1541 1542 /*---------------------------------------------------------------------------- 1543 | Returns the result of converting the single-precision floating-point value 1544 | `a' to the double-precision floating-point format. The conversion is 1545 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1546 | Arithmetic. 1547 *----------------------------------------------------------------------------*/ 1548 1549 float64 float32_to_float64( float32 a STATUS_PARAM ) 1550 { 1551 flag aSign; 1552 int16 aExp; 1553 uint32_t aSig; 1554 a = float32_squash_input_denormal(a STATUS_VAR); 1555 1556 aSig = extractFloat32Frac( a ); 1557 aExp = extractFloat32Exp( a ); 1558 aSign = extractFloat32Sign( a ); 1559 if ( aExp == 0xFF ) { 1560 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1561 return packFloat64( aSign, 0x7FF, 0 ); 1562 } 1563 if ( aExp == 0 ) { 1564 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1565 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1566 --aExp; 1567 } 1568 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1569 1570 } 1571 1572 /*---------------------------------------------------------------------------- 1573 | Returns the result of converting the single-precision floating-point value 1574 | `a' to the extended double-precision floating-point format. The conversion 1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1576 | Arithmetic. 1577 *----------------------------------------------------------------------------*/ 1578 1579 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1580 { 1581 flag aSign; 1582 int16 aExp; 1583 uint32_t aSig; 1584 1585 a = float32_squash_input_denormal(a STATUS_VAR); 1586 aSig = extractFloat32Frac( a ); 1587 aExp = extractFloat32Exp( a ); 1588 aSign = extractFloat32Sign( a ); 1589 if ( aExp == 0xFF ) { 1590 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1591 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1592 } 1593 if ( aExp == 0 ) { 1594 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1595 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1596 } 1597 aSig |= 0x00800000; 1598 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1599 1600 } 1601 1602 /*---------------------------------------------------------------------------- 1603 | Returns the result of converting the single-precision floating-point value 1604 | `a' to the double-precision floating-point format. The conversion is 1605 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1606 | Arithmetic. 1607 *----------------------------------------------------------------------------*/ 1608 1609 float128 float32_to_float128( float32 a STATUS_PARAM ) 1610 { 1611 flag aSign; 1612 int16 aExp; 1613 uint32_t aSig; 1614 1615 a = float32_squash_input_denormal(a STATUS_VAR); 1616 aSig = extractFloat32Frac( a ); 1617 aExp = extractFloat32Exp( a ); 1618 aSign = extractFloat32Sign( a ); 1619 if ( aExp == 0xFF ) { 1620 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1621 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1622 } 1623 if ( aExp == 0 ) { 1624 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1625 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1626 --aExp; 1627 } 1628 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1629 1630 } 1631 1632 /*---------------------------------------------------------------------------- 1633 | Rounds the single-precision floating-point value `a' to an integer, and 1634 | returns the result as a single-precision floating-point value. The 1635 | operation is performed according to the IEC/IEEE Standard for Binary 1636 | Floating-Point Arithmetic. 1637 *----------------------------------------------------------------------------*/ 1638 1639 float32 float32_round_to_int( float32 a STATUS_PARAM) 1640 { 1641 flag aSign; 1642 int16 aExp; 1643 uint32_t lastBitMask, roundBitsMask; 1644 int8 roundingMode; 1645 uint32_t z; 1646 a = float32_squash_input_denormal(a STATUS_VAR); 1647 1648 aExp = extractFloat32Exp( a ); 1649 if ( 0x96 <= aExp ) { 1650 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1651 return propagateFloat32NaN( a, a STATUS_VAR ); 1652 } 1653 return a; 1654 } 1655 if ( aExp <= 0x7E ) { 1656 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1657 STATUS(float_exception_flags) |= float_flag_inexact; 1658 aSign = extractFloat32Sign( a ); 1659 switch ( STATUS(float_rounding_mode) ) { 1660 case float_round_nearest_even: 1661 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1662 return packFloat32( aSign, 0x7F, 0 ); 1663 } 1664 break; 1665 case float_round_down: 1666 return make_float32(aSign ? 0xBF800000 : 0); 1667 case float_round_up: 1668 return make_float32(aSign ? 0x80000000 : 0x3F800000); 1669 } 1670 return packFloat32( aSign, 0, 0 ); 1671 } 1672 lastBitMask = 1; 1673 lastBitMask <<= 0x96 - aExp; 1674 roundBitsMask = lastBitMask - 1; 1675 z = float32_val(a); 1676 roundingMode = STATUS(float_rounding_mode); 1677 if ( roundingMode == float_round_nearest_even ) { 1678 z += lastBitMask>>1; 1679 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 1680 } 1681 else if ( roundingMode != float_round_to_zero ) { 1682 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { 1683 z += roundBitsMask; 1684 } 1685 } 1686 z &= ~ roundBitsMask; 1687 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1688 return make_float32(z); 1689 1690 } 1691 1692 /*---------------------------------------------------------------------------- 1693 | Returns the result of adding the absolute values of the single-precision 1694 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1695 | before being returned. `zSign' is ignored if the result is a NaN. 1696 | The addition is performed according to the IEC/IEEE Standard for Binary 1697 | Floating-Point Arithmetic. 1698 *----------------------------------------------------------------------------*/ 1699 1700 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1701 { 1702 int16 aExp, bExp, zExp; 1703 uint32_t aSig, bSig, zSig; 1704 int16 expDiff; 1705 1706 aSig = extractFloat32Frac( a ); 1707 aExp = extractFloat32Exp( a ); 1708 bSig = extractFloat32Frac( b ); 1709 bExp = extractFloat32Exp( b ); 1710 expDiff = aExp - bExp; 1711 aSig <<= 6; 1712 bSig <<= 6; 1713 if ( 0 < expDiff ) { 1714 if ( aExp == 0xFF ) { 1715 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1716 return a; 1717 } 1718 if ( bExp == 0 ) { 1719 --expDiff; 1720 } 1721 else { 1722 bSig |= 0x20000000; 1723 } 1724 shift32RightJamming( bSig, expDiff, &bSig ); 1725 zExp = aExp; 1726 } 1727 else if ( expDiff < 0 ) { 1728 if ( bExp == 0xFF ) { 1729 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1730 return packFloat32( zSign, 0xFF, 0 ); 1731 } 1732 if ( aExp == 0 ) { 1733 ++expDiff; 1734 } 1735 else { 1736 aSig |= 0x20000000; 1737 } 1738 shift32RightJamming( aSig, - expDiff, &aSig ); 1739 zExp = bExp; 1740 } 1741 else { 1742 if ( aExp == 0xFF ) { 1743 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1744 return a; 1745 } 1746 if ( aExp == 0 ) { 1747 if (STATUS(flush_to_zero)) { 1748 if (aSig | bSig) { 1749 float_raise(float_flag_output_denormal STATUS_VAR); 1750 } 1751 return packFloat32(zSign, 0, 0); 1752 } 1753 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1754 } 1755 zSig = 0x40000000 + aSig + bSig; 1756 zExp = aExp; 1757 goto roundAndPack; 1758 } 1759 aSig |= 0x20000000; 1760 zSig = ( aSig + bSig )<<1; 1761 --zExp; 1762 if ( (int32_t) zSig < 0 ) { 1763 zSig = aSig + bSig; 1764 ++zExp; 1765 } 1766 roundAndPack: 1767 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1768 1769 } 1770 1771 /*---------------------------------------------------------------------------- 1772 | Returns the result of subtracting the absolute values of the single- 1773 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1774 | difference is negated before being returned. `zSign' is ignored if the 1775 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1776 | Standard for Binary Floating-Point Arithmetic. 1777 *----------------------------------------------------------------------------*/ 1778 1779 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1780 { 1781 int16 aExp, bExp, zExp; 1782 uint32_t aSig, bSig, zSig; 1783 int16 expDiff; 1784 1785 aSig = extractFloat32Frac( a ); 1786 aExp = extractFloat32Exp( a ); 1787 bSig = extractFloat32Frac( b ); 1788 bExp = extractFloat32Exp( b ); 1789 expDiff = aExp - bExp; 1790 aSig <<= 7; 1791 bSig <<= 7; 1792 if ( 0 < expDiff ) goto aExpBigger; 1793 if ( expDiff < 0 ) goto bExpBigger; 1794 if ( aExp == 0xFF ) { 1795 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1796 float_raise( float_flag_invalid STATUS_VAR); 1797 return float32_default_nan; 1798 } 1799 if ( aExp == 0 ) { 1800 aExp = 1; 1801 bExp = 1; 1802 } 1803 if ( bSig < aSig ) goto aBigger; 1804 if ( aSig < bSig ) goto bBigger; 1805 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1806 bExpBigger: 1807 if ( bExp == 0xFF ) { 1808 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1809 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1810 } 1811 if ( aExp == 0 ) { 1812 ++expDiff; 1813 } 1814 else { 1815 aSig |= 0x40000000; 1816 } 1817 shift32RightJamming( aSig, - expDiff, &aSig ); 1818 bSig |= 0x40000000; 1819 bBigger: 1820 zSig = bSig - aSig; 1821 zExp = bExp; 1822 zSign ^= 1; 1823 goto normalizeRoundAndPack; 1824 aExpBigger: 1825 if ( aExp == 0xFF ) { 1826 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1827 return a; 1828 } 1829 if ( bExp == 0 ) { 1830 --expDiff; 1831 } 1832 else { 1833 bSig |= 0x40000000; 1834 } 1835 shift32RightJamming( bSig, expDiff, &bSig ); 1836 aSig |= 0x40000000; 1837 aBigger: 1838 zSig = aSig - bSig; 1839 zExp = aExp; 1840 normalizeRoundAndPack: 1841 --zExp; 1842 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1843 1844 } 1845 1846 /*---------------------------------------------------------------------------- 1847 | Returns the result of adding the single-precision floating-point values `a' 1848 | and `b'. The operation is performed according to the IEC/IEEE Standard for 1849 | Binary Floating-Point Arithmetic. 1850 *----------------------------------------------------------------------------*/ 1851 1852 float32 float32_add( float32 a, float32 b STATUS_PARAM ) 1853 { 1854 flag aSign, bSign; 1855 a = float32_squash_input_denormal(a STATUS_VAR); 1856 b = float32_squash_input_denormal(b STATUS_VAR); 1857 1858 aSign = extractFloat32Sign( a ); 1859 bSign = extractFloat32Sign( b ); 1860 if ( aSign == bSign ) { 1861 return addFloat32Sigs( a, b, aSign STATUS_VAR); 1862 } 1863 else { 1864 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1865 } 1866 1867 } 1868 1869 /*---------------------------------------------------------------------------- 1870 | Returns the result of subtracting the single-precision floating-point values 1871 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1872 | for Binary Floating-Point Arithmetic. 1873 *----------------------------------------------------------------------------*/ 1874 1875 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 1876 { 1877 flag aSign, bSign; 1878 a = float32_squash_input_denormal(a STATUS_VAR); 1879 b = float32_squash_input_denormal(b STATUS_VAR); 1880 1881 aSign = extractFloat32Sign( a ); 1882 bSign = extractFloat32Sign( b ); 1883 if ( aSign == bSign ) { 1884 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 1885 } 1886 else { 1887 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 1888 } 1889 1890 } 1891 1892 /*---------------------------------------------------------------------------- 1893 | Returns the result of multiplying the single-precision floating-point values 1894 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 1895 | for Binary Floating-Point Arithmetic. 1896 *----------------------------------------------------------------------------*/ 1897 1898 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 1899 { 1900 flag aSign, bSign, zSign; 1901 int16 aExp, bExp, zExp; 1902 uint32_t aSig, bSig; 1903 uint64_t zSig64; 1904 uint32_t zSig; 1905 1906 a = float32_squash_input_denormal(a STATUS_VAR); 1907 b = float32_squash_input_denormal(b STATUS_VAR); 1908 1909 aSig = extractFloat32Frac( a ); 1910 aExp = extractFloat32Exp( a ); 1911 aSign = extractFloat32Sign( a ); 1912 bSig = extractFloat32Frac( b ); 1913 bExp = extractFloat32Exp( b ); 1914 bSign = extractFloat32Sign( b ); 1915 zSign = aSign ^ bSign; 1916 if ( aExp == 0xFF ) { 1917 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 1918 return propagateFloat32NaN( a, b STATUS_VAR ); 1919 } 1920 if ( ( bExp | bSig ) == 0 ) { 1921 float_raise( float_flag_invalid STATUS_VAR); 1922 return float32_default_nan; 1923 } 1924 return packFloat32( zSign, 0xFF, 0 ); 1925 } 1926 if ( bExp == 0xFF ) { 1927 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1928 if ( ( aExp | aSig ) == 0 ) { 1929 float_raise( float_flag_invalid STATUS_VAR); 1930 return float32_default_nan; 1931 } 1932 return packFloat32( zSign, 0xFF, 0 ); 1933 } 1934 if ( aExp == 0 ) { 1935 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 1936 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1937 } 1938 if ( bExp == 0 ) { 1939 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 1940 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1941 } 1942 zExp = aExp + bExp - 0x7F; 1943 aSig = ( aSig | 0x00800000 )<<7; 1944 bSig = ( bSig | 0x00800000 )<<8; 1945 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 1946 zSig = zSig64; 1947 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 1948 zSig <<= 1; 1949 --zExp; 1950 } 1951 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1952 1953 } 1954 1955 /*---------------------------------------------------------------------------- 1956 | Returns the result of dividing the single-precision floating-point value `a' 1957 | by the corresponding value `b'. The operation is performed according to the 1958 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1959 *----------------------------------------------------------------------------*/ 1960 1961 float32 float32_div( float32 a, float32 b STATUS_PARAM ) 1962 { 1963 flag aSign, bSign, zSign; 1964 int16 aExp, bExp, zExp; 1965 uint32_t aSig, bSig, zSig; 1966 a = float32_squash_input_denormal(a STATUS_VAR); 1967 b = float32_squash_input_denormal(b STATUS_VAR); 1968 1969 aSig = extractFloat32Frac( a ); 1970 aExp = extractFloat32Exp( a ); 1971 aSign = extractFloat32Sign( a ); 1972 bSig = extractFloat32Frac( b ); 1973 bExp = extractFloat32Exp( b ); 1974 bSign = extractFloat32Sign( b ); 1975 zSign = aSign ^ bSign; 1976 if ( aExp == 0xFF ) { 1977 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1978 if ( bExp == 0xFF ) { 1979 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1980 float_raise( float_flag_invalid STATUS_VAR); 1981 return float32_default_nan; 1982 } 1983 return packFloat32( zSign, 0xFF, 0 ); 1984 } 1985 if ( bExp == 0xFF ) { 1986 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1987 return packFloat32( zSign, 0, 0 ); 1988 } 1989 if ( bExp == 0 ) { 1990 if ( bSig == 0 ) { 1991 if ( ( aExp | aSig ) == 0 ) { 1992 float_raise( float_flag_invalid STATUS_VAR); 1993 return float32_default_nan; 1994 } 1995 float_raise( float_flag_divbyzero STATUS_VAR); 1996 return packFloat32( zSign, 0xFF, 0 ); 1997 } 1998 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 1999 } 2000 if ( aExp == 0 ) { 2001 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2002 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2003 } 2004 zExp = aExp - bExp + 0x7D; 2005 aSig = ( aSig | 0x00800000 )<<7; 2006 bSig = ( bSig | 0x00800000 )<<8; 2007 if ( bSig <= ( aSig + aSig ) ) { 2008 aSig >>= 1; 2009 ++zExp; 2010 } 2011 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2012 if ( ( zSig & 0x3F ) == 0 ) { 2013 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2014 } 2015 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 2016 2017 } 2018 2019 /*---------------------------------------------------------------------------- 2020 | Returns the remainder of the single-precision floating-point value `a' 2021 | with respect to the corresponding value `b'. The operation is performed 2022 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2023 *----------------------------------------------------------------------------*/ 2024 2025 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 2026 { 2027 flag aSign, zSign; 2028 int16 aExp, bExp, expDiff; 2029 uint32_t aSig, bSig; 2030 uint32_t q; 2031 uint64_t aSig64, bSig64, q64; 2032 uint32_t alternateASig; 2033 int32_t sigMean; 2034 a = float32_squash_input_denormal(a STATUS_VAR); 2035 b = float32_squash_input_denormal(b STATUS_VAR); 2036 2037 aSig = extractFloat32Frac( a ); 2038 aExp = extractFloat32Exp( a ); 2039 aSign = extractFloat32Sign( a ); 2040 bSig = extractFloat32Frac( b ); 2041 bExp = extractFloat32Exp( b ); 2042 if ( aExp == 0xFF ) { 2043 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2044 return propagateFloat32NaN( a, b STATUS_VAR ); 2045 } 2046 float_raise( float_flag_invalid STATUS_VAR); 2047 return float32_default_nan; 2048 } 2049 if ( bExp == 0xFF ) { 2050 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2051 return a; 2052 } 2053 if ( bExp == 0 ) { 2054 if ( bSig == 0 ) { 2055 float_raise( float_flag_invalid STATUS_VAR); 2056 return float32_default_nan; 2057 } 2058 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2059 } 2060 if ( aExp == 0 ) { 2061 if ( aSig == 0 ) return a; 2062 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2063 } 2064 expDiff = aExp - bExp; 2065 aSig |= 0x00800000; 2066 bSig |= 0x00800000; 2067 if ( expDiff < 32 ) { 2068 aSig <<= 8; 2069 bSig <<= 8; 2070 if ( expDiff < 0 ) { 2071 if ( expDiff < -1 ) return a; 2072 aSig >>= 1; 2073 } 2074 q = ( bSig <= aSig ); 2075 if ( q ) aSig -= bSig; 2076 if ( 0 < expDiff ) { 2077 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2078 q >>= 32 - expDiff; 2079 bSig >>= 2; 2080 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2081 } 2082 else { 2083 aSig >>= 2; 2084 bSig >>= 2; 2085 } 2086 } 2087 else { 2088 if ( bSig <= aSig ) aSig -= bSig; 2089 aSig64 = ( (uint64_t) aSig )<<40; 2090 bSig64 = ( (uint64_t) bSig )<<40; 2091 expDiff -= 64; 2092 while ( 0 < expDiff ) { 2093 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2094 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2095 aSig64 = - ( ( bSig * q64 )<<38 ); 2096 expDiff -= 62; 2097 } 2098 expDiff += 64; 2099 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2100 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2101 q = q64>>( 64 - expDiff ); 2102 bSig <<= 6; 2103 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2104 } 2105 do { 2106 alternateASig = aSig; 2107 ++q; 2108 aSig -= bSig; 2109 } while ( 0 <= (int32_t) aSig ); 2110 sigMean = aSig + alternateASig; 2111 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2112 aSig = alternateASig; 2113 } 2114 zSign = ( (int32_t) aSig < 0 ); 2115 if ( zSign ) aSig = - aSig; 2116 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2117 2118 } 2119 2120 /*---------------------------------------------------------------------------- 2121 | Returns the square root of the single-precision floating-point value `a'. 2122 | The operation is performed according to the IEC/IEEE Standard for Binary 2123 | Floating-Point Arithmetic. 2124 *----------------------------------------------------------------------------*/ 2125 2126 float32 float32_sqrt( float32 a STATUS_PARAM ) 2127 { 2128 flag aSign; 2129 int16 aExp, zExp; 2130 uint32_t aSig, zSig; 2131 uint64_t rem, term; 2132 a = float32_squash_input_denormal(a STATUS_VAR); 2133 2134 aSig = extractFloat32Frac( a ); 2135 aExp = extractFloat32Exp( a ); 2136 aSign = extractFloat32Sign( a ); 2137 if ( aExp == 0xFF ) { 2138 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2139 if ( ! aSign ) return a; 2140 float_raise( float_flag_invalid STATUS_VAR); 2141 return float32_default_nan; 2142 } 2143 if ( aSign ) { 2144 if ( ( aExp | aSig ) == 0 ) return a; 2145 float_raise( float_flag_invalid STATUS_VAR); 2146 return float32_default_nan; 2147 } 2148 if ( aExp == 0 ) { 2149 if ( aSig == 0 ) return float32_zero; 2150 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2151 } 2152 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2153 aSig = ( aSig | 0x00800000 )<<8; 2154 zSig = estimateSqrt32( aExp, aSig ) + 2; 2155 if ( ( zSig & 0x7F ) <= 5 ) { 2156 if ( zSig < 2 ) { 2157 zSig = 0x7FFFFFFF; 2158 goto roundAndPack; 2159 } 2160 aSig >>= aExp & 1; 2161 term = ( (uint64_t) zSig ) * zSig; 2162 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2163 while ( (int64_t) rem < 0 ) { 2164 --zSig; 2165 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2166 } 2167 zSig |= ( rem != 0 ); 2168 } 2169 shift32RightJamming( zSig, 1, &zSig ); 2170 roundAndPack: 2171 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2172 2173 } 2174 2175 /*---------------------------------------------------------------------------- 2176 | Returns the binary exponential of the single-precision floating-point value 2177 | `a'. The operation is performed according to the IEC/IEEE Standard for 2178 | Binary Floating-Point Arithmetic. 2179 | 2180 | Uses the following identities: 2181 | 2182 | 1. ------------------------------------------------------------------------- 2183 | x x*ln(2) 2184 | 2 = e 2185 | 2186 | 2. ------------------------------------------------------------------------- 2187 | 2 3 4 5 n 2188 | x x x x x x x 2189 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2190 | 1! 2! 3! 4! 5! n! 2191 *----------------------------------------------------------------------------*/ 2192 2193 static const float64 float32_exp2_coefficients[15] = 2194 { 2195 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2196 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2197 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2198 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2199 const_float64( 0x3f81111111111111ll ), /* 5 */ 2200 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2201 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2202 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2203 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2204 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2205 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2206 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2207 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2208 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2209 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2210 }; 2211 2212 float32 float32_exp2( float32 a STATUS_PARAM ) 2213 { 2214 flag aSign; 2215 int16 aExp; 2216 uint32_t aSig; 2217 float64 r, x, xn; 2218 int i; 2219 a = float32_squash_input_denormal(a STATUS_VAR); 2220 2221 aSig = extractFloat32Frac( a ); 2222 aExp = extractFloat32Exp( a ); 2223 aSign = extractFloat32Sign( a ); 2224 2225 if ( aExp == 0xFF) { 2226 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2227 return (aSign) ? float32_zero : a; 2228 } 2229 if (aExp == 0) { 2230 if (aSig == 0) return float32_one; 2231 } 2232 2233 float_raise( float_flag_inexact STATUS_VAR); 2234 2235 /* ******************************* */ 2236 /* using float64 for approximation */ 2237 /* ******************************* */ 2238 x = float32_to_float64(a STATUS_VAR); 2239 x = float64_mul(x, float64_ln2 STATUS_VAR); 2240 2241 xn = x; 2242 r = float64_one; 2243 for (i = 0 ; i < 15 ; i++) { 2244 float64 f; 2245 2246 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); 2247 r = float64_add(r, f STATUS_VAR); 2248 2249 xn = float64_mul(xn, x STATUS_VAR); 2250 } 2251 2252 return float64_to_float32(r, status); 2253 } 2254 2255 /*---------------------------------------------------------------------------- 2256 | Returns the binary log of the single-precision floating-point value `a'. 2257 | The operation is performed according to the IEC/IEEE Standard for Binary 2258 | Floating-Point Arithmetic. 2259 *----------------------------------------------------------------------------*/ 2260 float32 float32_log2( float32 a STATUS_PARAM ) 2261 { 2262 flag aSign, zSign; 2263 int16 aExp; 2264 uint32_t aSig, zSig, i; 2265 2266 a = float32_squash_input_denormal(a STATUS_VAR); 2267 aSig = extractFloat32Frac( a ); 2268 aExp = extractFloat32Exp( a ); 2269 aSign = extractFloat32Sign( a ); 2270 2271 if ( aExp == 0 ) { 2272 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2273 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2274 } 2275 if ( aSign ) { 2276 float_raise( float_flag_invalid STATUS_VAR); 2277 return float32_default_nan; 2278 } 2279 if ( aExp == 0xFF ) { 2280 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2281 return a; 2282 } 2283 2284 aExp -= 0x7F; 2285 aSig |= 0x00800000; 2286 zSign = aExp < 0; 2287 zSig = aExp << 23; 2288 2289 for (i = 1 << 22; i > 0; i >>= 1) { 2290 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2291 if ( aSig & 0x01000000 ) { 2292 aSig >>= 1; 2293 zSig |= i; 2294 } 2295 } 2296 2297 if ( zSign ) 2298 zSig = -zSig; 2299 2300 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); 2301 } 2302 2303 /*---------------------------------------------------------------------------- 2304 | Returns 1 if the single-precision floating-point value `a' is equal to 2305 | the corresponding value `b', and 0 otherwise. The invalid exception is 2306 | raised if either operand is a NaN. Otherwise, the comparison is performed 2307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2308 *----------------------------------------------------------------------------*/ 2309 2310 int float32_eq( float32 a, float32 b STATUS_PARAM ) 2311 { 2312 uint32_t av, bv; 2313 a = float32_squash_input_denormal(a STATUS_VAR); 2314 b = float32_squash_input_denormal(b STATUS_VAR); 2315 2316 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2317 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2318 ) { 2319 float_raise( float_flag_invalid STATUS_VAR); 2320 return 0; 2321 } 2322 av = float32_val(a); 2323 bv = float32_val(b); 2324 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2325 } 2326 2327 /*---------------------------------------------------------------------------- 2328 | Returns 1 if the single-precision floating-point value `a' is less than 2329 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2330 | exception is raised if either operand is a NaN. The comparison is performed 2331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2332 *----------------------------------------------------------------------------*/ 2333 2334 int float32_le( float32 a, float32 b STATUS_PARAM ) 2335 { 2336 flag aSign, bSign; 2337 uint32_t av, bv; 2338 a = float32_squash_input_denormal(a STATUS_VAR); 2339 b = float32_squash_input_denormal(b STATUS_VAR); 2340 2341 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2342 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2343 ) { 2344 float_raise( float_flag_invalid STATUS_VAR); 2345 return 0; 2346 } 2347 aSign = extractFloat32Sign( a ); 2348 bSign = extractFloat32Sign( b ); 2349 av = float32_val(a); 2350 bv = float32_val(b); 2351 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2352 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2353 2354 } 2355 2356 /*---------------------------------------------------------------------------- 2357 | Returns 1 if the single-precision floating-point value `a' is less than 2358 | the corresponding value `b', and 0 otherwise. The invalid exception is 2359 | raised if either operand is a NaN. The comparison is performed according 2360 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2361 *----------------------------------------------------------------------------*/ 2362 2363 int float32_lt( float32 a, float32 b STATUS_PARAM ) 2364 { 2365 flag aSign, bSign; 2366 uint32_t av, bv; 2367 a = float32_squash_input_denormal(a STATUS_VAR); 2368 b = float32_squash_input_denormal(b STATUS_VAR); 2369 2370 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2371 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2372 ) { 2373 float_raise( float_flag_invalid STATUS_VAR); 2374 return 0; 2375 } 2376 aSign = extractFloat32Sign( a ); 2377 bSign = extractFloat32Sign( b ); 2378 av = float32_val(a); 2379 bv = float32_val(b); 2380 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2381 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2382 2383 } 2384 2385 /*---------------------------------------------------------------------------- 2386 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2387 | be compared, and 0 otherwise. The invalid exception is raised if either 2388 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2389 | Standard for Binary Floating-Point Arithmetic. 2390 *----------------------------------------------------------------------------*/ 2391 2392 int float32_unordered( float32 a, float32 b STATUS_PARAM ) 2393 { 2394 a = float32_squash_input_denormal(a STATUS_VAR); 2395 b = float32_squash_input_denormal(b STATUS_VAR); 2396 2397 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2398 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2399 ) { 2400 float_raise( float_flag_invalid STATUS_VAR); 2401 return 1; 2402 } 2403 return 0; 2404 } 2405 2406 /*---------------------------------------------------------------------------- 2407 | Returns 1 if the single-precision floating-point value `a' is equal to 2408 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2409 | exception. The comparison is performed according to the IEC/IEEE Standard 2410 | for Binary Floating-Point Arithmetic. 2411 *----------------------------------------------------------------------------*/ 2412 2413 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM ) 2414 { 2415 a = float32_squash_input_denormal(a STATUS_VAR); 2416 b = float32_squash_input_denormal(b STATUS_VAR); 2417 2418 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2419 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2420 ) { 2421 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2422 float_raise( float_flag_invalid STATUS_VAR); 2423 } 2424 return 0; 2425 } 2426 return ( float32_val(a) == float32_val(b) ) || 2427 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2428 } 2429 2430 /*---------------------------------------------------------------------------- 2431 | Returns 1 if the single-precision floating-point value `a' is less than or 2432 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2433 | cause an exception. Otherwise, the comparison is performed according to the 2434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2435 *----------------------------------------------------------------------------*/ 2436 2437 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2438 { 2439 flag aSign, bSign; 2440 uint32_t av, bv; 2441 a = float32_squash_input_denormal(a STATUS_VAR); 2442 b = float32_squash_input_denormal(b STATUS_VAR); 2443 2444 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2445 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2446 ) { 2447 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2448 float_raise( float_flag_invalid STATUS_VAR); 2449 } 2450 return 0; 2451 } 2452 aSign = extractFloat32Sign( a ); 2453 bSign = extractFloat32Sign( b ); 2454 av = float32_val(a); 2455 bv = float32_val(b); 2456 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2457 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2458 2459 } 2460 2461 /*---------------------------------------------------------------------------- 2462 | Returns 1 if the single-precision floating-point value `a' is less than 2463 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2464 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2465 | Standard for Binary Floating-Point Arithmetic. 2466 *----------------------------------------------------------------------------*/ 2467 2468 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2469 { 2470 flag aSign, bSign; 2471 uint32_t av, bv; 2472 a = float32_squash_input_denormal(a STATUS_VAR); 2473 b = float32_squash_input_denormal(b STATUS_VAR); 2474 2475 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2476 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2477 ) { 2478 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2479 float_raise( float_flag_invalid STATUS_VAR); 2480 } 2481 return 0; 2482 } 2483 aSign = extractFloat32Sign( a ); 2484 bSign = extractFloat32Sign( b ); 2485 av = float32_val(a); 2486 bv = float32_val(b); 2487 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2488 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2489 2490 } 2491 2492 /*---------------------------------------------------------------------------- 2493 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2494 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 2495 | comparison is performed according to the IEC/IEEE Standard for Binary 2496 | Floating-Point Arithmetic. 2497 *----------------------------------------------------------------------------*/ 2498 2499 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM ) 2500 { 2501 a = float32_squash_input_denormal(a STATUS_VAR); 2502 b = float32_squash_input_denormal(b STATUS_VAR); 2503 2504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2506 ) { 2507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2508 float_raise( float_flag_invalid STATUS_VAR); 2509 } 2510 return 1; 2511 } 2512 return 0; 2513 } 2514 2515 /*---------------------------------------------------------------------------- 2516 | Returns the result of converting the double-precision floating-point value 2517 | `a' to the 32-bit two's complement integer format. The conversion is 2518 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2519 | Arithmetic---which means in particular that the conversion is rounded 2520 | according to the current rounding mode. If `a' is a NaN, the largest 2521 | positive integer is returned. Otherwise, if the conversion overflows, the 2522 | largest integer with the same sign as `a' is returned. 2523 *----------------------------------------------------------------------------*/ 2524 2525 int32 float64_to_int32( float64 a STATUS_PARAM ) 2526 { 2527 flag aSign; 2528 int16 aExp, shiftCount; 2529 uint64_t aSig; 2530 a = float64_squash_input_denormal(a STATUS_VAR); 2531 2532 aSig = extractFloat64Frac( a ); 2533 aExp = extractFloat64Exp( a ); 2534 aSign = extractFloat64Sign( a ); 2535 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2536 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2537 shiftCount = 0x42C - aExp; 2538 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2539 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2540 2541 } 2542 2543 /*---------------------------------------------------------------------------- 2544 | Returns the result of converting the double-precision floating-point value 2545 | `a' to the 32-bit two's complement integer format. The conversion is 2546 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2547 | Arithmetic, except that the conversion is always rounded toward zero. 2548 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2549 | the conversion overflows, the largest integer with the same sign as `a' is 2550 | returned. 2551 *----------------------------------------------------------------------------*/ 2552 2553 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2554 { 2555 flag aSign; 2556 int16 aExp, shiftCount; 2557 uint64_t aSig, savedASig; 2558 int32 z; 2559 a = float64_squash_input_denormal(a STATUS_VAR); 2560 2561 aSig = extractFloat64Frac( a ); 2562 aExp = extractFloat64Exp( a ); 2563 aSign = extractFloat64Sign( a ); 2564 if ( 0x41E < aExp ) { 2565 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2566 goto invalid; 2567 } 2568 else if ( aExp < 0x3FF ) { 2569 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2570 return 0; 2571 } 2572 aSig |= LIT64( 0x0010000000000000 ); 2573 shiftCount = 0x433 - aExp; 2574 savedASig = aSig; 2575 aSig >>= shiftCount; 2576 z = aSig; 2577 if ( aSign ) z = - z; 2578 if ( ( z < 0 ) ^ aSign ) { 2579 invalid: 2580 float_raise( float_flag_invalid STATUS_VAR); 2581 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2582 } 2583 if ( ( aSig<<shiftCount ) != savedASig ) { 2584 STATUS(float_exception_flags) |= float_flag_inexact; 2585 } 2586 return z; 2587 2588 } 2589 2590 /*---------------------------------------------------------------------------- 2591 | Returns the result of converting the double-precision floating-point value 2592 | `a' to the 16-bit two's complement integer format. The conversion is 2593 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2594 | Arithmetic, except that the conversion is always rounded toward zero. 2595 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2596 | the conversion overflows, the largest integer with the same sign as `a' is 2597 | returned. 2598 *----------------------------------------------------------------------------*/ 2599 2600 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM ) 2601 { 2602 flag aSign; 2603 int16 aExp, shiftCount; 2604 uint64_t aSig, savedASig; 2605 int32 z; 2606 2607 aSig = extractFloat64Frac( a ); 2608 aExp = extractFloat64Exp( a ); 2609 aSign = extractFloat64Sign( a ); 2610 if ( 0x40E < aExp ) { 2611 if ( ( aExp == 0x7FF ) && aSig ) { 2612 aSign = 0; 2613 } 2614 goto invalid; 2615 } 2616 else if ( aExp < 0x3FF ) { 2617 if ( aExp || aSig ) { 2618 STATUS(float_exception_flags) |= float_flag_inexact; 2619 } 2620 return 0; 2621 } 2622 aSig |= LIT64( 0x0010000000000000 ); 2623 shiftCount = 0x433 - aExp; 2624 savedASig = aSig; 2625 aSig >>= shiftCount; 2626 z = aSig; 2627 if ( aSign ) { 2628 z = - z; 2629 } 2630 if ( ( (int16_t)z < 0 ) ^ aSign ) { 2631 invalid: 2632 float_raise( float_flag_invalid STATUS_VAR); 2633 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 2634 } 2635 if ( ( aSig<<shiftCount ) != savedASig ) { 2636 STATUS(float_exception_flags) |= float_flag_inexact; 2637 } 2638 return z; 2639 } 2640 2641 /*---------------------------------------------------------------------------- 2642 | Returns the result of converting the double-precision floating-point value 2643 | `a' to the 64-bit two's complement integer format. The conversion is 2644 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2645 | Arithmetic---which means in particular that the conversion is rounded 2646 | according to the current rounding mode. If `a' is a NaN, the largest 2647 | positive integer is returned. Otherwise, if the conversion overflows, the 2648 | largest integer with the same sign as `a' is returned. 2649 *----------------------------------------------------------------------------*/ 2650 2651 int64 float64_to_int64( float64 a STATUS_PARAM ) 2652 { 2653 flag aSign; 2654 int16 aExp, shiftCount; 2655 uint64_t aSig, aSigExtra; 2656 a = float64_squash_input_denormal(a STATUS_VAR); 2657 2658 aSig = extractFloat64Frac( a ); 2659 aExp = extractFloat64Exp( a ); 2660 aSign = extractFloat64Sign( a ); 2661 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2662 shiftCount = 0x433 - aExp; 2663 if ( shiftCount <= 0 ) { 2664 if ( 0x43E < aExp ) { 2665 float_raise( float_flag_invalid STATUS_VAR); 2666 if ( ! aSign 2667 || ( ( aExp == 0x7FF ) 2668 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2669 ) { 2670 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2671 } 2672 return (int64_t) LIT64( 0x8000000000000000 ); 2673 } 2674 aSigExtra = 0; 2675 aSig <<= - shiftCount; 2676 } 2677 else { 2678 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 2679 } 2680 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 2681 2682 } 2683 2684 /*---------------------------------------------------------------------------- 2685 | Returns the result of converting the double-precision floating-point value 2686 | `a' to the 64-bit two's complement integer format. The conversion is 2687 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2688 | Arithmetic, except that the conversion is always rounded toward zero. 2689 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2690 | the conversion overflows, the largest integer with the same sign as `a' is 2691 | returned. 2692 *----------------------------------------------------------------------------*/ 2693 2694 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 2695 { 2696 flag aSign; 2697 int16 aExp, shiftCount; 2698 uint64_t aSig; 2699 int64 z; 2700 a = float64_squash_input_denormal(a STATUS_VAR); 2701 2702 aSig = extractFloat64Frac( a ); 2703 aExp = extractFloat64Exp( a ); 2704 aSign = extractFloat64Sign( a ); 2705 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2706 shiftCount = aExp - 0x433; 2707 if ( 0 <= shiftCount ) { 2708 if ( 0x43E <= aExp ) { 2709 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 2710 float_raise( float_flag_invalid STATUS_VAR); 2711 if ( ! aSign 2712 || ( ( aExp == 0x7FF ) 2713 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 2714 ) { 2715 return LIT64( 0x7FFFFFFFFFFFFFFF ); 2716 } 2717 } 2718 return (int64_t) LIT64( 0x8000000000000000 ); 2719 } 2720 z = aSig<<shiftCount; 2721 } 2722 else { 2723 if ( aExp < 0x3FE ) { 2724 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2725 return 0; 2726 } 2727 z = aSig>>( - shiftCount ); 2728 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 2729 STATUS(float_exception_flags) |= float_flag_inexact; 2730 } 2731 } 2732 if ( aSign ) z = - z; 2733 return z; 2734 2735 } 2736 2737 /*---------------------------------------------------------------------------- 2738 | Returns the result of converting the double-precision floating-point value 2739 | `a' to the single-precision floating-point format. The conversion is 2740 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2741 | Arithmetic. 2742 *----------------------------------------------------------------------------*/ 2743 2744 float32 float64_to_float32( float64 a STATUS_PARAM ) 2745 { 2746 flag aSign; 2747 int16 aExp; 2748 uint64_t aSig; 2749 uint32_t zSig; 2750 a = float64_squash_input_denormal(a STATUS_VAR); 2751 2752 aSig = extractFloat64Frac( a ); 2753 aExp = extractFloat64Exp( a ); 2754 aSign = extractFloat64Sign( a ); 2755 if ( aExp == 0x7FF ) { 2756 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2757 return packFloat32( aSign, 0xFF, 0 ); 2758 } 2759 shift64RightJamming( aSig, 22, &aSig ); 2760 zSig = aSig; 2761 if ( aExp || zSig ) { 2762 zSig |= 0x40000000; 2763 aExp -= 0x381; 2764 } 2765 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 2766 2767 } 2768 2769 2770 /*---------------------------------------------------------------------------- 2771 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2772 | half-precision floating-point value, returning the result. After being 2773 | shifted into the proper positions, the three fields are simply added 2774 | together to form the result. This means that any integer portion of `zSig' 2775 | will be added into the exponent. Since a properly normalized significand 2776 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2777 | than the desired result exponent whenever `zSig' is a complete, normalized 2778 | significand. 2779 *----------------------------------------------------------------------------*/ 2780 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig) 2781 { 2782 return make_float16( 2783 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 2784 } 2785 2786 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 2787 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 2788 2789 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM) 2790 { 2791 flag aSign; 2792 int16 aExp; 2793 uint32_t aSig; 2794 2795 aSign = extractFloat16Sign(a); 2796 aExp = extractFloat16Exp(a); 2797 aSig = extractFloat16Frac(a); 2798 2799 if (aExp == 0x1f && ieee) { 2800 if (aSig) { 2801 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR); 2802 } 2803 return packFloat32(aSign, 0xff, aSig << 13); 2804 } 2805 if (aExp == 0) { 2806 int8 shiftCount; 2807 2808 if (aSig == 0) { 2809 return packFloat32(aSign, 0, 0); 2810 } 2811 2812 shiftCount = countLeadingZeros32( aSig ) - 21; 2813 aSig = aSig << shiftCount; 2814 aExp = -shiftCount; 2815 } 2816 return packFloat32( aSign, aExp + 0x70, aSig << 13); 2817 } 2818 2819 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM) 2820 { 2821 flag aSign; 2822 int16 aExp; 2823 uint32_t aSig; 2824 uint32_t mask; 2825 uint32_t increment; 2826 int8 roundingMode; 2827 a = float32_squash_input_denormal(a STATUS_VAR); 2828 2829 aSig = extractFloat32Frac( a ); 2830 aExp = extractFloat32Exp( a ); 2831 aSign = extractFloat32Sign( a ); 2832 if ( aExp == 0xFF ) { 2833 if (aSig) { 2834 /* Input is a NaN */ 2835 float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2836 if (!ieee) { 2837 return packFloat16(aSign, 0, 0); 2838 } 2839 return r; 2840 } 2841 /* Infinity */ 2842 if (!ieee) { 2843 float_raise(float_flag_invalid STATUS_VAR); 2844 return packFloat16(aSign, 0x1f, 0x3ff); 2845 } 2846 return packFloat16(aSign, 0x1f, 0); 2847 } 2848 if (aExp == 0 && aSig == 0) { 2849 return packFloat16(aSign, 0, 0); 2850 } 2851 /* Decimal point between bits 22 and 23. */ 2852 aSig |= 0x00800000; 2853 aExp -= 0x7f; 2854 if (aExp < -14) { 2855 mask = 0x00ffffff; 2856 if (aExp >= -24) { 2857 mask >>= 25 + aExp; 2858 } 2859 } else { 2860 mask = 0x00001fff; 2861 } 2862 if (aSig & mask) { 2863 float_raise( float_flag_underflow STATUS_VAR ); 2864 roundingMode = STATUS(float_rounding_mode); 2865 switch (roundingMode) { 2866 case float_round_nearest_even: 2867 increment = (mask + 1) >> 1; 2868 if ((aSig & mask) == increment) { 2869 increment = aSig & (increment << 1); 2870 } 2871 break; 2872 case float_round_up: 2873 increment = aSign ? 0 : mask; 2874 break; 2875 case float_round_down: 2876 increment = aSign ? mask : 0; 2877 break; 2878 default: /* round_to_zero */ 2879 increment = 0; 2880 break; 2881 } 2882 aSig += increment; 2883 if (aSig >= 0x01000000) { 2884 aSig >>= 1; 2885 aExp++; 2886 } 2887 } else if (aExp < -14 2888 && STATUS(float_detect_tininess) == float_tininess_before_rounding) { 2889 float_raise( float_flag_underflow STATUS_VAR); 2890 } 2891 2892 if (ieee) { 2893 if (aExp > 15) { 2894 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 2895 return packFloat16(aSign, 0x1f, 0); 2896 } 2897 } else { 2898 if (aExp > 16) { 2899 float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR); 2900 return packFloat16(aSign, 0x1f, 0x3ff); 2901 } 2902 } 2903 if (aExp < -24) { 2904 return packFloat16(aSign, 0, 0); 2905 } 2906 if (aExp < -14) { 2907 aSig >>= -14 - aExp; 2908 aExp = -14; 2909 } 2910 return packFloat16(aSign, aExp + 14, aSig >> 13); 2911 } 2912 2913 /*---------------------------------------------------------------------------- 2914 | Returns the result of converting the double-precision floating-point value 2915 | `a' to the extended double-precision floating-point format. The conversion 2916 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2917 | Arithmetic. 2918 *----------------------------------------------------------------------------*/ 2919 2920 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 2921 { 2922 flag aSign; 2923 int16 aExp; 2924 uint64_t aSig; 2925 2926 a = float64_squash_input_denormal(a STATUS_VAR); 2927 aSig = extractFloat64Frac( a ); 2928 aExp = extractFloat64Exp( a ); 2929 aSign = extractFloat64Sign( a ); 2930 if ( aExp == 0x7FF ) { 2931 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2932 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2933 } 2934 if ( aExp == 0 ) { 2935 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 2936 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2937 } 2938 return 2939 packFloatx80( 2940 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 2941 2942 } 2943 2944 /*---------------------------------------------------------------------------- 2945 | Returns the result of converting the double-precision floating-point value 2946 | `a' to the quadruple-precision floating-point format. The conversion is 2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2948 | Arithmetic. 2949 *----------------------------------------------------------------------------*/ 2950 2951 float128 float64_to_float128( float64 a STATUS_PARAM ) 2952 { 2953 flag aSign; 2954 int16 aExp; 2955 uint64_t aSig, zSig0, zSig1; 2956 2957 a = float64_squash_input_denormal(a STATUS_VAR); 2958 aSig = extractFloat64Frac( a ); 2959 aExp = extractFloat64Exp( a ); 2960 aSign = extractFloat64Sign( a ); 2961 if ( aExp == 0x7FF ) { 2962 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 2963 return packFloat128( aSign, 0x7FFF, 0, 0 ); 2964 } 2965 if ( aExp == 0 ) { 2966 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 2967 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 2968 --aExp; 2969 } 2970 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 2971 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 2972 2973 } 2974 2975 /*---------------------------------------------------------------------------- 2976 | Rounds the double-precision floating-point value `a' to an integer, and 2977 | returns the result as a double-precision floating-point value. The 2978 | operation is performed according to the IEC/IEEE Standard for Binary 2979 | Floating-Point Arithmetic. 2980 *----------------------------------------------------------------------------*/ 2981 2982 float64 float64_round_to_int( float64 a STATUS_PARAM ) 2983 { 2984 flag aSign; 2985 int16 aExp; 2986 uint64_t lastBitMask, roundBitsMask; 2987 int8 roundingMode; 2988 uint64_t z; 2989 a = float64_squash_input_denormal(a STATUS_VAR); 2990 2991 aExp = extractFloat64Exp( a ); 2992 if ( 0x433 <= aExp ) { 2993 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 2994 return propagateFloat64NaN( a, a STATUS_VAR ); 2995 } 2996 return a; 2997 } 2998 if ( aExp < 0x3FF ) { 2999 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3000 STATUS(float_exception_flags) |= float_flag_inexact; 3001 aSign = extractFloat64Sign( a ); 3002 switch ( STATUS(float_rounding_mode) ) { 3003 case float_round_nearest_even: 3004 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3005 return packFloat64( aSign, 0x3FF, 0 ); 3006 } 3007 break; 3008 case float_round_down: 3009 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3010 case float_round_up: 3011 return make_float64( 3012 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3013 } 3014 return packFloat64( aSign, 0, 0 ); 3015 } 3016 lastBitMask = 1; 3017 lastBitMask <<= 0x433 - aExp; 3018 roundBitsMask = lastBitMask - 1; 3019 z = float64_val(a); 3020 roundingMode = STATUS(float_rounding_mode); 3021 if ( roundingMode == float_round_nearest_even ) { 3022 z += lastBitMask>>1; 3023 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; 3024 } 3025 else if ( roundingMode != float_round_to_zero ) { 3026 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { 3027 z += roundBitsMask; 3028 } 3029 } 3030 z &= ~ roundBitsMask; 3031 if ( z != float64_val(a) ) 3032 STATUS(float_exception_flags) |= float_flag_inexact; 3033 return make_float64(z); 3034 3035 } 3036 3037 float64 float64_trunc_to_int( float64 a STATUS_PARAM) 3038 { 3039 int oldmode; 3040 float64 res; 3041 oldmode = STATUS(float_rounding_mode); 3042 STATUS(float_rounding_mode) = float_round_to_zero; 3043 res = float64_round_to_int(a STATUS_VAR); 3044 STATUS(float_rounding_mode) = oldmode; 3045 return res; 3046 } 3047 3048 /*---------------------------------------------------------------------------- 3049 | Returns the result of adding the absolute values of the double-precision 3050 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3051 | before being returned. `zSign' is ignored if the result is a NaN. 3052 | The addition is performed according to the IEC/IEEE Standard for Binary 3053 | Floating-Point Arithmetic. 3054 *----------------------------------------------------------------------------*/ 3055 3056 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3057 { 3058 int16 aExp, bExp, zExp; 3059 uint64_t aSig, bSig, zSig; 3060 int16 expDiff; 3061 3062 aSig = extractFloat64Frac( a ); 3063 aExp = extractFloat64Exp( a ); 3064 bSig = extractFloat64Frac( b ); 3065 bExp = extractFloat64Exp( b ); 3066 expDiff = aExp - bExp; 3067 aSig <<= 9; 3068 bSig <<= 9; 3069 if ( 0 < expDiff ) { 3070 if ( aExp == 0x7FF ) { 3071 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3072 return a; 3073 } 3074 if ( bExp == 0 ) { 3075 --expDiff; 3076 } 3077 else { 3078 bSig |= LIT64( 0x2000000000000000 ); 3079 } 3080 shift64RightJamming( bSig, expDiff, &bSig ); 3081 zExp = aExp; 3082 } 3083 else if ( expDiff < 0 ) { 3084 if ( bExp == 0x7FF ) { 3085 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3086 return packFloat64( zSign, 0x7FF, 0 ); 3087 } 3088 if ( aExp == 0 ) { 3089 ++expDiff; 3090 } 3091 else { 3092 aSig |= LIT64( 0x2000000000000000 ); 3093 } 3094 shift64RightJamming( aSig, - expDiff, &aSig ); 3095 zExp = bExp; 3096 } 3097 else { 3098 if ( aExp == 0x7FF ) { 3099 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3100 return a; 3101 } 3102 if ( aExp == 0 ) { 3103 if (STATUS(flush_to_zero)) { 3104 if (aSig | bSig) { 3105 float_raise(float_flag_output_denormal STATUS_VAR); 3106 } 3107 return packFloat64(zSign, 0, 0); 3108 } 3109 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3110 } 3111 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3112 zExp = aExp; 3113 goto roundAndPack; 3114 } 3115 aSig |= LIT64( 0x2000000000000000 ); 3116 zSig = ( aSig + bSig )<<1; 3117 --zExp; 3118 if ( (int64_t) zSig < 0 ) { 3119 zSig = aSig + bSig; 3120 ++zExp; 3121 } 3122 roundAndPack: 3123 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3124 3125 } 3126 3127 /*---------------------------------------------------------------------------- 3128 | Returns the result of subtracting the absolute values of the double- 3129 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3130 | difference is negated before being returned. `zSign' is ignored if the 3131 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3132 | Standard for Binary Floating-Point Arithmetic. 3133 *----------------------------------------------------------------------------*/ 3134 3135 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3136 { 3137 int16 aExp, bExp, zExp; 3138 uint64_t aSig, bSig, zSig; 3139 int16 expDiff; 3140 3141 aSig = extractFloat64Frac( a ); 3142 aExp = extractFloat64Exp( a ); 3143 bSig = extractFloat64Frac( b ); 3144 bExp = extractFloat64Exp( b ); 3145 expDiff = aExp - bExp; 3146 aSig <<= 10; 3147 bSig <<= 10; 3148 if ( 0 < expDiff ) goto aExpBigger; 3149 if ( expDiff < 0 ) goto bExpBigger; 3150 if ( aExp == 0x7FF ) { 3151 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3152 float_raise( float_flag_invalid STATUS_VAR); 3153 return float64_default_nan; 3154 } 3155 if ( aExp == 0 ) { 3156 aExp = 1; 3157 bExp = 1; 3158 } 3159 if ( bSig < aSig ) goto aBigger; 3160 if ( aSig < bSig ) goto bBigger; 3161 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3162 bExpBigger: 3163 if ( bExp == 0x7FF ) { 3164 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3165 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3166 } 3167 if ( aExp == 0 ) { 3168 ++expDiff; 3169 } 3170 else { 3171 aSig |= LIT64( 0x4000000000000000 ); 3172 } 3173 shift64RightJamming( aSig, - expDiff, &aSig ); 3174 bSig |= LIT64( 0x4000000000000000 ); 3175 bBigger: 3176 zSig = bSig - aSig; 3177 zExp = bExp; 3178 zSign ^= 1; 3179 goto normalizeRoundAndPack; 3180 aExpBigger: 3181 if ( aExp == 0x7FF ) { 3182 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3183 return a; 3184 } 3185 if ( bExp == 0 ) { 3186 --expDiff; 3187 } 3188 else { 3189 bSig |= LIT64( 0x4000000000000000 ); 3190 } 3191 shift64RightJamming( bSig, expDiff, &bSig ); 3192 aSig |= LIT64( 0x4000000000000000 ); 3193 aBigger: 3194 zSig = aSig - bSig; 3195 zExp = aExp; 3196 normalizeRoundAndPack: 3197 --zExp; 3198 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3199 3200 } 3201 3202 /*---------------------------------------------------------------------------- 3203 | Returns the result of adding the double-precision floating-point values `a' 3204 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3205 | Binary Floating-Point Arithmetic. 3206 *----------------------------------------------------------------------------*/ 3207 3208 float64 float64_add( float64 a, float64 b STATUS_PARAM ) 3209 { 3210 flag aSign, bSign; 3211 a = float64_squash_input_denormal(a STATUS_VAR); 3212 b = float64_squash_input_denormal(b STATUS_VAR); 3213 3214 aSign = extractFloat64Sign( a ); 3215 bSign = extractFloat64Sign( b ); 3216 if ( aSign == bSign ) { 3217 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3218 } 3219 else { 3220 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3221 } 3222 3223 } 3224 3225 /*---------------------------------------------------------------------------- 3226 | Returns the result of subtracting the double-precision floating-point values 3227 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3228 | for Binary Floating-Point Arithmetic. 3229 *----------------------------------------------------------------------------*/ 3230 3231 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 3232 { 3233 flag aSign, bSign; 3234 a = float64_squash_input_denormal(a STATUS_VAR); 3235 b = float64_squash_input_denormal(b STATUS_VAR); 3236 3237 aSign = extractFloat64Sign( a ); 3238 bSign = extractFloat64Sign( b ); 3239 if ( aSign == bSign ) { 3240 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3241 } 3242 else { 3243 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3244 } 3245 3246 } 3247 3248 /*---------------------------------------------------------------------------- 3249 | Returns the result of multiplying the double-precision floating-point values 3250 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3251 | for Binary Floating-Point Arithmetic. 3252 *----------------------------------------------------------------------------*/ 3253 3254 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 3255 { 3256 flag aSign, bSign, zSign; 3257 int16 aExp, bExp, zExp; 3258 uint64_t aSig, bSig, zSig0, zSig1; 3259 3260 a = float64_squash_input_denormal(a STATUS_VAR); 3261 b = float64_squash_input_denormal(b STATUS_VAR); 3262 3263 aSig = extractFloat64Frac( a ); 3264 aExp = extractFloat64Exp( a ); 3265 aSign = extractFloat64Sign( a ); 3266 bSig = extractFloat64Frac( b ); 3267 bExp = extractFloat64Exp( b ); 3268 bSign = extractFloat64Sign( b ); 3269 zSign = aSign ^ bSign; 3270 if ( aExp == 0x7FF ) { 3271 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3272 return propagateFloat64NaN( a, b STATUS_VAR ); 3273 } 3274 if ( ( bExp | bSig ) == 0 ) { 3275 float_raise( float_flag_invalid STATUS_VAR); 3276 return float64_default_nan; 3277 } 3278 return packFloat64( zSign, 0x7FF, 0 ); 3279 } 3280 if ( bExp == 0x7FF ) { 3281 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3282 if ( ( aExp | aSig ) == 0 ) { 3283 float_raise( float_flag_invalid STATUS_VAR); 3284 return float64_default_nan; 3285 } 3286 return packFloat64( zSign, 0x7FF, 0 ); 3287 } 3288 if ( aExp == 0 ) { 3289 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3290 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3291 } 3292 if ( bExp == 0 ) { 3293 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 3294 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3295 } 3296 zExp = aExp + bExp - 0x3FF; 3297 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3298 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3299 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3300 zSig0 |= ( zSig1 != 0 ); 3301 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 3302 zSig0 <<= 1; 3303 --zExp; 3304 } 3305 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 3306 3307 } 3308 3309 /*---------------------------------------------------------------------------- 3310 | Returns the result of dividing the double-precision floating-point value `a' 3311 | by the corresponding value `b'. The operation is performed according to 3312 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3313 *----------------------------------------------------------------------------*/ 3314 3315 float64 float64_div( float64 a, float64 b STATUS_PARAM ) 3316 { 3317 flag aSign, bSign, zSign; 3318 int16 aExp, bExp, zExp; 3319 uint64_t aSig, bSig, zSig; 3320 uint64_t rem0, rem1; 3321 uint64_t term0, term1; 3322 a = float64_squash_input_denormal(a STATUS_VAR); 3323 b = float64_squash_input_denormal(b STATUS_VAR); 3324 3325 aSig = extractFloat64Frac( a ); 3326 aExp = extractFloat64Exp( a ); 3327 aSign = extractFloat64Sign( a ); 3328 bSig = extractFloat64Frac( b ); 3329 bExp = extractFloat64Exp( b ); 3330 bSign = extractFloat64Sign( b ); 3331 zSign = aSign ^ bSign; 3332 if ( aExp == 0x7FF ) { 3333 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3334 if ( bExp == 0x7FF ) { 3335 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3336 float_raise( float_flag_invalid STATUS_VAR); 3337 return float64_default_nan; 3338 } 3339 return packFloat64( zSign, 0x7FF, 0 ); 3340 } 3341 if ( bExp == 0x7FF ) { 3342 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3343 return packFloat64( zSign, 0, 0 ); 3344 } 3345 if ( bExp == 0 ) { 3346 if ( bSig == 0 ) { 3347 if ( ( aExp | aSig ) == 0 ) { 3348 float_raise( float_flag_invalid STATUS_VAR); 3349 return float64_default_nan; 3350 } 3351 float_raise( float_flag_divbyzero STATUS_VAR); 3352 return packFloat64( zSign, 0x7FF, 0 ); 3353 } 3354 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3355 } 3356 if ( aExp == 0 ) { 3357 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3358 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3359 } 3360 zExp = aExp - bExp + 0x3FD; 3361 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3362 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3363 if ( bSig <= ( aSig + aSig ) ) { 3364 aSig >>= 1; 3365 ++zExp; 3366 } 3367 zSig = estimateDiv128To64( aSig, 0, bSig ); 3368 if ( ( zSig & 0x1FF ) <= 2 ) { 3369 mul64To128( bSig, zSig, &term0, &term1 ); 3370 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3371 while ( (int64_t) rem0 < 0 ) { 3372 --zSig; 3373 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3374 } 3375 zSig |= ( rem1 != 0 ); 3376 } 3377 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3378 3379 } 3380 3381 /*---------------------------------------------------------------------------- 3382 | Returns the remainder of the double-precision floating-point value `a' 3383 | with respect to the corresponding value `b'. The operation is performed 3384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3385 *----------------------------------------------------------------------------*/ 3386 3387 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 3388 { 3389 flag aSign, zSign; 3390 int16 aExp, bExp, expDiff; 3391 uint64_t aSig, bSig; 3392 uint64_t q, alternateASig; 3393 int64_t sigMean; 3394 3395 a = float64_squash_input_denormal(a STATUS_VAR); 3396 b = float64_squash_input_denormal(b STATUS_VAR); 3397 aSig = extractFloat64Frac( a ); 3398 aExp = extractFloat64Exp( a ); 3399 aSign = extractFloat64Sign( a ); 3400 bSig = extractFloat64Frac( b ); 3401 bExp = extractFloat64Exp( b ); 3402 if ( aExp == 0x7FF ) { 3403 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3404 return propagateFloat64NaN( a, b STATUS_VAR ); 3405 } 3406 float_raise( float_flag_invalid STATUS_VAR); 3407 return float64_default_nan; 3408 } 3409 if ( bExp == 0x7FF ) { 3410 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3411 return a; 3412 } 3413 if ( bExp == 0 ) { 3414 if ( bSig == 0 ) { 3415 float_raise( float_flag_invalid STATUS_VAR); 3416 return float64_default_nan; 3417 } 3418 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3419 } 3420 if ( aExp == 0 ) { 3421 if ( aSig == 0 ) return a; 3422 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3423 } 3424 expDiff = aExp - bExp; 3425 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3426 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3427 if ( expDiff < 0 ) { 3428 if ( expDiff < -1 ) return a; 3429 aSig >>= 1; 3430 } 3431 q = ( bSig <= aSig ); 3432 if ( q ) aSig -= bSig; 3433 expDiff -= 64; 3434 while ( 0 < expDiff ) { 3435 q = estimateDiv128To64( aSig, 0, bSig ); 3436 q = ( 2 < q ) ? q - 2 : 0; 3437 aSig = - ( ( bSig>>2 ) * q ); 3438 expDiff -= 62; 3439 } 3440 expDiff += 64; 3441 if ( 0 < expDiff ) { 3442 q = estimateDiv128To64( aSig, 0, bSig ); 3443 q = ( 2 < q ) ? q - 2 : 0; 3444 q >>= 64 - expDiff; 3445 bSig >>= 2; 3446 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3447 } 3448 else { 3449 aSig >>= 2; 3450 bSig >>= 2; 3451 } 3452 do { 3453 alternateASig = aSig; 3454 ++q; 3455 aSig -= bSig; 3456 } while ( 0 <= (int64_t) aSig ); 3457 sigMean = aSig + alternateASig; 3458 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3459 aSig = alternateASig; 3460 } 3461 zSign = ( (int64_t) aSig < 0 ); 3462 if ( zSign ) aSig = - aSig; 3463 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 3464 3465 } 3466 3467 /*---------------------------------------------------------------------------- 3468 | Returns the square root of the double-precision floating-point value `a'. 3469 | The operation is performed according to the IEC/IEEE Standard for Binary 3470 | Floating-Point Arithmetic. 3471 *----------------------------------------------------------------------------*/ 3472 3473 float64 float64_sqrt( float64 a STATUS_PARAM ) 3474 { 3475 flag aSign; 3476 int16 aExp, zExp; 3477 uint64_t aSig, zSig, doubleZSig; 3478 uint64_t rem0, rem1, term0, term1; 3479 a = float64_squash_input_denormal(a STATUS_VAR); 3480 3481 aSig = extractFloat64Frac( a ); 3482 aExp = extractFloat64Exp( a ); 3483 aSign = extractFloat64Sign( a ); 3484 if ( aExp == 0x7FF ) { 3485 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 3486 if ( ! aSign ) return a; 3487 float_raise( float_flag_invalid STATUS_VAR); 3488 return float64_default_nan; 3489 } 3490 if ( aSign ) { 3491 if ( ( aExp | aSig ) == 0 ) return a; 3492 float_raise( float_flag_invalid STATUS_VAR); 3493 return float64_default_nan; 3494 } 3495 if ( aExp == 0 ) { 3496 if ( aSig == 0 ) return float64_zero; 3497 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3498 } 3499 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 3500 aSig |= LIT64( 0x0010000000000000 ); 3501 zSig = estimateSqrt32( aExp, aSig>>21 ); 3502 aSig <<= 9 - ( aExp & 1 ); 3503 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 3504 if ( ( zSig & 0x1FF ) <= 5 ) { 3505 doubleZSig = zSig<<1; 3506 mul64To128( zSig, zSig, &term0, &term1 ); 3507 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 3508 while ( (int64_t) rem0 < 0 ) { 3509 --zSig; 3510 doubleZSig -= 2; 3511 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 3512 } 3513 zSig |= ( ( rem0 | rem1 ) != 0 ); 3514 } 3515 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 3516 3517 } 3518 3519 /*---------------------------------------------------------------------------- 3520 | Returns the binary log of the double-precision floating-point value `a'. 3521 | The operation is performed according to the IEC/IEEE Standard for Binary 3522 | Floating-Point Arithmetic. 3523 *----------------------------------------------------------------------------*/ 3524 float64 float64_log2( float64 a STATUS_PARAM ) 3525 { 3526 flag aSign, zSign; 3527 int16 aExp; 3528 uint64_t aSig, aSig0, aSig1, zSig, i; 3529 a = float64_squash_input_denormal(a STATUS_VAR); 3530 3531 aSig = extractFloat64Frac( a ); 3532 aExp = extractFloat64Exp( a ); 3533 aSign = extractFloat64Sign( a ); 3534 3535 if ( aExp == 0 ) { 3536 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 3537 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3538 } 3539 if ( aSign ) { 3540 float_raise( float_flag_invalid STATUS_VAR); 3541 return float64_default_nan; 3542 } 3543 if ( aExp == 0x7FF ) { 3544 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); 3545 return a; 3546 } 3547 3548 aExp -= 0x3FF; 3549 aSig |= LIT64( 0x0010000000000000 ); 3550 zSign = aExp < 0; 3551 zSig = (uint64_t)aExp << 52; 3552 for (i = 1LL << 51; i > 0; i >>= 1) { 3553 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 3554 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 3555 if ( aSig & LIT64( 0x0020000000000000 ) ) { 3556 aSig >>= 1; 3557 zSig |= i; 3558 } 3559 } 3560 3561 if ( zSign ) 3562 zSig = -zSig; 3563 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); 3564 } 3565 3566 /*---------------------------------------------------------------------------- 3567 | Returns 1 if the double-precision floating-point value `a' is equal to the 3568 | corresponding value `b', and 0 otherwise. The invalid exception is raised 3569 | if either operand is a NaN. Otherwise, the comparison is performed 3570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3571 *----------------------------------------------------------------------------*/ 3572 3573 int float64_eq( float64 a, float64 b STATUS_PARAM ) 3574 { 3575 uint64_t av, bv; 3576 a = float64_squash_input_denormal(a STATUS_VAR); 3577 b = float64_squash_input_denormal(b STATUS_VAR); 3578 3579 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3580 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3581 ) { 3582 float_raise( float_flag_invalid STATUS_VAR); 3583 return 0; 3584 } 3585 av = float64_val(a); 3586 bv = float64_val(b); 3587 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3588 3589 } 3590 3591 /*---------------------------------------------------------------------------- 3592 | Returns 1 if the double-precision floating-point value `a' is less than or 3593 | equal to the corresponding value `b', and 0 otherwise. The invalid 3594 | exception is raised if either operand is a NaN. The comparison is performed 3595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3596 *----------------------------------------------------------------------------*/ 3597 3598 int float64_le( float64 a, float64 b STATUS_PARAM ) 3599 { 3600 flag aSign, bSign; 3601 uint64_t av, bv; 3602 a = float64_squash_input_denormal(a STATUS_VAR); 3603 b = float64_squash_input_denormal(b STATUS_VAR); 3604 3605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3607 ) { 3608 float_raise( float_flag_invalid STATUS_VAR); 3609 return 0; 3610 } 3611 aSign = extractFloat64Sign( a ); 3612 bSign = extractFloat64Sign( b ); 3613 av = float64_val(a); 3614 bv = float64_val(b); 3615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3616 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3617 3618 } 3619 3620 /*---------------------------------------------------------------------------- 3621 | Returns 1 if the double-precision floating-point value `a' is less than 3622 | the corresponding value `b', and 0 otherwise. The invalid exception is 3623 | raised if either operand is a NaN. The comparison is performed according 3624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3625 *----------------------------------------------------------------------------*/ 3626 3627 int float64_lt( float64 a, float64 b STATUS_PARAM ) 3628 { 3629 flag aSign, bSign; 3630 uint64_t av, bv; 3631 3632 a = float64_squash_input_denormal(a STATUS_VAR); 3633 b = float64_squash_input_denormal(b STATUS_VAR); 3634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3636 ) { 3637 float_raise( float_flag_invalid STATUS_VAR); 3638 return 0; 3639 } 3640 aSign = extractFloat64Sign( a ); 3641 bSign = extractFloat64Sign( b ); 3642 av = float64_val(a); 3643 bv = float64_val(b); 3644 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 3645 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3646 3647 } 3648 3649 /*---------------------------------------------------------------------------- 3650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 3651 | be compared, and 0 otherwise. The invalid exception is raised if either 3652 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3653 | Standard for Binary Floating-Point Arithmetic. 3654 *----------------------------------------------------------------------------*/ 3655 3656 int float64_unordered( float64 a, float64 b STATUS_PARAM ) 3657 { 3658 a = float64_squash_input_denormal(a STATUS_VAR); 3659 b = float64_squash_input_denormal(b STATUS_VAR); 3660 3661 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3662 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3663 ) { 3664 float_raise( float_flag_invalid STATUS_VAR); 3665 return 1; 3666 } 3667 return 0; 3668 } 3669 3670 /*---------------------------------------------------------------------------- 3671 | Returns 1 if the double-precision floating-point value `a' is equal to the 3672 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3673 | exception.The comparison is performed according to the IEC/IEEE Standard 3674 | for Binary Floating-Point Arithmetic. 3675 *----------------------------------------------------------------------------*/ 3676 3677 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM ) 3678 { 3679 uint64_t av, bv; 3680 a = float64_squash_input_denormal(a STATUS_VAR); 3681 b = float64_squash_input_denormal(b STATUS_VAR); 3682 3683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3685 ) { 3686 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3687 float_raise( float_flag_invalid STATUS_VAR); 3688 } 3689 return 0; 3690 } 3691 av = float64_val(a); 3692 bv = float64_val(b); 3693 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3694 3695 } 3696 3697 /*---------------------------------------------------------------------------- 3698 | Returns 1 if the double-precision floating-point value `a' is less than or 3699 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3700 | cause an exception. Otherwise, the comparison is performed according to the 3701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3702 *----------------------------------------------------------------------------*/ 3703 3704 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 3705 { 3706 flag aSign, bSign; 3707 uint64_t av, bv; 3708 a = float64_squash_input_denormal(a STATUS_VAR); 3709 b = float64_squash_input_denormal(b STATUS_VAR); 3710 3711 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3712 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3713 ) { 3714 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3715 float_raise( float_flag_invalid STATUS_VAR); 3716 } 3717 return 0; 3718 } 3719 aSign = extractFloat64Sign( a ); 3720 bSign = extractFloat64Sign( b ); 3721 av = float64_val(a); 3722 bv = float64_val(b); 3723 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 3724 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3725 3726 } 3727 3728 /*---------------------------------------------------------------------------- 3729 | Returns 1 if the double-precision floating-point value `a' is less than 3730 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3731 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3732 | Standard for Binary Floating-Point Arithmetic. 3733 *----------------------------------------------------------------------------*/ 3734 3735 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 3736 { 3737 flag aSign, bSign; 3738 uint64_t av, bv; 3739 a = float64_squash_input_denormal(a STATUS_VAR); 3740 b = float64_squash_input_denormal(b STATUS_VAR); 3741 3742 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3743 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3744 ) { 3745 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3746 float_raise( float_flag_invalid STATUS_VAR); 3747 } 3748 return 0; 3749 } 3750 aSign = extractFloat64Sign( a ); 3751 bSign = extractFloat64Sign( b ); 3752 av = float64_val(a); 3753 bv = float64_val(b); 3754 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 3755 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3756 3757 } 3758 3759 /*---------------------------------------------------------------------------- 3760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 3761 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3762 | comparison is performed according to the IEC/IEEE Standard for Binary 3763 | Floating-Point Arithmetic. 3764 *----------------------------------------------------------------------------*/ 3765 3766 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM ) 3767 { 3768 a = float64_squash_input_denormal(a STATUS_VAR); 3769 b = float64_squash_input_denormal(b STATUS_VAR); 3770 3771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 3772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 3773 ) { 3774 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 3775 float_raise( float_flag_invalid STATUS_VAR); 3776 } 3777 return 1; 3778 } 3779 return 0; 3780 } 3781 3782 /*---------------------------------------------------------------------------- 3783 | Returns the result of converting the extended double-precision floating- 3784 | point value `a' to the 32-bit two's complement integer format. The 3785 | conversion is performed according to the IEC/IEEE Standard for Binary 3786 | Floating-Point Arithmetic---which means in particular that the conversion 3787 | is rounded according to the current rounding mode. If `a' is a NaN, the 3788 | largest positive integer is returned. Otherwise, if the conversion 3789 | overflows, the largest integer with the same sign as `a' is returned. 3790 *----------------------------------------------------------------------------*/ 3791 3792 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 3793 { 3794 flag aSign; 3795 int32 aExp, shiftCount; 3796 uint64_t aSig; 3797 3798 aSig = extractFloatx80Frac( a ); 3799 aExp = extractFloatx80Exp( a ); 3800 aSign = extractFloatx80Sign( a ); 3801 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 3802 shiftCount = 0x4037 - aExp; 3803 if ( shiftCount <= 0 ) shiftCount = 1; 3804 shift64RightJamming( aSig, shiftCount, &aSig ); 3805 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 3806 3807 } 3808 3809 /*---------------------------------------------------------------------------- 3810 | Returns the result of converting the extended double-precision floating- 3811 | point value `a' to the 32-bit two's complement integer format. The 3812 | conversion is performed according to the IEC/IEEE Standard for Binary 3813 | Floating-Point Arithmetic, except that the conversion is always rounded 3814 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3815 | Otherwise, if the conversion overflows, the largest integer with the same 3816 | sign as `a' is returned. 3817 *----------------------------------------------------------------------------*/ 3818 3819 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 3820 { 3821 flag aSign; 3822 int32 aExp, shiftCount; 3823 uint64_t aSig, savedASig; 3824 int32 z; 3825 3826 aSig = extractFloatx80Frac( a ); 3827 aExp = extractFloatx80Exp( a ); 3828 aSign = extractFloatx80Sign( a ); 3829 if ( 0x401E < aExp ) { 3830 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 3831 goto invalid; 3832 } 3833 else if ( aExp < 0x3FFF ) { 3834 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3835 return 0; 3836 } 3837 shiftCount = 0x403E - aExp; 3838 savedASig = aSig; 3839 aSig >>= shiftCount; 3840 z = aSig; 3841 if ( aSign ) z = - z; 3842 if ( ( z < 0 ) ^ aSign ) { 3843 invalid: 3844 float_raise( float_flag_invalid STATUS_VAR); 3845 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3846 } 3847 if ( ( aSig<<shiftCount ) != savedASig ) { 3848 STATUS(float_exception_flags) |= float_flag_inexact; 3849 } 3850 return z; 3851 3852 } 3853 3854 /*---------------------------------------------------------------------------- 3855 | Returns the result of converting the extended double-precision floating- 3856 | point value `a' to the 64-bit two's complement integer format. The 3857 | conversion is performed according to the IEC/IEEE Standard for Binary 3858 | Floating-Point Arithmetic---which means in particular that the conversion 3859 | is rounded according to the current rounding mode. If `a' is a NaN, 3860 | the largest positive integer is returned. Otherwise, if the conversion 3861 | overflows, the largest integer with the same sign as `a' is returned. 3862 *----------------------------------------------------------------------------*/ 3863 3864 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 3865 { 3866 flag aSign; 3867 int32 aExp, shiftCount; 3868 uint64_t aSig, aSigExtra; 3869 3870 aSig = extractFloatx80Frac( a ); 3871 aExp = extractFloatx80Exp( a ); 3872 aSign = extractFloatx80Sign( a ); 3873 shiftCount = 0x403E - aExp; 3874 if ( shiftCount <= 0 ) { 3875 if ( shiftCount ) { 3876 float_raise( float_flag_invalid STATUS_VAR); 3877 if ( ! aSign 3878 || ( ( aExp == 0x7FFF ) 3879 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 3880 ) { 3881 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3882 } 3883 return (int64_t) LIT64( 0x8000000000000000 ); 3884 } 3885 aSigExtra = 0; 3886 } 3887 else { 3888 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3889 } 3890 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3891 3892 } 3893 3894 /*---------------------------------------------------------------------------- 3895 | Returns the result of converting the extended double-precision floating- 3896 | point value `a' to the 64-bit two's complement integer format. The 3897 | conversion is performed according to the IEC/IEEE Standard for Binary 3898 | Floating-Point Arithmetic, except that the conversion is always rounded 3899 | toward zero. If `a' is a NaN, the largest positive integer is returned. 3900 | Otherwise, if the conversion overflows, the largest integer with the same 3901 | sign as `a' is returned. 3902 *----------------------------------------------------------------------------*/ 3903 3904 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 3905 { 3906 flag aSign; 3907 int32 aExp, shiftCount; 3908 uint64_t aSig; 3909 int64 z; 3910 3911 aSig = extractFloatx80Frac( a ); 3912 aExp = extractFloatx80Exp( a ); 3913 aSign = extractFloatx80Sign( a ); 3914 shiftCount = aExp - 0x403E; 3915 if ( 0 <= shiftCount ) { 3916 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 3917 if ( ( a.high != 0xC03E ) || aSig ) { 3918 float_raise( float_flag_invalid STATUS_VAR); 3919 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 3920 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3921 } 3922 } 3923 return (int64_t) LIT64( 0x8000000000000000 ); 3924 } 3925 else if ( aExp < 0x3FFF ) { 3926 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3927 return 0; 3928 } 3929 z = aSig>>( - shiftCount ); 3930 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3931 STATUS(float_exception_flags) |= float_flag_inexact; 3932 } 3933 if ( aSign ) z = - z; 3934 return z; 3935 3936 } 3937 3938 /*---------------------------------------------------------------------------- 3939 | Returns the result of converting the extended double-precision floating- 3940 | point value `a' to the single-precision floating-point format. The 3941 | conversion is performed according to the IEC/IEEE Standard for Binary 3942 | Floating-Point Arithmetic. 3943 *----------------------------------------------------------------------------*/ 3944 3945 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 3946 { 3947 flag aSign; 3948 int32 aExp; 3949 uint64_t aSig; 3950 3951 aSig = extractFloatx80Frac( a ); 3952 aExp = extractFloatx80Exp( a ); 3953 aSign = extractFloatx80Sign( a ); 3954 if ( aExp == 0x7FFF ) { 3955 if ( (uint64_t) ( aSig<<1 ) ) { 3956 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3957 } 3958 return packFloat32( aSign, 0xFF, 0 ); 3959 } 3960 shift64RightJamming( aSig, 33, &aSig ); 3961 if ( aExp || aSig ) aExp -= 0x3F81; 3962 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 3963 3964 } 3965 3966 /*---------------------------------------------------------------------------- 3967 | Returns the result of converting the extended double-precision floating- 3968 | point value `a' to the double-precision floating-point format. The 3969 | conversion is performed according to the IEC/IEEE Standard for Binary 3970 | Floating-Point Arithmetic. 3971 *----------------------------------------------------------------------------*/ 3972 3973 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 3974 { 3975 flag aSign; 3976 int32 aExp; 3977 uint64_t aSig, zSig; 3978 3979 aSig = extractFloatx80Frac( a ); 3980 aExp = extractFloatx80Exp( a ); 3981 aSign = extractFloatx80Sign( a ); 3982 if ( aExp == 0x7FFF ) { 3983 if ( (uint64_t) ( aSig<<1 ) ) { 3984 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3985 } 3986 return packFloat64( aSign, 0x7FF, 0 ); 3987 } 3988 shift64RightJamming( aSig, 1, &zSig ); 3989 if ( aExp || aSig ) aExp -= 0x3C01; 3990 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 3991 3992 } 3993 3994 /*---------------------------------------------------------------------------- 3995 | Returns the result of converting the extended double-precision floating- 3996 | point value `a' to the quadruple-precision floating-point format. The 3997 | conversion is performed according to the IEC/IEEE Standard for Binary 3998 | Floating-Point Arithmetic. 3999 *----------------------------------------------------------------------------*/ 4000 4001 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 4002 { 4003 flag aSign; 4004 int16 aExp; 4005 uint64_t aSig, zSig0, zSig1; 4006 4007 aSig = extractFloatx80Frac( a ); 4008 aExp = extractFloatx80Exp( a ); 4009 aSign = extractFloatx80Sign( a ); 4010 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4011 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4012 } 4013 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4014 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4015 4016 } 4017 4018 /*---------------------------------------------------------------------------- 4019 | Rounds the extended double-precision floating-point value `a' to an integer, 4020 | and returns the result as an extended quadruple-precision floating-point 4021 | value. The operation is performed according to the IEC/IEEE Standard for 4022 | Binary Floating-Point Arithmetic. 4023 *----------------------------------------------------------------------------*/ 4024 4025 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 4026 { 4027 flag aSign; 4028 int32 aExp; 4029 uint64_t lastBitMask, roundBitsMask; 4030 int8 roundingMode; 4031 floatx80 z; 4032 4033 aExp = extractFloatx80Exp( a ); 4034 if ( 0x403E <= aExp ) { 4035 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4036 return propagateFloatx80NaN( a, a STATUS_VAR ); 4037 } 4038 return a; 4039 } 4040 if ( aExp < 0x3FFF ) { 4041 if ( ( aExp == 0 ) 4042 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4043 return a; 4044 } 4045 STATUS(float_exception_flags) |= float_flag_inexact; 4046 aSign = extractFloatx80Sign( a ); 4047 switch ( STATUS(float_rounding_mode) ) { 4048 case float_round_nearest_even: 4049 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4050 ) { 4051 return 4052 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4053 } 4054 break; 4055 case float_round_down: 4056 return 4057 aSign ? 4058 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4059 : packFloatx80( 0, 0, 0 ); 4060 case float_round_up: 4061 return 4062 aSign ? packFloatx80( 1, 0, 0 ) 4063 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4064 } 4065 return packFloatx80( aSign, 0, 0 ); 4066 } 4067 lastBitMask = 1; 4068 lastBitMask <<= 0x403E - aExp; 4069 roundBitsMask = lastBitMask - 1; 4070 z = a; 4071 roundingMode = STATUS(float_rounding_mode); 4072 if ( roundingMode == float_round_nearest_even ) { 4073 z.low += lastBitMask>>1; 4074 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 4075 } 4076 else if ( roundingMode != float_round_to_zero ) { 4077 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { 4078 z.low += roundBitsMask; 4079 } 4080 } 4081 z.low &= ~ roundBitsMask; 4082 if ( z.low == 0 ) { 4083 ++z.high; 4084 z.low = LIT64( 0x8000000000000000 ); 4085 } 4086 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 4087 return z; 4088 4089 } 4090 4091 /*---------------------------------------------------------------------------- 4092 | Returns the result of adding the absolute values of the extended double- 4093 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4094 | negated before being returned. `zSign' is ignored if the result is a NaN. 4095 | The addition is performed according to the IEC/IEEE Standard for Binary 4096 | Floating-Point Arithmetic. 4097 *----------------------------------------------------------------------------*/ 4098 4099 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 4100 { 4101 int32 aExp, bExp, zExp; 4102 uint64_t aSig, bSig, zSig0, zSig1; 4103 int32 expDiff; 4104 4105 aSig = extractFloatx80Frac( a ); 4106 aExp = extractFloatx80Exp( a ); 4107 bSig = extractFloatx80Frac( b ); 4108 bExp = extractFloatx80Exp( b ); 4109 expDiff = aExp - bExp; 4110 if ( 0 < expDiff ) { 4111 if ( aExp == 0x7FFF ) { 4112 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4113 return a; 4114 } 4115 if ( bExp == 0 ) --expDiff; 4116 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4117 zExp = aExp; 4118 } 4119 else if ( expDiff < 0 ) { 4120 if ( bExp == 0x7FFF ) { 4121 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4122 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4123 } 4124 if ( aExp == 0 ) ++expDiff; 4125 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4126 zExp = bExp; 4127 } 4128 else { 4129 if ( aExp == 0x7FFF ) { 4130 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4131 return propagateFloatx80NaN( a, b STATUS_VAR ); 4132 } 4133 return a; 4134 } 4135 zSig1 = 0; 4136 zSig0 = aSig + bSig; 4137 if ( aExp == 0 ) { 4138 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4139 goto roundAndPack; 4140 } 4141 zExp = aExp; 4142 goto shiftRight1; 4143 } 4144 zSig0 = aSig + bSig; 4145 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4146 shiftRight1: 4147 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4148 zSig0 |= LIT64( 0x8000000000000000 ); 4149 ++zExp; 4150 roundAndPack: 4151 return 4152 roundAndPackFloatx80( 4153 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4154 4155 } 4156 4157 /*---------------------------------------------------------------------------- 4158 | Returns the result of subtracting the absolute values of the extended 4159 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4160 | difference is negated before being returned. `zSign' is ignored if the 4161 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4162 | Standard for Binary Floating-Point Arithmetic. 4163 *----------------------------------------------------------------------------*/ 4164 4165 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 4166 { 4167 int32 aExp, bExp, zExp; 4168 uint64_t aSig, bSig, zSig0, zSig1; 4169 int32 expDiff; 4170 floatx80 z; 4171 4172 aSig = extractFloatx80Frac( a ); 4173 aExp = extractFloatx80Exp( a ); 4174 bSig = extractFloatx80Frac( b ); 4175 bExp = extractFloatx80Exp( b ); 4176 expDiff = aExp - bExp; 4177 if ( 0 < expDiff ) goto aExpBigger; 4178 if ( expDiff < 0 ) goto bExpBigger; 4179 if ( aExp == 0x7FFF ) { 4180 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4181 return propagateFloatx80NaN( a, b STATUS_VAR ); 4182 } 4183 float_raise( float_flag_invalid STATUS_VAR); 4184 z.low = floatx80_default_nan_low; 4185 z.high = floatx80_default_nan_high; 4186 return z; 4187 } 4188 if ( aExp == 0 ) { 4189 aExp = 1; 4190 bExp = 1; 4191 } 4192 zSig1 = 0; 4193 if ( bSig < aSig ) goto aBigger; 4194 if ( aSig < bSig ) goto bBigger; 4195 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 4196 bExpBigger: 4197 if ( bExp == 0x7FFF ) { 4198 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4199 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4200 } 4201 if ( aExp == 0 ) ++expDiff; 4202 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4203 bBigger: 4204 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4205 zExp = bExp; 4206 zSign ^= 1; 4207 goto normalizeRoundAndPack; 4208 aExpBigger: 4209 if ( aExp == 0x7FFF ) { 4210 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4211 return a; 4212 } 4213 if ( bExp == 0 ) --expDiff; 4214 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4215 aBigger: 4216 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4217 zExp = aExp; 4218 normalizeRoundAndPack: 4219 return 4220 normalizeRoundAndPackFloatx80( 4221 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4222 4223 } 4224 4225 /*---------------------------------------------------------------------------- 4226 | Returns the result of adding the extended double-precision floating-point 4227 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4228 | Standard for Binary Floating-Point Arithmetic. 4229 *----------------------------------------------------------------------------*/ 4230 4231 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 4232 { 4233 flag aSign, bSign; 4234 4235 aSign = extractFloatx80Sign( a ); 4236 bSign = extractFloatx80Sign( b ); 4237 if ( aSign == bSign ) { 4238 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4239 } 4240 else { 4241 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4242 } 4243 4244 } 4245 4246 /*---------------------------------------------------------------------------- 4247 | Returns the result of subtracting the extended double-precision floating- 4248 | point values `a' and `b'. The operation is performed according to the 4249 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4250 *----------------------------------------------------------------------------*/ 4251 4252 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 4253 { 4254 flag aSign, bSign; 4255 4256 aSign = extractFloatx80Sign( a ); 4257 bSign = extractFloatx80Sign( b ); 4258 if ( aSign == bSign ) { 4259 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 4260 } 4261 else { 4262 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 4263 } 4264 4265 } 4266 4267 /*---------------------------------------------------------------------------- 4268 | Returns the result of multiplying the extended double-precision floating- 4269 | point values `a' and `b'. The operation is performed according to the 4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4271 *----------------------------------------------------------------------------*/ 4272 4273 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 4274 { 4275 flag aSign, bSign, zSign; 4276 int32 aExp, bExp, zExp; 4277 uint64_t aSig, bSig, zSig0, zSig1; 4278 floatx80 z; 4279 4280 aSig = extractFloatx80Frac( a ); 4281 aExp = extractFloatx80Exp( a ); 4282 aSign = extractFloatx80Sign( a ); 4283 bSig = extractFloatx80Frac( b ); 4284 bExp = extractFloatx80Exp( b ); 4285 bSign = extractFloatx80Sign( b ); 4286 zSign = aSign ^ bSign; 4287 if ( aExp == 0x7FFF ) { 4288 if ( (uint64_t) ( aSig<<1 ) 4289 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 4290 return propagateFloatx80NaN( a, b STATUS_VAR ); 4291 } 4292 if ( ( bExp | bSig ) == 0 ) goto invalid; 4293 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4294 } 4295 if ( bExp == 0x7FFF ) { 4296 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4297 if ( ( aExp | aSig ) == 0 ) { 4298 invalid: 4299 float_raise( float_flag_invalid STATUS_VAR); 4300 z.low = floatx80_default_nan_low; 4301 z.high = floatx80_default_nan_high; 4302 return z; 4303 } 4304 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4305 } 4306 if ( aExp == 0 ) { 4307 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4308 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4309 } 4310 if ( bExp == 0 ) { 4311 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4312 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4313 } 4314 zExp = aExp + bExp - 0x3FFE; 4315 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4316 if ( 0 < (int64_t) zSig0 ) { 4317 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4318 --zExp; 4319 } 4320 return 4321 roundAndPackFloatx80( 4322 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4323 4324 } 4325 4326 /*---------------------------------------------------------------------------- 4327 | Returns the result of dividing the extended double-precision floating-point 4328 | value `a' by the corresponding value `b'. The operation is performed 4329 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4330 *----------------------------------------------------------------------------*/ 4331 4332 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 4333 { 4334 flag aSign, bSign, zSign; 4335 int32 aExp, bExp, zExp; 4336 uint64_t aSig, bSig, zSig0, zSig1; 4337 uint64_t rem0, rem1, rem2, term0, term1, term2; 4338 floatx80 z; 4339 4340 aSig = extractFloatx80Frac( a ); 4341 aExp = extractFloatx80Exp( a ); 4342 aSign = extractFloatx80Sign( a ); 4343 bSig = extractFloatx80Frac( b ); 4344 bExp = extractFloatx80Exp( b ); 4345 bSign = extractFloatx80Sign( b ); 4346 zSign = aSign ^ bSign; 4347 if ( aExp == 0x7FFF ) { 4348 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4349 if ( bExp == 0x7FFF ) { 4350 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4351 goto invalid; 4352 } 4353 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4354 } 4355 if ( bExp == 0x7FFF ) { 4356 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4357 return packFloatx80( zSign, 0, 0 ); 4358 } 4359 if ( bExp == 0 ) { 4360 if ( bSig == 0 ) { 4361 if ( ( aExp | aSig ) == 0 ) { 4362 invalid: 4363 float_raise( float_flag_invalid STATUS_VAR); 4364 z.low = floatx80_default_nan_low; 4365 z.high = floatx80_default_nan_high; 4366 return z; 4367 } 4368 float_raise( float_flag_divbyzero STATUS_VAR); 4369 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4370 } 4371 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4372 } 4373 if ( aExp == 0 ) { 4374 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 4375 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 4376 } 4377 zExp = aExp - bExp + 0x3FFE; 4378 rem1 = 0; 4379 if ( bSig <= aSig ) { 4380 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 4381 ++zExp; 4382 } 4383 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 4384 mul64To128( bSig, zSig0, &term0, &term1 ); 4385 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 4386 while ( (int64_t) rem0 < 0 ) { 4387 --zSig0; 4388 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4389 } 4390 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 4391 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 4392 mul64To128( bSig, zSig1, &term1, &term2 ); 4393 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4394 while ( (int64_t) rem1 < 0 ) { 4395 --zSig1; 4396 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 4397 } 4398 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 4399 } 4400 return 4401 roundAndPackFloatx80( 4402 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4403 4404 } 4405 4406 /*---------------------------------------------------------------------------- 4407 | Returns the remainder of the extended double-precision floating-point value 4408 | `a' with respect to the corresponding value `b'. The operation is performed 4409 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4410 *----------------------------------------------------------------------------*/ 4411 4412 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 4413 { 4414 flag aSign, zSign; 4415 int32 aExp, bExp, expDiff; 4416 uint64_t aSig0, aSig1, bSig; 4417 uint64_t q, term0, term1, alternateASig0, alternateASig1; 4418 floatx80 z; 4419 4420 aSig0 = extractFloatx80Frac( a ); 4421 aExp = extractFloatx80Exp( a ); 4422 aSign = extractFloatx80Sign( a ); 4423 bSig = extractFloatx80Frac( b ); 4424 bExp = extractFloatx80Exp( b ); 4425 if ( aExp == 0x7FFF ) { 4426 if ( (uint64_t) ( aSig0<<1 ) 4427 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 4428 return propagateFloatx80NaN( a, b STATUS_VAR ); 4429 } 4430 goto invalid; 4431 } 4432 if ( bExp == 0x7FFF ) { 4433 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4434 return a; 4435 } 4436 if ( bExp == 0 ) { 4437 if ( bSig == 0 ) { 4438 invalid: 4439 float_raise( float_flag_invalid STATUS_VAR); 4440 z.low = floatx80_default_nan_low; 4441 z.high = floatx80_default_nan_high; 4442 return z; 4443 } 4444 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 4445 } 4446 if ( aExp == 0 ) { 4447 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 4448 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4449 } 4450 bSig |= LIT64( 0x8000000000000000 ); 4451 zSign = aSign; 4452 expDiff = aExp - bExp; 4453 aSig1 = 0; 4454 if ( expDiff < 0 ) { 4455 if ( expDiff < -1 ) return a; 4456 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 4457 expDiff = 0; 4458 } 4459 q = ( bSig <= aSig0 ); 4460 if ( q ) aSig0 -= bSig; 4461 expDiff -= 64; 4462 while ( 0 < expDiff ) { 4463 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4464 q = ( 2 < q ) ? q - 2 : 0; 4465 mul64To128( bSig, q, &term0, &term1 ); 4466 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4467 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 4468 expDiff -= 62; 4469 } 4470 expDiff += 64; 4471 if ( 0 < expDiff ) { 4472 q = estimateDiv128To64( aSig0, aSig1, bSig ); 4473 q = ( 2 < q ) ? q - 2 : 0; 4474 q >>= 64 - expDiff; 4475 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 4476 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4477 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 4478 while ( le128( term0, term1, aSig0, aSig1 ) ) { 4479 ++q; 4480 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 4481 } 4482 } 4483 else { 4484 term1 = 0; 4485 term0 = bSig; 4486 } 4487 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 4488 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4489 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 4490 && ( q & 1 ) ) 4491 ) { 4492 aSig0 = alternateASig0; 4493 aSig1 = alternateASig1; 4494 zSign = ! zSign; 4495 } 4496 return 4497 normalizeRoundAndPackFloatx80( 4498 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 4499 4500 } 4501 4502 /*---------------------------------------------------------------------------- 4503 | Returns the square root of the extended double-precision floating-point 4504 | value `a'. The operation is performed according to the IEC/IEEE Standard 4505 | for Binary Floating-Point Arithmetic. 4506 *----------------------------------------------------------------------------*/ 4507 4508 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 4509 { 4510 flag aSign; 4511 int32 aExp, zExp; 4512 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 4513 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 4514 floatx80 z; 4515 4516 aSig0 = extractFloatx80Frac( a ); 4517 aExp = extractFloatx80Exp( a ); 4518 aSign = extractFloatx80Sign( a ); 4519 if ( aExp == 0x7FFF ) { 4520 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 4521 if ( ! aSign ) return a; 4522 goto invalid; 4523 } 4524 if ( aSign ) { 4525 if ( ( aExp | aSig0 ) == 0 ) return a; 4526 invalid: 4527 float_raise( float_flag_invalid STATUS_VAR); 4528 z.low = floatx80_default_nan_low; 4529 z.high = floatx80_default_nan_high; 4530 return z; 4531 } 4532 if ( aExp == 0 ) { 4533 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 4534 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 4535 } 4536 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 4537 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 4538 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 4539 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 4540 doubleZSig0 = zSig0<<1; 4541 mul64To128( zSig0, zSig0, &term0, &term1 ); 4542 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 4543 while ( (int64_t) rem0 < 0 ) { 4544 --zSig0; 4545 doubleZSig0 -= 2; 4546 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 4547 } 4548 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 4549 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 4550 if ( zSig1 == 0 ) zSig1 = 1; 4551 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 4552 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 4553 mul64To128( zSig1, zSig1, &term2, &term3 ); 4554 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 4555 while ( (int64_t) rem1 < 0 ) { 4556 --zSig1; 4557 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 4558 term3 |= 1; 4559 term2 |= doubleZSig0; 4560 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 4561 } 4562 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 4563 } 4564 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 4565 zSig0 |= doubleZSig0; 4566 return 4567 roundAndPackFloatx80( 4568 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 4569 4570 } 4571 4572 /*---------------------------------------------------------------------------- 4573 | Returns 1 if the extended double-precision floating-point value `a' is equal 4574 | to the corresponding value `b', and 0 otherwise. The invalid exception is 4575 | raised if either operand is a NaN. Otherwise, the comparison is performed 4576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4577 *----------------------------------------------------------------------------*/ 4578 4579 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 4580 { 4581 4582 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4583 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4584 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4585 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4586 ) { 4587 float_raise( float_flag_invalid STATUS_VAR); 4588 return 0; 4589 } 4590 return 4591 ( a.low == b.low ) 4592 && ( ( a.high == b.high ) 4593 || ( ( a.low == 0 ) 4594 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 4595 ); 4596 4597 } 4598 4599 /*---------------------------------------------------------------------------- 4600 | Returns 1 if the extended double-precision floating-point value `a' is 4601 | less than or equal to the corresponding value `b', and 0 otherwise. The 4602 | invalid exception is raised if either operand is a NaN. The comparison is 4603 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4604 | Arithmetic. 4605 *----------------------------------------------------------------------------*/ 4606 4607 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 4608 { 4609 flag aSign, bSign; 4610 4611 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4612 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4613 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4614 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4615 ) { 4616 float_raise( float_flag_invalid STATUS_VAR); 4617 return 0; 4618 } 4619 aSign = extractFloatx80Sign( a ); 4620 bSign = extractFloatx80Sign( b ); 4621 if ( aSign != bSign ) { 4622 return 4623 aSign 4624 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4625 == 0 ); 4626 } 4627 return 4628 aSign ? le128( b.high, b.low, a.high, a.low ) 4629 : le128( a.high, a.low, b.high, b.low ); 4630 4631 } 4632 4633 /*---------------------------------------------------------------------------- 4634 | Returns 1 if the extended double-precision floating-point value `a' is 4635 | less than the corresponding value `b', and 0 otherwise. The invalid 4636 | exception is raised if either operand is a NaN. The comparison is performed 4637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4638 *----------------------------------------------------------------------------*/ 4639 4640 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 4641 { 4642 flag aSign, bSign; 4643 4644 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4645 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4646 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4647 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4648 ) { 4649 float_raise( float_flag_invalid STATUS_VAR); 4650 return 0; 4651 } 4652 aSign = extractFloatx80Sign( a ); 4653 bSign = extractFloatx80Sign( b ); 4654 if ( aSign != bSign ) { 4655 return 4656 aSign 4657 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4658 != 0 ); 4659 } 4660 return 4661 aSign ? lt128( b.high, b.low, a.high, a.low ) 4662 : lt128( a.high, a.low, b.high, b.low ); 4663 4664 } 4665 4666 /*---------------------------------------------------------------------------- 4667 | Returns 1 if the extended double-precision floating-point values `a' and `b' 4668 | cannot be compared, and 0 otherwise. The invalid exception is raised if 4669 | either operand is a NaN. The comparison is performed according to the 4670 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4671 *----------------------------------------------------------------------------*/ 4672 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM ) 4673 { 4674 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4675 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4676 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4677 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4678 ) { 4679 float_raise( float_flag_invalid STATUS_VAR); 4680 return 1; 4681 } 4682 return 0; 4683 } 4684 4685 /*---------------------------------------------------------------------------- 4686 | Returns 1 if the extended double-precision floating-point value `a' is 4687 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4688 | cause an exception. The comparison is performed according to the IEC/IEEE 4689 | Standard for Binary Floating-Point Arithmetic. 4690 *----------------------------------------------------------------------------*/ 4691 4692 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4693 { 4694 4695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4697 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4699 ) { 4700 if ( floatx80_is_signaling_nan( a ) 4701 || floatx80_is_signaling_nan( b ) ) { 4702 float_raise( float_flag_invalid STATUS_VAR); 4703 } 4704 return 0; 4705 } 4706 return 4707 ( a.low == b.low ) 4708 && ( ( a.high == b.high ) 4709 || ( ( a.low == 0 ) 4710 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 4711 ); 4712 4713 } 4714 4715 /*---------------------------------------------------------------------------- 4716 | Returns 1 if the extended double-precision floating-point value `a' is less 4717 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 4718 | do not cause an exception. Otherwise, the comparison is performed according 4719 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4720 *----------------------------------------------------------------------------*/ 4721 4722 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4723 { 4724 flag aSign, bSign; 4725 4726 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4727 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4728 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4729 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4730 ) { 4731 if ( floatx80_is_signaling_nan( a ) 4732 || floatx80_is_signaling_nan( b ) ) { 4733 float_raise( float_flag_invalid STATUS_VAR); 4734 } 4735 return 0; 4736 } 4737 aSign = extractFloatx80Sign( a ); 4738 bSign = extractFloatx80Sign( b ); 4739 if ( aSign != bSign ) { 4740 return 4741 aSign 4742 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4743 == 0 ); 4744 } 4745 return 4746 aSign ? le128( b.high, b.low, a.high, a.low ) 4747 : le128( a.high, a.low, b.high, b.low ); 4748 4749 } 4750 4751 /*---------------------------------------------------------------------------- 4752 | Returns 1 if the extended double-precision floating-point value `a' is less 4753 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 4754 | an exception. Otherwise, the comparison is performed according to the 4755 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4756 *----------------------------------------------------------------------------*/ 4757 4758 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4759 { 4760 flag aSign, bSign; 4761 4762 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4763 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4764 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4765 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4766 ) { 4767 if ( floatx80_is_signaling_nan( a ) 4768 || floatx80_is_signaling_nan( b ) ) { 4769 float_raise( float_flag_invalid STATUS_VAR); 4770 } 4771 return 0; 4772 } 4773 aSign = extractFloatx80Sign( a ); 4774 bSign = extractFloatx80Sign( b ); 4775 if ( aSign != bSign ) { 4776 return 4777 aSign 4778 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 4779 != 0 ); 4780 } 4781 return 4782 aSign ? lt128( b.high, b.low, a.high, a.low ) 4783 : lt128( a.high, a.low, b.high, b.low ); 4784 4785 } 4786 4787 /*---------------------------------------------------------------------------- 4788 | Returns 1 if the extended double-precision floating-point values `a' and `b' 4789 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 4790 | The comparison is performed according to the IEC/IEEE Standard for Binary 4791 | Floating-Point Arithmetic. 4792 *----------------------------------------------------------------------------*/ 4793 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 4794 { 4795 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 4796 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 4797 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 4798 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 4799 ) { 4800 if ( floatx80_is_signaling_nan( a ) 4801 || floatx80_is_signaling_nan( b ) ) { 4802 float_raise( float_flag_invalid STATUS_VAR); 4803 } 4804 return 1; 4805 } 4806 return 0; 4807 } 4808 4809 /*---------------------------------------------------------------------------- 4810 | Returns the result of converting the quadruple-precision floating-point 4811 | value `a' to the 32-bit two's complement integer format. The conversion 4812 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4813 | Arithmetic---which means in particular that the conversion is rounded 4814 | according to the current rounding mode. If `a' is a NaN, the largest 4815 | positive integer is returned. Otherwise, if the conversion overflows, the 4816 | largest integer with the same sign as `a' is returned. 4817 *----------------------------------------------------------------------------*/ 4818 4819 int32 float128_to_int32( float128 a STATUS_PARAM ) 4820 { 4821 flag aSign; 4822 int32 aExp, shiftCount; 4823 uint64_t aSig0, aSig1; 4824 4825 aSig1 = extractFloat128Frac1( a ); 4826 aSig0 = extractFloat128Frac0( a ); 4827 aExp = extractFloat128Exp( a ); 4828 aSign = extractFloat128Sign( a ); 4829 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 4830 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4831 aSig0 |= ( aSig1 != 0 ); 4832 shiftCount = 0x4028 - aExp; 4833 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 4834 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 4835 4836 } 4837 4838 /*---------------------------------------------------------------------------- 4839 | Returns the result of converting the quadruple-precision floating-point 4840 | value `a' to the 32-bit two's complement integer format. The conversion 4841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4842 | Arithmetic, except that the conversion is always rounded toward zero. If 4843 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 4844 | conversion overflows, the largest integer with the same sign as `a' is 4845 | returned. 4846 *----------------------------------------------------------------------------*/ 4847 4848 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 4849 { 4850 flag aSign; 4851 int32 aExp, shiftCount; 4852 uint64_t aSig0, aSig1, savedASig; 4853 int32 z; 4854 4855 aSig1 = extractFloat128Frac1( a ); 4856 aSig0 = extractFloat128Frac0( a ); 4857 aExp = extractFloat128Exp( a ); 4858 aSign = extractFloat128Sign( a ); 4859 aSig0 |= ( aSig1 != 0 ); 4860 if ( 0x401E < aExp ) { 4861 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 4862 goto invalid; 4863 } 4864 else if ( aExp < 0x3FFF ) { 4865 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 4866 return 0; 4867 } 4868 aSig0 |= LIT64( 0x0001000000000000 ); 4869 shiftCount = 0x402F - aExp; 4870 savedASig = aSig0; 4871 aSig0 >>= shiftCount; 4872 z = aSig0; 4873 if ( aSign ) z = - z; 4874 if ( ( z < 0 ) ^ aSign ) { 4875 invalid: 4876 float_raise( float_flag_invalid STATUS_VAR); 4877 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4878 } 4879 if ( ( aSig0<<shiftCount ) != savedASig ) { 4880 STATUS(float_exception_flags) |= float_flag_inexact; 4881 } 4882 return z; 4883 4884 } 4885 4886 /*---------------------------------------------------------------------------- 4887 | Returns the result of converting the quadruple-precision floating-point 4888 | value `a' to the 64-bit two's complement integer format. The conversion 4889 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4890 | Arithmetic---which means in particular that the conversion is rounded 4891 | according to the current rounding mode. If `a' is a NaN, the largest 4892 | positive integer is returned. Otherwise, if the conversion overflows, the 4893 | largest integer with the same sign as `a' is returned. 4894 *----------------------------------------------------------------------------*/ 4895 4896 int64 float128_to_int64( float128 a STATUS_PARAM ) 4897 { 4898 flag aSign; 4899 int32 aExp, shiftCount; 4900 uint64_t aSig0, aSig1; 4901 4902 aSig1 = extractFloat128Frac1( a ); 4903 aSig0 = extractFloat128Frac0( a ); 4904 aExp = extractFloat128Exp( a ); 4905 aSign = extractFloat128Sign( a ); 4906 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4907 shiftCount = 0x402F - aExp; 4908 if ( shiftCount <= 0 ) { 4909 if ( 0x403E < aExp ) { 4910 float_raise( float_flag_invalid STATUS_VAR); 4911 if ( ! aSign 4912 || ( ( aExp == 0x7FFF ) 4913 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 4914 ) 4915 ) { 4916 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4917 } 4918 return (int64_t) LIT64( 0x8000000000000000 ); 4919 } 4920 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 4921 } 4922 else { 4923 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 4924 } 4925 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 4926 4927 } 4928 4929 /*---------------------------------------------------------------------------- 4930 | Returns the result of converting the quadruple-precision floating-point 4931 | value `a' to the 64-bit two's complement integer format. The conversion 4932 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4933 | Arithmetic, except that the conversion is always rounded toward zero. 4934 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 4935 | the conversion overflows, the largest integer with the same sign as `a' is 4936 | returned. 4937 *----------------------------------------------------------------------------*/ 4938 4939 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 4940 { 4941 flag aSign; 4942 int32 aExp, shiftCount; 4943 uint64_t aSig0, aSig1; 4944 int64 z; 4945 4946 aSig1 = extractFloat128Frac1( a ); 4947 aSig0 = extractFloat128Frac0( a ); 4948 aExp = extractFloat128Exp( a ); 4949 aSign = extractFloat128Sign( a ); 4950 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 4951 shiftCount = aExp - 0x402F; 4952 if ( 0 < shiftCount ) { 4953 if ( 0x403E <= aExp ) { 4954 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 4955 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 4956 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 4957 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 4958 } 4959 else { 4960 float_raise( float_flag_invalid STATUS_VAR); 4961 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 4962 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4963 } 4964 } 4965 return (int64_t) LIT64( 0x8000000000000000 ); 4966 } 4967 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 4968 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 4969 STATUS(float_exception_flags) |= float_flag_inexact; 4970 } 4971 } 4972 else { 4973 if ( aExp < 0x3FFF ) { 4974 if ( aExp | aSig0 | aSig1 ) { 4975 STATUS(float_exception_flags) |= float_flag_inexact; 4976 } 4977 return 0; 4978 } 4979 z = aSig0>>( - shiftCount ); 4980 if ( aSig1 4981 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 4982 STATUS(float_exception_flags) |= float_flag_inexact; 4983 } 4984 } 4985 if ( aSign ) z = - z; 4986 return z; 4987 4988 } 4989 4990 /*---------------------------------------------------------------------------- 4991 | Returns the result of converting the quadruple-precision floating-point 4992 | value `a' to the single-precision floating-point format. The conversion 4993 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4994 | Arithmetic. 4995 *----------------------------------------------------------------------------*/ 4996 4997 float32 float128_to_float32( float128 a STATUS_PARAM ) 4998 { 4999 flag aSign; 5000 int32 aExp; 5001 uint64_t aSig0, aSig1; 5002 uint32_t zSig; 5003 5004 aSig1 = extractFloat128Frac1( a ); 5005 aSig0 = extractFloat128Frac0( a ); 5006 aExp = extractFloat128Exp( a ); 5007 aSign = extractFloat128Sign( a ); 5008 if ( aExp == 0x7FFF ) { 5009 if ( aSig0 | aSig1 ) { 5010 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5011 } 5012 return packFloat32( aSign, 0xFF, 0 ); 5013 } 5014 aSig0 |= ( aSig1 != 0 ); 5015 shift64RightJamming( aSig0, 18, &aSig0 ); 5016 zSig = aSig0; 5017 if ( aExp || zSig ) { 5018 zSig |= 0x40000000; 5019 aExp -= 0x3F81; 5020 } 5021 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 5022 5023 } 5024 5025 /*---------------------------------------------------------------------------- 5026 | Returns the result of converting the quadruple-precision floating-point 5027 | value `a' to the double-precision floating-point format. The conversion 5028 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5029 | Arithmetic. 5030 *----------------------------------------------------------------------------*/ 5031 5032 float64 float128_to_float64( float128 a STATUS_PARAM ) 5033 { 5034 flag aSign; 5035 int32 aExp; 5036 uint64_t aSig0, aSig1; 5037 5038 aSig1 = extractFloat128Frac1( a ); 5039 aSig0 = extractFloat128Frac0( a ); 5040 aExp = extractFloat128Exp( a ); 5041 aSign = extractFloat128Sign( a ); 5042 if ( aExp == 0x7FFF ) { 5043 if ( aSig0 | aSig1 ) { 5044 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5045 } 5046 return packFloat64( aSign, 0x7FF, 0 ); 5047 } 5048 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5049 aSig0 |= ( aSig1 != 0 ); 5050 if ( aExp || aSig0 ) { 5051 aSig0 |= LIT64( 0x4000000000000000 ); 5052 aExp -= 0x3C01; 5053 } 5054 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 5055 5056 } 5057 5058 /*---------------------------------------------------------------------------- 5059 | Returns the result of converting the quadruple-precision floating-point 5060 | value `a' to the extended double-precision floating-point format. The 5061 | conversion is performed according to the IEC/IEEE Standard for Binary 5062 | Floating-Point Arithmetic. 5063 *----------------------------------------------------------------------------*/ 5064 5065 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 5066 { 5067 flag aSign; 5068 int32 aExp; 5069 uint64_t aSig0, aSig1; 5070 5071 aSig1 = extractFloat128Frac1( a ); 5072 aSig0 = extractFloat128Frac0( a ); 5073 aExp = extractFloat128Exp( a ); 5074 aSign = extractFloat128Sign( a ); 5075 if ( aExp == 0x7FFF ) { 5076 if ( aSig0 | aSig1 ) { 5077 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5078 } 5079 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5080 } 5081 if ( aExp == 0 ) { 5082 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5083 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5084 } 5085 else { 5086 aSig0 |= LIT64( 0x0001000000000000 ); 5087 } 5088 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5089 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 5090 5091 } 5092 5093 /*---------------------------------------------------------------------------- 5094 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5095 | returns the result as a quadruple-precision floating-point value. The 5096 | operation is performed according to the IEC/IEEE Standard for Binary 5097 | Floating-Point Arithmetic. 5098 *----------------------------------------------------------------------------*/ 5099 5100 float128 float128_round_to_int( float128 a STATUS_PARAM ) 5101 { 5102 flag aSign; 5103 int32 aExp; 5104 uint64_t lastBitMask, roundBitsMask; 5105 int8 roundingMode; 5106 float128 z; 5107 5108 aExp = extractFloat128Exp( a ); 5109 if ( 0x402F <= aExp ) { 5110 if ( 0x406F <= aExp ) { 5111 if ( ( aExp == 0x7FFF ) 5112 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5113 ) { 5114 return propagateFloat128NaN( a, a STATUS_VAR ); 5115 } 5116 return a; 5117 } 5118 lastBitMask = 1; 5119 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5120 roundBitsMask = lastBitMask - 1; 5121 z = a; 5122 roundingMode = STATUS(float_rounding_mode); 5123 if ( roundingMode == float_round_nearest_even ) { 5124 if ( lastBitMask ) { 5125 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5126 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5127 } 5128 else { 5129 if ( (int64_t) z.low < 0 ) { 5130 ++z.high; 5131 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 5132 } 5133 } 5134 } 5135 else if ( roundingMode != float_round_to_zero ) { 5136 if ( extractFloat128Sign( z ) 5137 ^ ( roundingMode == float_round_up ) ) { 5138 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); 5139 } 5140 } 5141 z.low &= ~ roundBitsMask; 5142 } 5143 else { 5144 if ( aExp < 0x3FFF ) { 5145 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 5146 STATUS(float_exception_flags) |= float_flag_inexact; 5147 aSign = extractFloat128Sign( a ); 5148 switch ( STATUS(float_rounding_mode) ) { 5149 case float_round_nearest_even: 5150 if ( ( aExp == 0x3FFE ) 5151 && ( extractFloat128Frac0( a ) 5152 | extractFloat128Frac1( a ) ) 5153 ) { 5154 return packFloat128( aSign, 0x3FFF, 0, 0 ); 5155 } 5156 break; 5157 case float_round_down: 5158 return 5159 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 5160 : packFloat128( 0, 0, 0, 0 ); 5161 case float_round_up: 5162 return 5163 aSign ? packFloat128( 1, 0, 0, 0 ) 5164 : packFloat128( 0, 0x3FFF, 0, 0 ); 5165 } 5166 return packFloat128( aSign, 0, 0, 0 ); 5167 } 5168 lastBitMask = 1; 5169 lastBitMask <<= 0x402F - aExp; 5170 roundBitsMask = lastBitMask - 1; 5171 z.low = 0; 5172 z.high = a.high; 5173 roundingMode = STATUS(float_rounding_mode); 5174 if ( roundingMode == float_round_nearest_even ) { 5175 z.high += lastBitMask>>1; 5176 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 5177 z.high &= ~ lastBitMask; 5178 } 5179 } 5180 else if ( roundingMode != float_round_to_zero ) { 5181 if ( extractFloat128Sign( z ) 5182 ^ ( roundingMode == float_round_up ) ) { 5183 z.high |= ( a.low != 0 ); 5184 z.high += roundBitsMask; 5185 } 5186 } 5187 z.high &= ~ roundBitsMask; 5188 } 5189 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 5190 STATUS(float_exception_flags) |= float_flag_inexact; 5191 } 5192 return z; 5193 5194 } 5195 5196 /*---------------------------------------------------------------------------- 5197 | Returns the result of adding the absolute values of the quadruple-precision 5198 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 5199 | before being returned. `zSign' is ignored if the result is a NaN. 5200 | The addition is performed according to the IEC/IEEE Standard for Binary 5201 | Floating-Point Arithmetic. 5202 *----------------------------------------------------------------------------*/ 5203 5204 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5205 { 5206 int32 aExp, bExp, zExp; 5207 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5208 int32 expDiff; 5209 5210 aSig1 = extractFloat128Frac1( a ); 5211 aSig0 = extractFloat128Frac0( a ); 5212 aExp = extractFloat128Exp( a ); 5213 bSig1 = extractFloat128Frac1( b ); 5214 bSig0 = extractFloat128Frac0( b ); 5215 bExp = extractFloat128Exp( b ); 5216 expDiff = aExp - bExp; 5217 if ( 0 < expDiff ) { 5218 if ( aExp == 0x7FFF ) { 5219 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5220 return a; 5221 } 5222 if ( bExp == 0 ) { 5223 --expDiff; 5224 } 5225 else { 5226 bSig0 |= LIT64( 0x0001000000000000 ); 5227 } 5228 shift128ExtraRightJamming( 5229 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 5230 zExp = aExp; 5231 } 5232 else if ( expDiff < 0 ) { 5233 if ( bExp == 0x7FFF ) { 5234 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5235 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5236 } 5237 if ( aExp == 0 ) { 5238 ++expDiff; 5239 } 5240 else { 5241 aSig0 |= LIT64( 0x0001000000000000 ); 5242 } 5243 shift128ExtraRightJamming( 5244 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 5245 zExp = bExp; 5246 } 5247 else { 5248 if ( aExp == 0x7FFF ) { 5249 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5250 return propagateFloat128NaN( a, b STATUS_VAR ); 5251 } 5252 return a; 5253 } 5254 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5255 if ( aExp == 0 ) { 5256 if (STATUS(flush_to_zero)) { 5257 if (zSig0 | zSig1) { 5258 float_raise(float_flag_output_denormal STATUS_VAR); 5259 } 5260 return packFloat128(zSign, 0, 0, 0); 5261 } 5262 return packFloat128( zSign, 0, zSig0, zSig1 ); 5263 } 5264 zSig2 = 0; 5265 zSig0 |= LIT64( 0x0002000000000000 ); 5266 zExp = aExp; 5267 goto shiftRight1; 5268 } 5269 aSig0 |= LIT64( 0x0001000000000000 ); 5270 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5271 --zExp; 5272 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 5273 ++zExp; 5274 shiftRight1: 5275 shift128ExtraRightJamming( 5276 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5277 roundAndPack: 5278 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5279 5280 } 5281 5282 /*---------------------------------------------------------------------------- 5283 | Returns the result of subtracting the absolute values of the quadruple- 5284 | precision floating-point values `a' and `b'. If `zSign' is 1, the 5285 | difference is negated before being returned. `zSign' is ignored if the 5286 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5287 | Standard for Binary Floating-Point Arithmetic. 5288 *----------------------------------------------------------------------------*/ 5289 5290 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 5291 { 5292 int32 aExp, bExp, zExp; 5293 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 5294 int32 expDiff; 5295 float128 z; 5296 5297 aSig1 = extractFloat128Frac1( a ); 5298 aSig0 = extractFloat128Frac0( a ); 5299 aExp = extractFloat128Exp( a ); 5300 bSig1 = extractFloat128Frac1( b ); 5301 bSig0 = extractFloat128Frac0( b ); 5302 bExp = extractFloat128Exp( b ); 5303 expDiff = aExp - bExp; 5304 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5305 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 5306 if ( 0 < expDiff ) goto aExpBigger; 5307 if ( expDiff < 0 ) goto bExpBigger; 5308 if ( aExp == 0x7FFF ) { 5309 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 5310 return propagateFloat128NaN( a, b STATUS_VAR ); 5311 } 5312 float_raise( float_flag_invalid STATUS_VAR); 5313 z.low = float128_default_nan_low; 5314 z.high = float128_default_nan_high; 5315 return z; 5316 } 5317 if ( aExp == 0 ) { 5318 aExp = 1; 5319 bExp = 1; 5320 } 5321 if ( bSig0 < aSig0 ) goto aBigger; 5322 if ( aSig0 < bSig0 ) goto bBigger; 5323 if ( bSig1 < aSig1 ) goto aBigger; 5324 if ( aSig1 < bSig1 ) goto bBigger; 5325 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 5326 bExpBigger: 5327 if ( bExp == 0x7FFF ) { 5328 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5329 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 5330 } 5331 if ( aExp == 0 ) { 5332 ++expDiff; 5333 } 5334 else { 5335 aSig0 |= LIT64( 0x4000000000000000 ); 5336 } 5337 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5338 bSig0 |= LIT64( 0x4000000000000000 ); 5339 bBigger: 5340 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5341 zExp = bExp; 5342 zSign ^= 1; 5343 goto normalizeRoundAndPack; 5344 aExpBigger: 5345 if ( aExp == 0x7FFF ) { 5346 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5347 return a; 5348 } 5349 if ( bExp == 0 ) { 5350 --expDiff; 5351 } 5352 else { 5353 bSig0 |= LIT64( 0x4000000000000000 ); 5354 } 5355 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 5356 aSig0 |= LIT64( 0x4000000000000000 ); 5357 aBigger: 5358 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 5359 zExp = aExp; 5360 normalizeRoundAndPack: 5361 --zExp; 5362 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 5363 5364 } 5365 5366 /*---------------------------------------------------------------------------- 5367 | Returns the result of adding the quadruple-precision floating-point values 5368 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 5369 | for Binary Floating-Point Arithmetic. 5370 *----------------------------------------------------------------------------*/ 5371 5372 float128 float128_add( float128 a, float128 b STATUS_PARAM ) 5373 { 5374 flag aSign, bSign; 5375 5376 aSign = extractFloat128Sign( a ); 5377 bSign = extractFloat128Sign( b ); 5378 if ( aSign == bSign ) { 5379 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5380 } 5381 else { 5382 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5383 } 5384 5385 } 5386 5387 /*---------------------------------------------------------------------------- 5388 | Returns the result of subtracting the quadruple-precision floating-point 5389 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5390 | Standard for Binary Floating-Point Arithmetic. 5391 *----------------------------------------------------------------------------*/ 5392 5393 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 5394 { 5395 flag aSign, bSign; 5396 5397 aSign = extractFloat128Sign( a ); 5398 bSign = extractFloat128Sign( b ); 5399 if ( aSign == bSign ) { 5400 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 5401 } 5402 else { 5403 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 5404 } 5405 5406 } 5407 5408 /*---------------------------------------------------------------------------- 5409 | Returns the result of multiplying the quadruple-precision floating-point 5410 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5411 | Standard for Binary Floating-Point Arithmetic. 5412 *----------------------------------------------------------------------------*/ 5413 5414 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 5415 { 5416 flag aSign, bSign, zSign; 5417 int32 aExp, bExp, zExp; 5418 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 5419 float128 z; 5420 5421 aSig1 = extractFloat128Frac1( a ); 5422 aSig0 = extractFloat128Frac0( a ); 5423 aExp = extractFloat128Exp( a ); 5424 aSign = extractFloat128Sign( a ); 5425 bSig1 = extractFloat128Frac1( b ); 5426 bSig0 = extractFloat128Frac0( b ); 5427 bExp = extractFloat128Exp( b ); 5428 bSign = extractFloat128Sign( b ); 5429 zSign = aSign ^ bSign; 5430 if ( aExp == 0x7FFF ) { 5431 if ( ( aSig0 | aSig1 ) 5432 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5433 return propagateFloat128NaN( a, b STATUS_VAR ); 5434 } 5435 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 5436 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5437 } 5438 if ( bExp == 0x7FFF ) { 5439 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5440 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5441 invalid: 5442 float_raise( float_flag_invalid STATUS_VAR); 5443 z.low = float128_default_nan_low; 5444 z.high = float128_default_nan_high; 5445 return z; 5446 } 5447 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5448 } 5449 if ( aExp == 0 ) { 5450 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5451 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5452 } 5453 if ( bExp == 0 ) { 5454 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5455 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5456 } 5457 zExp = aExp + bExp - 0x4000; 5458 aSig0 |= LIT64( 0x0001000000000000 ); 5459 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 5460 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 5461 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 5462 zSig2 |= ( zSig3 != 0 ); 5463 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 5464 shift128ExtraRightJamming( 5465 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 5466 ++zExp; 5467 } 5468 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5469 5470 } 5471 5472 /*---------------------------------------------------------------------------- 5473 | Returns the result of dividing the quadruple-precision floating-point value 5474 | `a' by the corresponding value `b'. The operation is performed according to 5475 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5476 *----------------------------------------------------------------------------*/ 5477 5478 float128 float128_div( float128 a, float128 b STATUS_PARAM ) 5479 { 5480 flag aSign, bSign, zSign; 5481 int32 aExp, bExp, zExp; 5482 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 5483 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5484 float128 z; 5485 5486 aSig1 = extractFloat128Frac1( a ); 5487 aSig0 = extractFloat128Frac0( a ); 5488 aExp = extractFloat128Exp( a ); 5489 aSign = extractFloat128Sign( a ); 5490 bSig1 = extractFloat128Frac1( b ); 5491 bSig0 = extractFloat128Frac0( b ); 5492 bExp = extractFloat128Exp( b ); 5493 bSign = extractFloat128Sign( b ); 5494 zSign = aSign ^ bSign; 5495 if ( aExp == 0x7FFF ) { 5496 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5497 if ( bExp == 0x7FFF ) { 5498 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5499 goto invalid; 5500 } 5501 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5502 } 5503 if ( bExp == 0x7FFF ) { 5504 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5505 return packFloat128( zSign, 0, 0, 0 ); 5506 } 5507 if ( bExp == 0 ) { 5508 if ( ( bSig0 | bSig1 ) == 0 ) { 5509 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 5510 invalid: 5511 float_raise( float_flag_invalid STATUS_VAR); 5512 z.low = float128_default_nan_low; 5513 z.high = float128_default_nan_high; 5514 return z; 5515 } 5516 float_raise( float_flag_divbyzero STATUS_VAR); 5517 return packFloat128( zSign, 0x7FFF, 0, 0 ); 5518 } 5519 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5520 } 5521 if ( aExp == 0 ) { 5522 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 5523 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5524 } 5525 zExp = aExp - bExp + 0x3FFD; 5526 shortShift128Left( 5527 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 5528 shortShift128Left( 5529 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5530 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 5531 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 5532 ++zExp; 5533 } 5534 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5535 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 5536 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 5537 while ( (int64_t) rem0 < 0 ) { 5538 --zSig0; 5539 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 5540 } 5541 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 5542 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 5543 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 5544 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 5545 while ( (int64_t) rem1 < 0 ) { 5546 --zSig1; 5547 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 5548 } 5549 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5550 } 5551 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 5552 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5553 5554 } 5555 5556 /*---------------------------------------------------------------------------- 5557 | Returns the remainder of the quadruple-precision floating-point value `a' 5558 | with respect to the corresponding value `b'. The operation is performed 5559 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5560 *----------------------------------------------------------------------------*/ 5561 5562 float128 float128_rem( float128 a, float128 b STATUS_PARAM ) 5563 { 5564 flag aSign, zSign; 5565 int32 aExp, bExp, expDiff; 5566 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 5567 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 5568 int64_t sigMean0; 5569 float128 z; 5570 5571 aSig1 = extractFloat128Frac1( a ); 5572 aSig0 = extractFloat128Frac0( a ); 5573 aExp = extractFloat128Exp( a ); 5574 aSign = extractFloat128Sign( a ); 5575 bSig1 = extractFloat128Frac1( b ); 5576 bSig0 = extractFloat128Frac0( b ); 5577 bExp = extractFloat128Exp( b ); 5578 if ( aExp == 0x7FFF ) { 5579 if ( ( aSig0 | aSig1 ) 5580 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 5581 return propagateFloat128NaN( a, b STATUS_VAR ); 5582 } 5583 goto invalid; 5584 } 5585 if ( bExp == 0x7FFF ) { 5586 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 5587 return a; 5588 } 5589 if ( bExp == 0 ) { 5590 if ( ( bSig0 | bSig1 ) == 0 ) { 5591 invalid: 5592 float_raise( float_flag_invalid STATUS_VAR); 5593 z.low = float128_default_nan_low; 5594 z.high = float128_default_nan_high; 5595 return z; 5596 } 5597 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 5598 } 5599 if ( aExp == 0 ) { 5600 if ( ( aSig0 | aSig1 ) == 0 ) return a; 5601 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5602 } 5603 expDiff = aExp - bExp; 5604 if ( expDiff < -1 ) return a; 5605 shortShift128Left( 5606 aSig0 | LIT64( 0x0001000000000000 ), 5607 aSig1, 5608 15 - ( expDiff < 0 ), 5609 &aSig0, 5610 &aSig1 5611 ); 5612 shortShift128Left( 5613 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 5614 q = le128( bSig0, bSig1, aSig0, aSig1 ); 5615 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5616 expDiff -= 64; 5617 while ( 0 < expDiff ) { 5618 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5619 q = ( 4 < q ) ? q - 4 : 0; 5620 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5621 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 5622 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 5623 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 5624 expDiff -= 61; 5625 } 5626 if ( -64 < expDiff ) { 5627 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 5628 q = ( 4 < q ) ? q - 4 : 0; 5629 q >>= - expDiff; 5630 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5631 expDiff += 52; 5632 if ( expDiff < 0 ) { 5633 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 5634 } 5635 else { 5636 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 5637 } 5638 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 5639 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 5640 } 5641 else { 5642 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 5643 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 5644 } 5645 do { 5646 alternateASig0 = aSig0; 5647 alternateASig1 = aSig1; 5648 ++q; 5649 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 5650 } while ( 0 <= (int64_t) aSig0 ); 5651 add128( 5652 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 5653 if ( ( sigMean0 < 0 ) 5654 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 5655 aSig0 = alternateASig0; 5656 aSig1 = alternateASig1; 5657 } 5658 zSign = ( (int64_t) aSig0 < 0 ); 5659 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 5660 return 5661 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); 5662 5663 } 5664 5665 /*---------------------------------------------------------------------------- 5666 | Returns the square root of the quadruple-precision floating-point value `a'. 5667 | The operation is performed according to the IEC/IEEE Standard for Binary 5668 | Floating-Point Arithmetic. 5669 *----------------------------------------------------------------------------*/ 5670 5671 float128 float128_sqrt( float128 a STATUS_PARAM ) 5672 { 5673 flag aSign; 5674 int32 aExp, zExp; 5675 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 5676 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5677 float128 z; 5678 5679 aSig1 = extractFloat128Frac1( a ); 5680 aSig0 = extractFloat128Frac0( a ); 5681 aExp = extractFloat128Exp( a ); 5682 aSign = extractFloat128Sign( a ); 5683 if ( aExp == 0x7FFF ) { 5684 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); 5685 if ( ! aSign ) return a; 5686 goto invalid; 5687 } 5688 if ( aSign ) { 5689 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 5690 invalid: 5691 float_raise( float_flag_invalid STATUS_VAR); 5692 z.low = float128_default_nan_low; 5693 z.high = float128_default_nan_high; 5694 return z; 5695 } 5696 if ( aExp == 0 ) { 5697 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 5698 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5699 } 5700 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 5701 aSig0 |= LIT64( 0x0001000000000000 ); 5702 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 5703 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 5704 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5705 doubleZSig0 = zSig0<<1; 5706 mul64To128( zSig0, zSig0, &term0, &term1 ); 5707 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5708 while ( (int64_t) rem0 < 0 ) { 5709 --zSig0; 5710 doubleZSig0 -= 2; 5711 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5712 } 5713 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5714 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 5715 if ( zSig1 == 0 ) zSig1 = 1; 5716 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5717 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5718 mul64To128( zSig1, zSig1, &term2, &term3 ); 5719 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5720 while ( (int64_t) rem1 < 0 ) { 5721 --zSig1; 5722 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5723 term3 |= 1; 5724 term2 |= doubleZSig0; 5725 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5726 } 5727 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5728 } 5729 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 5730 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 5731 5732 } 5733 5734 /*---------------------------------------------------------------------------- 5735 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5736 | the corresponding value `b', and 0 otherwise. The invalid exception is 5737 | raised if either operand is a NaN. Otherwise, the comparison is performed 5738 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5739 *----------------------------------------------------------------------------*/ 5740 5741 int float128_eq( float128 a, float128 b STATUS_PARAM ) 5742 { 5743 5744 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5745 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5746 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5747 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5748 ) { 5749 float_raise( float_flag_invalid STATUS_VAR); 5750 return 0; 5751 } 5752 return 5753 ( a.low == b.low ) 5754 && ( ( a.high == b.high ) 5755 || ( ( a.low == 0 ) 5756 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5757 ); 5758 5759 } 5760 5761 /*---------------------------------------------------------------------------- 5762 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5763 | or equal to the corresponding value `b', and 0 otherwise. The invalid 5764 | exception is raised if either operand is a NaN. The comparison is performed 5765 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5766 *----------------------------------------------------------------------------*/ 5767 5768 int float128_le( float128 a, float128 b STATUS_PARAM ) 5769 { 5770 flag aSign, bSign; 5771 5772 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5773 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5774 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5775 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5776 ) { 5777 float_raise( float_flag_invalid STATUS_VAR); 5778 return 0; 5779 } 5780 aSign = extractFloat128Sign( a ); 5781 bSign = extractFloat128Sign( b ); 5782 if ( aSign != bSign ) { 5783 return 5784 aSign 5785 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5786 == 0 ); 5787 } 5788 return 5789 aSign ? le128( b.high, b.low, a.high, a.low ) 5790 : le128( a.high, a.low, b.high, b.low ); 5791 5792 } 5793 5794 /*---------------------------------------------------------------------------- 5795 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5796 | the corresponding value `b', and 0 otherwise. The invalid exception is 5797 | raised if either operand is a NaN. The comparison is performed according 5798 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5799 *----------------------------------------------------------------------------*/ 5800 5801 int float128_lt( float128 a, float128 b STATUS_PARAM ) 5802 { 5803 flag aSign, bSign; 5804 5805 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5806 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5807 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5808 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5809 ) { 5810 float_raise( float_flag_invalid STATUS_VAR); 5811 return 0; 5812 } 5813 aSign = extractFloat128Sign( a ); 5814 bSign = extractFloat128Sign( b ); 5815 if ( aSign != bSign ) { 5816 return 5817 aSign 5818 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5819 != 0 ); 5820 } 5821 return 5822 aSign ? lt128( b.high, b.low, a.high, a.low ) 5823 : lt128( a.high, a.low, b.high, b.low ); 5824 5825 } 5826 5827 /*---------------------------------------------------------------------------- 5828 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 5829 | be compared, and 0 otherwise. The invalid exception is raised if either 5830 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5831 | Standard for Binary Floating-Point Arithmetic. 5832 *----------------------------------------------------------------------------*/ 5833 5834 int float128_unordered( float128 a, float128 b STATUS_PARAM ) 5835 { 5836 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5837 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5838 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5839 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5840 ) { 5841 float_raise( float_flag_invalid STATUS_VAR); 5842 return 1; 5843 } 5844 return 0; 5845 } 5846 5847 /*---------------------------------------------------------------------------- 5848 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 5849 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5850 | exception. The comparison is performed according to the IEC/IEEE Standard 5851 | for Binary Floating-Point Arithmetic. 5852 *----------------------------------------------------------------------------*/ 5853 5854 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM ) 5855 { 5856 5857 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5858 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5859 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5860 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5861 ) { 5862 if ( float128_is_signaling_nan( a ) 5863 || float128_is_signaling_nan( b ) ) { 5864 float_raise( float_flag_invalid STATUS_VAR); 5865 } 5866 return 0; 5867 } 5868 return 5869 ( a.low == b.low ) 5870 && ( ( a.high == b.high ) 5871 || ( ( a.low == 0 ) 5872 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5873 ); 5874 5875 } 5876 5877 /*---------------------------------------------------------------------------- 5878 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5879 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5880 | cause an exception. Otherwise, the comparison is performed according to the 5881 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5882 *----------------------------------------------------------------------------*/ 5883 5884 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 5885 { 5886 flag aSign, bSign; 5887 5888 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5889 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5890 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5891 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5892 ) { 5893 if ( float128_is_signaling_nan( a ) 5894 || float128_is_signaling_nan( b ) ) { 5895 float_raise( float_flag_invalid STATUS_VAR); 5896 } 5897 return 0; 5898 } 5899 aSign = extractFloat128Sign( a ); 5900 bSign = extractFloat128Sign( b ); 5901 if ( aSign != bSign ) { 5902 return 5903 aSign 5904 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5905 == 0 ); 5906 } 5907 return 5908 aSign ? le128( b.high, b.low, a.high, a.low ) 5909 : le128( a.high, a.low, b.high, b.low ); 5910 5911 } 5912 5913 /*---------------------------------------------------------------------------- 5914 | Returns 1 if the quadruple-precision floating-point value `a' is less than 5915 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5916 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5917 | Standard for Binary Floating-Point Arithmetic. 5918 *----------------------------------------------------------------------------*/ 5919 5920 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 5921 { 5922 flag aSign, bSign; 5923 5924 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5925 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5926 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5927 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5928 ) { 5929 if ( float128_is_signaling_nan( a ) 5930 || float128_is_signaling_nan( b ) ) { 5931 float_raise( float_flag_invalid STATUS_VAR); 5932 } 5933 return 0; 5934 } 5935 aSign = extractFloat128Sign( a ); 5936 bSign = extractFloat128Sign( b ); 5937 if ( aSign != bSign ) { 5938 return 5939 aSign 5940 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5941 != 0 ); 5942 } 5943 return 5944 aSign ? lt128( b.high, b.low, a.high, a.low ) 5945 : lt128( a.high, a.low, b.high, b.low ); 5946 5947 } 5948 5949 /*---------------------------------------------------------------------------- 5950 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 5951 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5952 | comparison is performed according to the IEC/IEEE Standard for Binary 5953 | Floating-Point Arithmetic. 5954 *----------------------------------------------------------------------------*/ 5955 5956 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM ) 5957 { 5958 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 5959 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 5960 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 5961 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 5962 ) { 5963 if ( float128_is_signaling_nan( a ) 5964 || float128_is_signaling_nan( b ) ) { 5965 float_raise( float_flag_invalid STATUS_VAR); 5966 } 5967 return 1; 5968 } 5969 return 0; 5970 } 5971 5972 /* misc functions */ 5973 float32 uint32_to_float32( uint32 a STATUS_PARAM ) 5974 { 5975 return int64_to_float32(a STATUS_VAR); 5976 } 5977 5978 float64 uint32_to_float64( uint32 a STATUS_PARAM ) 5979 { 5980 return int64_to_float64(a STATUS_VAR); 5981 } 5982 5983 uint32 float32_to_uint32( float32 a STATUS_PARAM ) 5984 { 5985 int64_t v; 5986 uint32 res; 5987 5988 v = float32_to_int64(a STATUS_VAR); 5989 if (v < 0) { 5990 res = 0; 5991 float_raise( float_flag_invalid STATUS_VAR); 5992 } else if (v > 0xffffffff) { 5993 res = 0xffffffff; 5994 float_raise( float_flag_invalid STATUS_VAR); 5995 } else { 5996 res = v; 5997 } 5998 return res; 5999 } 6000 6001 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 6002 { 6003 int64_t v; 6004 uint32 res; 6005 6006 v = float32_to_int64_round_to_zero(a STATUS_VAR); 6007 if (v < 0) { 6008 res = 0; 6009 float_raise( float_flag_invalid STATUS_VAR); 6010 } else if (v > 0xffffffff) { 6011 res = 0xffffffff; 6012 float_raise( float_flag_invalid STATUS_VAR); 6013 } else { 6014 res = v; 6015 } 6016 return res; 6017 } 6018 6019 uint16 float32_to_uint16_round_to_zero( float32 a STATUS_PARAM ) 6020 { 6021 int64_t v; 6022 uint16 res; 6023 6024 v = float32_to_int64_round_to_zero(a STATUS_VAR); 6025 if (v < 0) { 6026 res = 0; 6027 float_raise( float_flag_invalid STATUS_VAR); 6028 } else if (v > 0xffff) { 6029 res = 0xffff; 6030 float_raise( float_flag_invalid STATUS_VAR); 6031 } else { 6032 res = v; 6033 } 6034 return res; 6035 } 6036 6037 uint32 float64_to_uint32( float64 a STATUS_PARAM ) 6038 { 6039 int64_t v; 6040 uint32 res; 6041 6042 v = float64_to_int64(a STATUS_VAR); 6043 if (v < 0) { 6044 res = 0; 6045 float_raise( float_flag_invalid STATUS_VAR); 6046 } else if (v > 0xffffffff) { 6047 res = 0xffffffff; 6048 float_raise( float_flag_invalid STATUS_VAR); 6049 } else { 6050 res = v; 6051 } 6052 return res; 6053 } 6054 6055 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 6056 { 6057 int64_t v; 6058 uint32 res; 6059 6060 v = float64_to_int64_round_to_zero(a STATUS_VAR); 6061 if (v < 0) { 6062 res = 0; 6063 float_raise( float_flag_invalid STATUS_VAR); 6064 } else if (v > 0xffffffff) { 6065 res = 0xffffffff; 6066 float_raise( float_flag_invalid STATUS_VAR); 6067 } else { 6068 res = v; 6069 } 6070 return res; 6071 } 6072 6073 uint16 float64_to_uint16_round_to_zero( float64 a STATUS_PARAM ) 6074 { 6075 int64_t v; 6076 uint16 res; 6077 6078 v = float64_to_int64_round_to_zero(a STATUS_VAR); 6079 if (v < 0) { 6080 res = 0; 6081 float_raise( float_flag_invalid STATUS_VAR); 6082 } else if (v > 0xffff) { 6083 res = 0xffff; 6084 float_raise( float_flag_invalid STATUS_VAR); 6085 } else { 6086 res = v; 6087 } 6088 return res; 6089 } 6090 6091 /* FIXME: This looks broken. */ 6092 uint64_t float64_to_uint64 (float64 a STATUS_PARAM) 6093 { 6094 int64_t v; 6095 6096 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 6097 v += float64_val(a); 6098 v = float64_to_int64(make_float64(v) STATUS_VAR); 6099 6100 return v - INT64_MIN; 6101 } 6102 6103 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 6104 { 6105 int64_t v; 6106 6107 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); 6108 v += float64_val(a); 6109 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); 6110 6111 return v - INT64_MIN; 6112 } 6113 6114 #define COMPARE(s, nan_exp) \ 6115 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ 6116 int is_quiet STATUS_PARAM ) \ 6117 { \ 6118 flag aSign, bSign; \ 6119 uint ## s ## _t av, bv; \ 6120 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 6121 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 6122 \ 6123 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 6124 extractFloat ## s ## Frac( a ) ) || \ 6125 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 6126 extractFloat ## s ## Frac( b ) )) { \ 6127 if (!is_quiet || \ 6128 float ## s ## _is_signaling_nan( a ) || \ 6129 float ## s ## _is_signaling_nan( b ) ) { \ 6130 float_raise( float_flag_invalid STATUS_VAR); \ 6131 } \ 6132 return float_relation_unordered; \ 6133 } \ 6134 aSign = extractFloat ## s ## Sign( a ); \ 6135 bSign = extractFloat ## s ## Sign( b ); \ 6136 av = float ## s ## _val(a); \ 6137 bv = float ## s ## _val(b); \ 6138 if ( aSign != bSign ) { \ 6139 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 6140 /* zero case */ \ 6141 return float_relation_equal; \ 6142 } else { \ 6143 return 1 - (2 * aSign); \ 6144 } \ 6145 } else { \ 6146 if (av == bv) { \ 6147 return float_relation_equal; \ 6148 } else { \ 6149 return 1 - 2 * (aSign ^ ( av < bv )); \ 6150 } \ 6151 } \ 6152 } \ 6153 \ 6154 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 6155 { \ 6156 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 6157 } \ 6158 \ 6159 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 6160 { \ 6161 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 6162 } 6163 6164 COMPARE(32, 0xff) 6165 COMPARE(64, 0x7ff) 6166 6167 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b, 6168 int is_quiet STATUS_PARAM ) 6169 { 6170 flag aSign, bSign; 6171 6172 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6173 ( extractFloatx80Frac( a )<<1 ) ) || 6174 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6175 ( extractFloatx80Frac( b )<<1 ) )) { 6176 if (!is_quiet || 6177 floatx80_is_signaling_nan( a ) || 6178 floatx80_is_signaling_nan( b ) ) { 6179 float_raise( float_flag_invalid STATUS_VAR); 6180 } 6181 return float_relation_unordered; 6182 } 6183 aSign = extractFloatx80Sign( a ); 6184 bSign = extractFloatx80Sign( b ); 6185 if ( aSign != bSign ) { 6186 6187 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6188 ( ( a.low | b.low ) == 0 ) ) { 6189 /* zero case */ 6190 return float_relation_equal; 6191 } else { 6192 return 1 - (2 * aSign); 6193 } 6194 } else { 6195 if (a.low == b.low && a.high == b.high) { 6196 return float_relation_equal; 6197 } else { 6198 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6199 } 6200 } 6201 } 6202 6203 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM ) 6204 { 6205 return floatx80_compare_internal(a, b, 0 STATUS_VAR); 6206 } 6207 6208 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 6209 { 6210 return floatx80_compare_internal(a, b, 1 STATUS_VAR); 6211 } 6212 6213 INLINE int float128_compare_internal( float128 a, float128 b, 6214 int is_quiet STATUS_PARAM ) 6215 { 6216 flag aSign, bSign; 6217 6218 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6219 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6220 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6221 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6222 if (!is_quiet || 6223 float128_is_signaling_nan( a ) || 6224 float128_is_signaling_nan( b ) ) { 6225 float_raise( float_flag_invalid STATUS_VAR); 6226 } 6227 return float_relation_unordered; 6228 } 6229 aSign = extractFloat128Sign( a ); 6230 bSign = extractFloat128Sign( b ); 6231 if ( aSign != bSign ) { 6232 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6233 /* zero case */ 6234 return float_relation_equal; 6235 } else { 6236 return 1 - (2 * aSign); 6237 } 6238 } else { 6239 if (a.low == b.low && a.high == b.high) { 6240 return float_relation_equal; 6241 } else { 6242 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6243 } 6244 } 6245 } 6246 6247 int float128_compare( float128 a, float128 b STATUS_PARAM ) 6248 { 6249 return float128_compare_internal(a, b, 0 STATUS_VAR); 6250 } 6251 6252 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 6253 { 6254 return float128_compare_internal(a, b, 1 STATUS_VAR); 6255 } 6256 6257 /* min() and max() functions. These can't be implemented as 6258 * 'compare and pick one input' because that would mishandle 6259 * NaNs and +0 vs -0. 6260 */ 6261 #define MINMAX(s, nan_exp) \ 6262 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 6263 int ismin STATUS_PARAM ) \ 6264 { \ 6265 flag aSign, bSign; \ 6266 uint ## s ## _t av, bv; \ 6267 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 6268 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 6269 if (float ## s ## _is_any_nan(a) || \ 6270 float ## s ## _is_any_nan(b)) { \ 6271 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \ 6272 } \ 6273 aSign = extractFloat ## s ## Sign(a); \ 6274 bSign = extractFloat ## s ## Sign(b); \ 6275 av = float ## s ## _val(a); \ 6276 bv = float ## s ## _val(b); \ 6277 if (aSign != bSign) { \ 6278 if (ismin) { \ 6279 return aSign ? a : b; \ 6280 } else { \ 6281 return aSign ? b : a; \ 6282 } \ 6283 } else { \ 6284 if (ismin) { \ 6285 return (aSign ^ (av < bv)) ? a : b; \ 6286 } else { \ 6287 return (aSign ^ (av < bv)) ? b : a; \ 6288 } \ 6289 } \ 6290 } \ 6291 \ 6292 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \ 6293 { \ 6294 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \ 6295 } \ 6296 \ 6297 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \ 6298 { \ 6299 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \ 6300 } 6301 6302 MINMAX(32, 0xff) 6303 MINMAX(64, 0x7ff) 6304 6305 6306 /* Multiply A by 2 raised to the power N. */ 6307 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 6308 { 6309 flag aSign; 6310 int16_t aExp; 6311 uint32_t aSig; 6312 6313 a = float32_squash_input_denormal(a STATUS_VAR); 6314 aSig = extractFloat32Frac( a ); 6315 aExp = extractFloat32Exp( a ); 6316 aSign = extractFloat32Sign( a ); 6317 6318 if ( aExp == 0xFF ) { 6319 if ( aSig ) { 6320 return propagateFloat32NaN( a, a STATUS_VAR ); 6321 } 6322 return a; 6323 } 6324 if ( aExp != 0 ) 6325 aSig |= 0x00800000; 6326 else if ( aSig == 0 ) 6327 return a; 6328 6329 if (n > 0x200) { 6330 n = 0x200; 6331 } else if (n < -0x200) { 6332 n = -0x200; 6333 } 6334 6335 aExp += n - 1; 6336 aSig <<= 7; 6337 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 6338 } 6339 6340 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 6341 { 6342 flag aSign; 6343 int16_t aExp; 6344 uint64_t aSig; 6345 6346 a = float64_squash_input_denormal(a STATUS_VAR); 6347 aSig = extractFloat64Frac( a ); 6348 aExp = extractFloat64Exp( a ); 6349 aSign = extractFloat64Sign( a ); 6350 6351 if ( aExp == 0x7FF ) { 6352 if ( aSig ) { 6353 return propagateFloat64NaN( a, a STATUS_VAR ); 6354 } 6355 return a; 6356 } 6357 if ( aExp != 0 ) 6358 aSig |= LIT64( 0x0010000000000000 ); 6359 else if ( aSig == 0 ) 6360 return a; 6361 6362 if (n > 0x1000) { 6363 n = 0x1000; 6364 } else if (n < -0x1000) { 6365 n = -0x1000; 6366 } 6367 6368 aExp += n - 1; 6369 aSig <<= 10; 6370 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 6371 } 6372 6373 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 6374 { 6375 flag aSign; 6376 int32_t aExp; 6377 uint64_t aSig; 6378 6379 aSig = extractFloatx80Frac( a ); 6380 aExp = extractFloatx80Exp( a ); 6381 aSign = extractFloatx80Sign( a ); 6382 6383 if ( aExp == 0x7FFF ) { 6384 if ( aSig<<1 ) { 6385 return propagateFloatx80NaN( a, a STATUS_VAR ); 6386 } 6387 return a; 6388 } 6389 6390 if (aExp == 0 && aSig == 0) 6391 return a; 6392 6393 if (n > 0x10000) { 6394 n = 0x10000; 6395 } else if (n < -0x10000) { 6396 n = -0x10000; 6397 } 6398 6399 aExp += n; 6400 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), 6401 aSign, aExp, aSig, 0 STATUS_VAR ); 6402 } 6403 6404 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 6405 { 6406 flag aSign; 6407 int32_t aExp; 6408 uint64_t aSig0, aSig1; 6409 6410 aSig1 = extractFloat128Frac1( a ); 6411 aSig0 = extractFloat128Frac0( a ); 6412 aExp = extractFloat128Exp( a ); 6413 aSign = extractFloat128Sign( a ); 6414 if ( aExp == 0x7FFF ) { 6415 if ( aSig0 | aSig1 ) { 6416 return propagateFloat128NaN( a, a STATUS_VAR ); 6417 } 6418 return a; 6419 } 6420 if ( aExp != 0 ) 6421 aSig0 |= LIT64( 0x0001000000000000 ); 6422 else if ( aSig0 == 0 && aSig1 == 0 ) 6423 return a; 6424 6425 if (n > 0x10000) { 6426 n = 0x10000; 6427 } else if (n < -0x10000) { 6428 n = -0x10000; 6429 } 6430 6431 aExp += n - 1; 6432 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 6433 STATUS_VAR ); 6434 6435 } 6436