/*
 * QEMU float support
 *
 * Derived from SoftFloat.
 */

/*============================================================================

This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
Package, Release 2b.

Written by John R. Hauser.  This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704.  Funding was partially provided by the
National Science Foundation under grant MIP-9311980.  The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) the source code for the derivative work includes prominent notice that
the work is derivative, and (2) the source code includes prominent notice with
these four paragraphs for those parts of this code that are retained.
35 36 =============================================================================*/ 37 38 /* softfloat (and in particular the code in softfloat-specialize.h) is 39 * target-dependent and needs the TARGET_* macros. 40 */ 41 #include "config.h" 42 43 #include "fpu/softfloat.h" 44 45 /* We only need stdlib for abort() */ 46 #include <stdlib.h> 47 48 /*---------------------------------------------------------------------------- 49 | Primitive arithmetic functions, including multi-word arithmetic, and 50 | division and square root approximations. (Can be specialized to target if 51 | desired.) 52 *----------------------------------------------------------------------------*/ 53 #include "softfloat-macros.h" 54 55 /*---------------------------------------------------------------------------- 56 | Functions and definitions to determine: (1) whether tininess for underflow 57 | is detected before or after rounding by default, (2) what (if anything) 58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 60 | are propagated from function inputs to output. These details are target- 61 | specific. 62 *----------------------------------------------------------------------------*/ 63 #include "softfloat-specialize.h" 64 65 /*---------------------------------------------------------------------------- 66 | Returns the fraction bits of the half-precision floating-point value `a'. 67 *----------------------------------------------------------------------------*/ 68 69 INLINE uint32_t extractFloat16Frac(float16 a) 70 { 71 return float16_val(a) & 0x3ff; 72 } 73 74 /*---------------------------------------------------------------------------- 75 | Returns the exponent bits of the half-precision floating-point value `a'. 
76 *----------------------------------------------------------------------------*/ 77 78 INLINE int_fast16_t extractFloat16Exp(float16 a) 79 { 80 return (float16_val(a) >> 10) & 0x1f; 81 } 82 83 /*---------------------------------------------------------------------------- 84 | Returns the sign bit of the single-precision floating-point value `a'. 85 *----------------------------------------------------------------------------*/ 86 87 INLINE flag extractFloat16Sign(float16 a) 88 { 89 return float16_val(a)>>15; 90 } 91 92 /*---------------------------------------------------------------------------- 93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 94 | and 7, and returns the properly rounded 32-bit integer corresponding to the 95 | input. If `zSign' is 1, the input is negated before being converted to an 96 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 97 | is simply rounded to an integer, with the inexact exception raised if the 98 | input cannot be represented exactly as an integer. However, if the fixed- 99 | point input is too large, the invalid exception is raised and the largest 100 | positive or negative integer is returned. 101 *----------------------------------------------------------------------------*/ 102 103 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM) 104 { 105 int8 roundingMode; 106 flag roundNearestEven; 107 int8 roundIncrement, roundBits; 108 int32_t z; 109 110 roundingMode = STATUS(float_rounding_mode); 111 roundNearestEven = ( roundingMode == float_round_nearest_even ); 112 switch (roundingMode) { 113 case float_round_nearest_even: 114 case float_round_ties_away: 115 roundIncrement = 0x40; 116 break; 117 case float_round_to_zero: 118 roundIncrement = 0; 119 break; 120 case float_round_up: 121 roundIncrement = zSign ? 0 : 0x7f; 122 break; 123 case float_round_down: 124 roundIncrement = zSign ? 
0x7f : 0; 125 break; 126 default: 127 abort(); 128 } 129 roundBits = absZ & 0x7F; 130 absZ = ( absZ + roundIncrement )>>7; 131 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 132 z = absZ; 133 if ( zSign ) z = - z; 134 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 135 float_raise( float_flag_invalid STATUS_VAR); 136 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 137 } 138 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 139 return z; 140 141 } 142 143 /*---------------------------------------------------------------------------- 144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 145 | `absZ1', with binary point between bits 63 and 64 (between the input words), 146 | and returns the properly rounded 64-bit integer corresponding to the input. 147 | If `zSign' is 1, the input is negated before being converted to an integer. 148 | Ordinarily, the fixed-point input is simply rounded to an integer, with 149 | the inexact exception raised if the input cannot be represented exactly as 150 | an integer. However, if the fixed-point input is too large, the invalid 151 | exception is raised and the largest positive or negative integer is 152 | returned. 
153 *----------------------------------------------------------------------------*/ 154 155 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM) 156 { 157 int8 roundingMode; 158 flag roundNearestEven, increment; 159 int64_t z; 160 161 roundingMode = STATUS(float_rounding_mode); 162 roundNearestEven = ( roundingMode == float_round_nearest_even ); 163 switch (roundingMode) { 164 case float_round_nearest_even: 165 case float_round_ties_away: 166 increment = ((int64_t) absZ1 < 0); 167 break; 168 case float_round_to_zero: 169 increment = 0; 170 break; 171 case float_round_up: 172 increment = !zSign && absZ1; 173 break; 174 case float_round_down: 175 increment = zSign && absZ1; 176 break; 177 default: 178 abort(); 179 } 180 if ( increment ) { 181 ++absZ0; 182 if ( absZ0 == 0 ) goto overflow; 183 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 184 } 185 z = absZ0; 186 if ( zSign ) z = - z; 187 if ( z && ( ( z < 0 ) ^ zSign ) ) { 188 overflow: 189 float_raise( float_flag_invalid STATUS_VAR); 190 return 191 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 192 : LIT64( 0x7FFFFFFFFFFFFFFF ); 193 } 194 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; 195 return z; 196 197 } 198 199 /*---------------------------------------------------------------------------- 200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 201 | `absZ1', with binary point between bits 63 and 64 (between the input words), 202 | and returns the properly rounded 64-bit unsigned integer corresponding to the 203 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 204 | with the inexact exception raised if the input cannot be represented exactly 205 | as an integer. However, if the fixed-point input is too large, the invalid 206 | exception is raised and the largest unsigned integer is returned. 
207 *----------------------------------------------------------------------------*/ 208 209 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0, 210 uint64_t absZ1 STATUS_PARAM) 211 { 212 int8 roundingMode; 213 flag roundNearestEven, increment; 214 215 roundingMode = STATUS(float_rounding_mode); 216 roundNearestEven = (roundingMode == float_round_nearest_even); 217 switch (roundingMode) { 218 case float_round_nearest_even: 219 case float_round_ties_away: 220 increment = ((int64_t)absZ1 < 0); 221 break; 222 case float_round_to_zero: 223 increment = 0; 224 break; 225 case float_round_up: 226 increment = !zSign && absZ1; 227 break; 228 case float_round_down: 229 increment = zSign && absZ1; 230 break; 231 default: 232 abort(); 233 } 234 if (increment) { 235 ++absZ0; 236 if (absZ0 == 0) { 237 float_raise(float_flag_invalid STATUS_VAR); 238 return LIT64(0xFFFFFFFFFFFFFFFF); 239 } 240 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 241 } 242 243 if (zSign && absZ0) { 244 float_raise(float_flag_invalid STATUS_VAR); 245 return 0; 246 } 247 248 if (absZ1) { 249 STATUS(float_exception_flags) |= float_flag_inexact; 250 } 251 return absZ0; 252 } 253 254 /*---------------------------------------------------------------------------- 255 | Returns the fraction bits of the single-precision floating-point value `a'. 256 *----------------------------------------------------------------------------*/ 257 258 INLINE uint32_t extractFloat32Frac( float32 a ) 259 { 260 261 return float32_val(a) & 0x007FFFFF; 262 263 } 264 265 /*---------------------------------------------------------------------------- 266 | Returns the exponent bits of the single-precision floating-point value `a'. 
267 *----------------------------------------------------------------------------*/ 268 269 INLINE int_fast16_t extractFloat32Exp(float32 a) 270 { 271 272 return ( float32_val(a)>>23 ) & 0xFF; 273 274 } 275 276 /*---------------------------------------------------------------------------- 277 | Returns the sign bit of the single-precision floating-point value `a'. 278 *----------------------------------------------------------------------------*/ 279 280 INLINE flag extractFloat32Sign( float32 a ) 281 { 282 283 return float32_val(a)>>31; 284 285 } 286 287 /*---------------------------------------------------------------------------- 288 | If `a' is denormal and we are in flush-to-zero mode then set the 289 | input-denormal exception and return zero. Otherwise just return the value. 290 *----------------------------------------------------------------------------*/ 291 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM) 292 { 293 if (STATUS(flush_inputs_to_zero)) { 294 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 295 float_raise(float_flag_input_denormal STATUS_VAR); 296 return make_float32(float32_val(a) & 0x80000000); 297 } 298 } 299 return a; 300 } 301 302 /*---------------------------------------------------------------------------- 303 | Normalizes the subnormal single-precision floating-point value represented 304 | by the denormalized significand `aSig'. The normalized exponent and 305 | significand are stored at the locations pointed to by `zExpPtr' and 306 | `zSigPtr', respectively. 
307 *----------------------------------------------------------------------------*/ 308 309 static void 310 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr) 311 { 312 int8 shiftCount; 313 314 shiftCount = countLeadingZeros32( aSig ) - 8; 315 *zSigPtr = aSig<<shiftCount; 316 *zExpPtr = 1 - shiftCount; 317 318 } 319 320 /*---------------------------------------------------------------------------- 321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 322 | single-precision floating-point value, returning the result. After being 323 | shifted into the proper positions, the three fields are simply added 324 | together to form the result. This means that any integer portion of `zSig' 325 | will be added into the exponent. Since a properly normalized significand 326 | will have an integer portion equal to 1, the `zExp' input should be 1 less 327 | than the desired result exponent whenever `zSig' is a complete, normalized 328 | significand. 329 *----------------------------------------------------------------------------*/ 330 331 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig) 332 { 333 334 return make_float32( 335 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 336 337 } 338 339 /*---------------------------------------------------------------------------- 340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 341 | and significand `zSig', and returns the proper single-precision floating- 342 | point value corresponding to the abstract input. Ordinarily, the abstract 343 | value is simply rounded and packed into the single-precision format, with 344 | the inexact exception raised if the abstract input cannot be represented 345 | exactly. However, if the abstract value is too large, the overflow and 346 | inexact exceptions are raised and an infinity or maximal finite value is 347 | returned. 
If the abstract value is too small, the input value is rounded to 348 | a subnormal number, and the underflow and inexact exceptions are raised if 349 | the abstract input cannot be represented exactly as a subnormal single- 350 | precision floating-point number. 351 | The input significand `zSig' has its binary point between bits 30 352 | and 29, which is 7 bits to the left of the usual location. This shifted 353 | significand must be normalized or smaller. If `zSig' is not normalized, 354 | `zExp' must be 0; in that case, the result returned is a subnormal number, 355 | and it must not require rounding. In the usual case that `zSig' is 356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 357 | The handling of underflow and overflow follows the IEC/IEEE Standard for 358 | Binary Floating-Point Arithmetic. 359 *----------------------------------------------------------------------------*/ 360 361 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM) 362 { 363 int8 roundingMode; 364 flag roundNearestEven; 365 int8 roundIncrement, roundBits; 366 flag isTiny; 367 368 roundingMode = STATUS(float_rounding_mode); 369 roundNearestEven = ( roundingMode == float_round_nearest_even ); 370 switch (roundingMode) { 371 case float_round_nearest_even: 372 case float_round_ties_away: 373 roundIncrement = 0x40; 374 break; 375 case float_round_to_zero: 376 roundIncrement = 0; 377 break; 378 case float_round_up: 379 roundIncrement = zSign ? 0 : 0x7f; 380 break; 381 case float_round_down: 382 roundIncrement = zSign ? 
0x7f : 0; 383 break; 384 default: 385 abort(); 386 break; 387 } 388 roundBits = zSig & 0x7F; 389 if ( 0xFD <= (uint16_t) zExp ) { 390 if ( ( 0xFD < zExp ) 391 || ( ( zExp == 0xFD ) 392 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 393 ) { 394 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 395 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 396 } 397 if ( zExp < 0 ) { 398 if (STATUS(flush_to_zero)) { 399 float_raise(float_flag_output_denormal STATUS_VAR); 400 return packFloat32(zSign, 0, 0); 401 } 402 isTiny = 403 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 404 || ( zExp < -1 ) 405 || ( zSig + roundIncrement < 0x80000000 ); 406 shift32RightJamming( zSig, - zExp, &zSig ); 407 zExp = 0; 408 roundBits = zSig & 0x7F; 409 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 410 } 411 } 412 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 413 zSig = ( zSig + roundIncrement )>>7; 414 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 415 if ( zSig == 0 ) zExp = 0; 416 return packFloat32( zSign, zExp, zSig ); 417 418 } 419 420 /*---------------------------------------------------------------------------- 421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 422 | and significand `zSig', and returns the proper single-precision floating- 423 | point value corresponding to the abstract input. This routine is just like 424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 426 | floating-point exponent. 
427 *----------------------------------------------------------------------------*/ 428 429 static float32 430 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM) 431 { 432 int8 shiftCount; 433 434 shiftCount = countLeadingZeros32( zSig ) - 1; 435 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 436 437 } 438 439 /*---------------------------------------------------------------------------- 440 | Returns the fraction bits of the double-precision floating-point value `a'. 441 *----------------------------------------------------------------------------*/ 442 443 INLINE uint64_t extractFloat64Frac( float64 a ) 444 { 445 446 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 447 448 } 449 450 /*---------------------------------------------------------------------------- 451 | Returns the exponent bits of the double-precision floating-point value `a'. 452 *----------------------------------------------------------------------------*/ 453 454 INLINE int_fast16_t extractFloat64Exp(float64 a) 455 { 456 457 return ( float64_val(a)>>52 ) & 0x7FF; 458 459 } 460 461 /*---------------------------------------------------------------------------- 462 | Returns the sign bit of the double-precision floating-point value `a'. 463 *----------------------------------------------------------------------------*/ 464 465 INLINE flag extractFloat64Sign( float64 a ) 466 { 467 468 return float64_val(a)>>63; 469 470 } 471 472 /*---------------------------------------------------------------------------- 473 | If `a' is denormal and we are in flush-to-zero mode then set the 474 | input-denormal exception and return zero. Otherwise just return the value. 
475 *----------------------------------------------------------------------------*/ 476 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM) 477 { 478 if (STATUS(flush_inputs_to_zero)) { 479 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 480 float_raise(float_flag_input_denormal STATUS_VAR); 481 return make_float64(float64_val(a) & (1ULL << 63)); 482 } 483 } 484 return a; 485 } 486 487 /*---------------------------------------------------------------------------- 488 | Normalizes the subnormal double-precision floating-point value represented 489 | by the denormalized significand `aSig'. The normalized exponent and 490 | significand are stored at the locations pointed to by `zExpPtr' and 491 | `zSigPtr', respectively. 492 *----------------------------------------------------------------------------*/ 493 494 static void 495 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr) 496 { 497 int8 shiftCount; 498 499 shiftCount = countLeadingZeros64( aSig ) - 11; 500 *zSigPtr = aSig<<shiftCount; 501 *zExpPtr = 1 - shiftCount; 502 503 } 504 505 /*---------------------------------------------------------------------------- 506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 507 | double-precision floating-point value, returning the result. After being 508 | shifted into the proper positions, the three fields are simply added 509 | together to form the result. This means that any integer portion of `zSig' 510 | will be added into the exponent. Since a properly normalized significand 511 | will have an integer portion equal to 1, the `zExp' input should be 1 less 512 | than the desired result exponent whenever `zSig' is a complete, normalized 513 | significand. 
514 *----------------------------------------------------------------------------*/ 515 516 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig) 517 { 518 519 return make_float64( 520 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 521 522 } 523 524 /*---------------------------------------------------------------------------- 525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 526 | and significand `zSig', and returns the proper double-precision floating- 527 | point value corresponding to the abstract input. Ordinarily, the abstract 528 | value is simply rounded and packed into the double-precision format, with 529 | the inexact exception raised if the abstract input cannot be represented 530 | exactly. However, if the abstract value is too large, the overflow and 531 | inexact exceptions are raised and an infinity or maximal finite value is 532 | returned. If the abstract value is too small, the input value is rounded 533 | to a subnormal number, and the underflow and inexact exceptions are raised 534 | if the abstract input cannot be represented exactly as a subnormal double- 535 | precision floating-point number. 536 | The input significand `zSig' has its binary point between bits 62 537 | and 61, which is 10 bits to the left of the usual location. This shifted 538 | significand must be normalized or smaller. If `zSig' is not normalized, 539 | `zExp' must be 0; in that case, the result returned is a subnormal number, 540 | and it must not require rounding. In the usual case that `zSig' is 541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 542 | The handling of underflow and overflow follows the IEC/IEEE Standard for 543 | Binary Floating-Point Arithmetic. 
544 *----------------------------------------------------------------------------*/ 545 546 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM) 547 { 548 int8 roundingMode; 549 flag roundNearestEven; 550 int_fast16_t roundIncrement, roundBits; 551 flag isTiny; 552 553 roundingMode = STATUS(float_rounding_mode); 554 roundNearestEven = ( roundingMode == float_round_nearest_even ); 555 switch (roundingMode) { 556 case float_round_nearest_even: 557 case float_round_ties_away: 558 roundIncrement = 0x200; 559 break; 560 case float_round_to_zero: 561 roundIncrement = 0; 562 break; 563 case float_round_up: 564 roundIncrement = zSign ? 0 : 0x3ff; 565 break; 566 case float_round_down: 567 roundIncrement = zSign ? 0x3ff : 0; 568 break; 569 default: 570 abort(); 571 } 572 roundBits = zSig & 0x3FF; 573 if ( 0x7FD <= (uint16_t) zExp ) { 574 if ( ( 0x7FD < zExp ) 575 || ( ( zExp == 0x7FD ) 576 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 577 ) { 578 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 579 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 580 } 581 if ( zExp < 0 ) { 582 if (STATUS(flush_to_zero)) { 583 float_raise(float_flag_output_denormal STATUS_VAR); 584 return packFloat64(zSign, 0, 0); 585 } 586 isTiny = 587 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 588 || ( zExp < -1 ) 589 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 590 shift64RightJamming( zSig, - zExp, &zSig ); 591 zExp = 0; 592 roundBits = zSig & 0x3FF; 593 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 594 } 595 } 596 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 597 zSig = ( zSig + roundIncrement )>>10; 598 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 599 if ( zSig == 0 ) zExp = 0; 600 return packFloat64( zSign, zExp, zSig ); 601 602 } 603 604 /*---------------------------------------------------------------------------- 
605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 606 | and significand `zSig', and returns the proper double-precision floating- 607 | point value corresponding to the abstract input. This routine is just like 608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 610 | floating-point exponent. 611 *----------------------------------------------------------------------------*/ 612 613 static float64 614 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM) 615 { 616 int8 shiftCount; 617 618 shiftCount = countLeadingZeros64( zSig ) - 1; 619 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); 620 621 } 622 623 /*---------------------------------------------------------------------------- 624 | Returns the fraction bits of the extended double-precision floating-point 625 | value `a'. 626 *----------------------------------------------------------------------------*/ 627 628 INLINE uint64_t extractFloatx80Frac( floatx80 a ) 629 { 630 631 return a.low; 632 633 } 634 635 /*---------------------------------------------------------------------------- 636 | Returns the exponent bits of the extended double-precision floating-point 637 | value `a'. 638 *----------------------------------------------------------------------------*/ 639 640 INLINE int32 extractFloatx80Exp( floatx80 a ) 641 { 642 643 return a.high & 0x7FFF; 644 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Returns the sign bit of the extended double-precision floating-point value 649 | `a'. 
650 *----------------------------------------------------------------------------*/ 651 652 INLINE flag extractFloatx80Sign( floatx80 a ) 653 { 654 655 return a.high>>15; 656 657 } 658 659 /*---------------------------------------------------------------------------- 660 | Normalizes the subnormal extended double-precision floating-point value 661 | represented by the denormalized significand `aSig'. The normalized exponent 662 | and significand are stored at the locations pointed to by `zExpPtr' and 663 | `zSigPtr', respectively. 664 *----------------------------------------------------------------------------*/ 665 666 static void 667 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr ) 668 { 669 int8 shiftCount; 670 671 shiftCount = countLeadingZeros64( aSig ); 672 *zSigPtr = aSig<<shiftCount; 673 *zExpPtr = 1 - shiftCount; 674 675 } 676 677 /*---------------------------------------------------------------------------- 678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 679 | extended double-precision floating-point value, returning the result. 680 *----------------------------------------------------------------------------*/ 681 682 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig ) 683 { 684 floatx80 z; 685 686 z.low = zSig; 687 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 688 return z; 689 690 } 691 692 /*---------------------------------------------------------------------------- 693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 694 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 695 | and returns the proper extended double-precision floating-point value 696 | corresponding to the abstract input. Ordinarily, the abstract value is 697 | rounded and packed into the extended double-precision format, with the 698 | inexact exception raised if the abstract input cannot be represented 699 | exactly. 
| However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
714 *----------------------------------------------------------------------------*/ 715 716 static floatx80 717 roundAndPackFloatx80( 718 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 719 STATUS_PARAM) 720 { 721 int8 roundingMode; 722 flag roundNearestEven, increment, isTiny; 723 int64 roundIncrement, roundMask, roundBits; 724 725 roundingMode = STATUS(float_rounding_mode); 726 roundNearestEven = ( roundingMode == float_round_nearest_even ); 727 if ( roundingPrecision == 80 ) goto precision80; 728 if ( roundingPrecision == 64 ) { 729 roundIncrement = LIT64( 0x0000000000000400 ); 730 roundMask = LIT64( 0x00000000000007FF ); 731 } 732 else if ( roundingPrecision == 32 ) { 733 roundIncrement = LIT64( 0x0000008000000000 ); 734 roundMask = LIT64( 0x000000FFFFFFFFFF ); 735 } 736 else { 737 goto precision80; 738 } 739 zSig0 |= ( zSig1 != 0 ); 740 switch (roundingMode) { 741 case float_round_nearest_even: 742 case float_round_ties_away: 743 break; 744 case float_round_to_zero: 745 roundIncrement = 0; 746 break; 747 case float_round_up: 748 roundIncrement = zSign ? 0 : roundMask; 749 break; 750 case float_round_down: 751 roundIncrement = zSign ? 
roundMask : 0; 752 break; 753 default: 754 abort(); 755 } 756 roundBits = zSig0 & roundMask; 757 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 758 if ( ( 0x7FFE < zExp ) 759 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 760 ) { 761 goto overflow; 762 } 763 if ( zExp <= 0 ) { 764 if (STATUS(flush_to_zero)) { 765 float_raise(float_flag_output_denormal STATUS_VAR); 766 return packFloatx80(zSign, 0, 0); 767 } 768 isTiny = 769 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 770 || ( zExp < 0 ) 771 || ( zSig0 <= zSig0 + roundIncrement ); 772 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 773 zExp = 0; 774 roundBits = zSig0 & roundMask; 775 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); 776 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 777 zSig0 += roundIncrement; 778 if ( (int64_t) zSig0 < 0 ) zExp = 1; 779 roundIncrement = roundMask + 1; 780 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 781 roundMask |= roundIncrement; 782 } 783 zSig0 &= ~ roundMask; 784 return packFloatx80( zSign, zExp, zSig0 ); 785 } 786 } 787 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; 788 zSig0 += roundIncrement; 789 if ( zSig0 < roundIncrement ) { 790 ++zExp; 791 zSig0 = LIT64( 0x8000000000000000 ); 792 } 793 roundIncrement = roundMask + 1; 794 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 795 roundMask |= roundIncrement; 796 } 797 zSig0 &= ~ roundMask; 798 if ( zSig0 == 0 ) zExp = 0; 799 return packFloatx80( zSign, zExp, zSig0 ); 800 precision80: 801 switch (roundingMode) { 802 case float_round_nearest_even: 803 case float_round_ties_away: 804 increment = ((int64_t)zSig1 < 0); 805 break; 806 case float_round_to_zero: 807 increment = 0; 808 break; 809 case float_round_up: 810 increment = !zSign && zSig1; 811 break; 812 case float_round_down: 813 increment = zSign && zSig1; 814 break; 815 default: 816 abort(); 817 } 818 if ( 0x7FFD <= (uint32_t) ( zExp 
- 1 ) ) { 819 if ( ( 0x7FFE < zExp ) 820 || ( ( zExp == 0x7FFE ) 821 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 822 && increment 823 ) 824 ) { 825 roundMask = 0; 826 overflow: 827 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 828 if ( ( roundingMode == float_round_to_zero ) 829 || ( zSign && ( roundingMode == float_round_up ) ) 830 || ( ! zSign && ( roundingMode == float_round_down ) ) 831 ) { 832 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 833 } 834 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 835 } 836 if ( zExp <= 0 ) { 837 isTiny = 838 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 839 || ( zExp < 0 ) 840 || ! increment 841 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 842 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 843 zExp = 0; 844 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); 845 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 846 switch (roundingMode) { 847 case float_round_nearest_even: 848 case float_round_ties_away: 849 increment = ((int64_t)zSig1 < 0); 850 break; 851 case float_round_to_zero: 852 increment = 0; 853 break; 854 case float_round_up: 855 increment = !zSign && zSig1; 856 break; 857 case float_round_down: 858 increment = zSign && zSig1; 859 break; 860 default: 861 abort(); 862 } 863 if ( increment ) { 864 ++zSig0; 865 zSig0 &= 866 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 867 if ( (int64_t) zSig0 < 0 ) zExp = 1; 868 } 869 return packFloatx80( zSign, zExp, zSig0 ); 870 } 871 } 872 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 873 if ( increment ) { 874 ++zSig0; 875 if ( zSig0 == 0 ) { 876 ++zExp; 877 zSig0 = LIT64( 0x8000000000000000 ); 878 } 879 else { 880 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 881 } 882 } 883 else { 884 if ( zSig0 == 0 ) zExp = 0; 885 } 886 return packFloatx80( zSign, zExp, zSig0 ); 887 888 } 889 890 
/*---------------------------------------------------------------------------- 891 | Takes an abstract floating-point value having sign `zSign', exponent 892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 893 | and returns the proper extended double-precision floating-point value 894 | corresponding to the abstract input. This routine is just like 895 | `roundAndPackFloatx80' except that the input significand does not have to be 896 | normalized. 897 *----------------------------------------------------------------------------*/ 898 899 static floatx80 900 normalizeRoundAndPackFloatx80( 901 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 902 STATUS_PARAM) 903 { 904 int8 shiftCount; 905 906 if ( zSig0 == 0 ) { 907 zSig0 = zSig1; 908 zSig1 = 0; 909 zExp -= 64; 910 } 911 shiftCount = countLeadingZeros64( zSig0 ); 912 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 913 zExp -= shiftCount; 914 return 915 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); 916 917 } 918 919 /*---------------------------------------------------------------------------- 920 | Returns the least-significant 64 fraction bits of the quadruple-precision 921 | floating-point value `a'. 922 *----------------------------------------------------------------------------*/ 923 924 INLINE uint64_t extractFloat128Frac1( float128 a ) 925 { 926 927 return a.low; 928 929 } 930 931 /*---------------------------------------------------------------------------- 932 | Returns the most-significant 48 fraction bits of the quadruple-precision 933 | floating-point value `a'. 
934 *----------------------------------------------------------------------------*/ 935 936 INLINE uint64_t extractFloat128Frac0( float128 a ) 937 { 938 939 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 940 941 } 942 943 /*---------------------------------------------------------------------------- 944 | Returns the exponent bits of the quadruple-precision floating-point value 945 | `a'. 946 *----------------------------------------------------------------------------*/ 947 948 INLINE int32 extractFloat128Exp( float128 a ) 949 { 950 951 return ( a.high>>48 ) & 0x7FFF; 952 953 } 954 955 /*---------------------------------------------------------------------------- 956 | Returns the sign bit of the quadruple-precision floating-point value `a'. 957 *----------------------------------------------------------------------------*/ 958 959 INLINE flag extractFloat128Sign( float128 a ) 960 { 961 962 return a.high>>63; 963 964 } 965 966 /*---------------------------------------------------------------------------- 967 | Normalizes the subnormal quadruple-precision floating-point value 968 | represented by the denormalized significand formed by the concatenation of 969 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 970 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 971 | significand are stored at the location pointed to by `zSig0Ptr', and the 972 | least significant 64 bits of the normalized significand are stored at the 973 | location pointed to by `zSig1Ptr'. 
974 *----------------------------------------------------------------------------*/ 975 976 static void 977 normalizeFloat128Subnormal( 978 uint64_t aSig0, 979 uint64_t aSig1, 980 int32 *zExpPtr, 981 uint64_t *zSig0Ptr, 982 uint64_t *zSig1Ptr 983 ) 984 { 985 int8 shiftCount; 986 987 if ( aSig0 == 0 ) { 988 shiftCount = countLeadingZeros64( aSig1 ) - 15; 989 if ( shiftCount < 0 ) { 990 *zSig0Ptr = aSig1>>( - shiftCount ); 991 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 992 } 993 else { 994 *zSig0Ptr = aSig1<<shiftCount; 995 *zSig1Ptr = 0; 996 } 997 *zExpPtr = - shiftCount - 63; 998 } 999 else { 1000 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1001 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1002 *zExpPtr = 1 - shiftCount; 1003 } 1004 1005 } 1006 1007 /*---------------------------------------------------------------------------- 1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1010 | floating-point value, returning the result. After being shifted into the 1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1012 | added together to form the most significant 32 bits of the result. This 1013 | means that any integer portion of `zSig0' will be added into the exponent. 1014 | Since a properly normalized significand will have an integer portion equal 1015 | to 1, the `zExp' input should be 1 less than the desired result exponent 1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1017 | significand. 
1018 *----------------------------------------------------------------------------*/ 1019 1020 INLINE float128 1021 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 ) 1022 { 1023 float128 z; 1024 1025 z.low = zSig1; 1026 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1027 return z; 1028 1029 } 1030 1031 /*---------------------------------------------------------------------------- 1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1033 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1034 | and `zSig2', and returns the proper quadruple-precision floating-point value 1035 | corresponding to the abstract input. Ordinarily, the abstract value is 1036 | simply rounded and packed into the quadruple-precision format, with the 1037 | inexact exception raised if the abstract input cannot be represented 1038 | exactly. However, if the abstract value is too large, the overflow and 1039 | inexact exceptions are raised and an infinity or maximal finite value is 1040 | returned. If the abstract value is too small, the input value is rounded to 1041 | a subnormal number, and the underflow and inexact exceptions are raised if 1042 | the abstract input cannot be represented exactly as a subnormal quadruple- 1043 | precision floating-point number. 1044 | The input significand must be normalized or smaller. If the input 1045 | significand is not normalized, `zExp' must be 0; in that case, the result 1046 | returned is a subnormal number, and it must not require rounding. In the 1047 | usual case that the input significand is normalized, `zExp' must be 1 less 1048 | than the ``true'' floating-point exponent. The handling of underflow and 1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1050 *----------------------------------------------------------------------------*/ 1051 1052 static float128 1053 roundAndPackFloat128( 1054 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM) 1055 { 1056 int8 roundingMode; 1057 flag roundNearestEven, increment, isTiny; 1058 1059 roundingMode = STATUS(float_rounding_mode); 1060 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1061 switch (roundingMode) { 1062 case float_round_nearest_even: 1063 case float_round_ties_away: 1064 increment = ((int64_t)zSig2 < 0); 1065 break; 1066 case float_round_to_zero: 1067 increment = 0; 1068 break; 1069 case float_round_up: 1070 increment = !zSign && zSig2; 1071 break; 1072 case float_round_down: 1073 increment = zSign && zSig2; 1074 break; 1075 default: 1076 abort(); 1077 } 1078 if ( 0x7FFD <= (uint32_t) zExp ) { 1079 if ( ( 0x7FFD < zExp ) 1080 || ( ( zExp == 0x7FFD ) 1081 && eq128( 1082 LIT64( 0x0001FFFFFFFFFFFF ), 1083 LIT64( 0xFFFFFFFFFFFFFFFF ), 1084 zSig0, 1085 zSig1 1086 ) 1087 && increment 1088 ) 1089 ) { 1090 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); 1091 if ( ( roundingMode == float_round_to_zero ) 1092 || ( zSign && ( roundingMode == float_round_up ) ) 1093 || ( ! zSign && ( roundingMode == float_round_down ) ) 1094 ) { 1095 return 1096 packFloat128( 1097 zSign, 1098 0x7FFE, 1099 LIT64( 0x0000FFFFFFFFFFFF ), 1100 LIT64( 0xFFFFFFFFFFFFFFFF ) 1101 ); 1102 } 1103 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1104 } 1105 if ( zExp < 0 ) { 1106 if (STATUS(flush_to_zero)) { 1107 float_raise(float_flag_output_denormal STATUS_VAR); 1108 return packFloat128(zSign, 0, 0, 0); 1109 } 1110 isTiny = 1111 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) 1112 || ( zExp < -1 ) 1113 || ! 
increment 1114 || lt128( 1115 zSig0, 1116 zSig1, 1117 LIT64( 0x0001FFFFFFFFFFFF ), 1118 LIT64( 0xFFFFFFFFFFFFFFFF ) 1119 ); 1120 shift128ExtraRightJamming( 1121 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1122 zExp = 0; 1123 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); 1124 switch (roundingMode) { 1125 case float_round_nearest_even: 1126 case float_round_ties_away: 1127 increment = ((int64_t)zSig2 < 0); 1128 break; 1129 case float_round_to_zero: 1130 increment = 0; 1131 break; 1132 case float_round_up: 1133 increment = !zSign && zSig2; 1134 break; 1135 case float_round_down: 1136 increment = zSign && zSig2; 1137 break; 1138 default: 1139 abort(); 1140 } 1141 } 1142 } 1143 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; 1144 if ( increment ) { 1145 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1146 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1147 } 1148 else { 1149 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1150 } 1151 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1152 1153 } 1154 1155 /*---------------------------------------------------------------------------- 1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1158 | returns the proper quadruple-precision floating-point value corresponding 1159 | to the abstract input. This routine is just like `roundAndPackFloat128' 1160 | except that the input significand has fewer bits and does not have to be 1161 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1162 | point exponent. 
1163 *----------------------------------------------------------------------------*/ 1164 1165 static float128 1166 normalizeRoundAndPackFloat128( 1167 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM) 1168 { 1169 int8 shiftCount; 1170 uint64_t zSig2; 1171 1172 if ( zSig0 == 0 ) { 1173 zSig0 = zSig1; 1174 zSig1 = 0; 1175 zExp -= 64; 1176 } 1177 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1178 if ( 0 <= shiftCount ) { 1179 zSig2 = 0; 1180 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1181 } 1182 else { 1183 shift128ExtraRightJamming( 1184 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1185 } 1186 zExp -= shiftCount; 1187 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); 1188 1189 } 1190 1191 /*---------------------------------------------------------------------------- 1192 | Returns the result of converting the 32-bit two's complement integer `a' 1193 | to the single-precision floating-point format. The conversion is performed 1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1195 *----------------------------------------------------------------------------*/ 1196 1197 float32 int32_to_float32(int32_t a STATUS_PARAM) 1198 { 1199 flag zSign; 1200 1201 if ( a == 0 ) return float32_zero; 1202 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1203 zSign = ( a < 0 ); 1204 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); 1205 1206 } 1207 1208 /*---------------------------------------------------------------------------- 1209 | Returns the result of converting the 32-bit two's complement integer `a' 1210 | to the double-precision floating-point format. The conversion is performed 1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1212 *----------------------------------------------------------------------------*/ 1213 1214 float64 int32_to_float64(int32_t a STATUS_PARAM) 1215 { 1216 flag zSign; 1217 uint32 absA; 1218 int8 shiftCount; 1219 uint64_t zSig; 1220 1221 if ( a == 0 ) return float64_zero; 1222 zSign = ( a < 0 ); 1223 absA = zSign ? - a : a; 1224 shiftCount = countLeadingZeros32( absA ) + 21; 1225 zSig = absA; 1226 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1227 1228 } 1229 1230 /*---------------------------------------------------------------------------- 1231 | Returns the result of converting the 32-bit two's complement integer `a' 1232 | to the extended double-precision floating-point format. The conversion 1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1234 | Arithmetic. 1235 *----------------------------------------------------------------------------*/ 1236 1237 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM) 1238 { 1239 flag zSign; 1240 uint32 absA; 1241 int8 shiftCount; 1242 uint64_t zSig; 1243 1244 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1245 zSign = ( a < 0 ); 1246 absA = zSign ? - a : a; 1247 shiftCount = countLeadingZeros32( absA ) + 32; 1248 zSig = absA; 1249 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1250 1251 } 1252 1253 /*---------------------------------------------------------------------------- 1254 | Returns the result of converting the 32-bit two's complement integer `a' to 1255 | the quadruple-precision floating-point format. The conversion is performed 1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1257 *----------------------------------------------------------------------------*/ 1258 1259 float128 int32_to_float128(int32_t a STATUS_PARAM) 1260 { 1261 flag zSign; 1262 uint32 absA; 1263 int8 shiftCount; 1264 uint64_t zSig0; 1265 1266 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1267 zSign = ( a < 0 ); 1268 absA = zSign ? 
- a : a; 1269 shiftCount = countLeadingZeros32( absA ) + 17; 1270 zSig0 = absA; 1271 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1272 1273 } 1274 1275 /*---------------------------------------------------------------------------- 1276 | Returns the result of converting the 64-bit two's complement integer `a' 1277 | to the single-precision floating-point format. The conversion is performed 1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1279 *----------------------------------------------------------------------------*/ 1280 1281 float32 int64_to_float32(int64_t a STATUS_PARAM) 1282 { 1283 flag zSign; 1284 uint64 absA; 1285 int8 shiftCount; 1286 1287 if ( a == 0 ) return float32_zero; 1288 zSign = ( a < 0 ); 1289 absA = zSign ? - a : a; 1290 shiftCount = countLeadingZeros64( absA ) - 40; 1291 if ( 0 <= shiftCount ) { 1292 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1293 } 1294 else { 1295 shiftCount += 7; 1296 if ( shiftCount < 0 ) { 1297 shift64RightJamming( absA, - shiftCount, &absA ); 1298 } 1299 else { 1300 absA <<= shiftCount; 1301 } 1302 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); 1303 } 1304 1305 } 1306 1307 float32 uint64_to_float32(uint64_t a STATUS_PARAM) 1308 { 1309 int8 shiftCount; 1310 1311 if ( a == 0 ) return float32_zero; 1312 shiftCount = countLeadingZeros64( a ) - 40; 1313 if ( 0 <= shiftCount ) { 1314 return packFloat32(0, 0x95 - shiftCount, a<<shiftCount); 1315 } 1316 else { 1317 shiftCount += 7; 1318 if ( shiftCount < 0 ) { 1319 shift64RightJamming( a, - shiftCount, &a ); 1320 } 1321 else { 1322 a <<= shiftCount; 1323 } 1324 return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR); 1325 } 1326 } 1327 1328 /*---------------------------------------------------------------------------- 1329 | Returns the result of converting the 64-bit two's complement integer `a' 1330 | to the double-precision floating-point format. 
The conversion is performed 1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1332 *----------------------------------------------------------------------------*/ 1333 1334 float64 int64_to_float64(int64_t a STATUS_PARAM) 1335 { 1336 flag zSign; 1337 1338 if ( a == 0 ) return float64_zero; 1339 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1340 return packFloat64( 1, 0x43E, 0 ); 1341 } 1342 zSign = ( a < 0 ); 1343 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); 1344 1345 } 1346 1347 float64 uint64_to_float64(uint64_t a STATUS_PARAM) 1348 { 1349 int exp = 0x43C; 1350 1351 if (a == 0) { 1352 return float64_zero; 1353 } 1354 if ((int64_t)a < 0) { 1355 shift64RightJamming(a, 1, &a); 1356 exp += 1; 1357 } 1358 return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR); 1359 } 1360 1361 /*---------------------------------------------------------------------------- 1362 | Returns the result of converting the 64-bit two's complement integer `a' 1363 | to the extended double-precision floating-point format. The conversion 1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1365 | Arithmetic. 1366 *----------------------------------------------------------------------------*/ 1367 1368 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM) 1369 { 1370 flag zSign; 1371 uint64 absA; 1372 int8 shiftCount; 1373 1374 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1375 zSign = ( a < 0 ); 1376 absA = zSign ? - a : a; 1377 shiftCount = countLeadingZeros64( absA ); 1378 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1379 1380 } 1381 1382 /*---------------------------------------------------------------------------- 1383 | Returns the result of converting the 64-bit two's complement integer `a' to 1384 | the quadruple-precision floating-point format. The conversion is performed 1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1386 *----------------------------------------------------------------------------*/ 1387 1388 float128 int64_to_float128(int64_t a STATUS_PARAM) 1389 { 1390 flag zSign; 1391 uint64 absA; 1392 int8 shiftCount; 1393 int32 zExp; 1394 uint64_t zSig0, zSig1; 1395 1396 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1397 zSign = ( a < 0 ); 1398 absA = zSign ? - a : a; 1399 shiftCount = countLeadingZeros64( absA ) + 49; 1400 zExp = 0x406E - shiftCount; 1401 if ( 64 <= shiftCount ) { 1402 zSig1 = 0; 1403 zSig0 = absA; 1404 shiftCount -= 64; 1405 } 1406 else { 1407 zSig1 = absA; 1408 zSig0 = 0; 1409 } 1410 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1411 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1412 1413 } 1414 1415 float128 uint64_to_float128(uint64_t a STATUS_PARAM) 1416 { 1417 if (a == 0) { 1418 return float128_zero; 1419 } 1420 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR); 1421 } 1422 1423 /*---------------------------------------------------------------------------- 1424 | Returns the result of converting the single-precision floating-point value 1425 | `a' to the 32-bit two's complement integer format. The conversion is 1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1427 | Arithmetic---which means in particular that the conversion is rounded 1428 | according to the current rounding mode. If `a' is a NaN, the largest 1429 | positive integer is returned. Otherwise, if the conversion overflows, the 1430 | largest integer with the same sign as `a' is returned. 
1431 *----------------------------------------------------------------------------*/ 1432 1433 int32 float32_to_int32( float32 a STATUS_PARAM ) 1434 { 1435 flag aSign; 1436 int_fast16_t aExp, shiftCount; 1437 uint32_t aSig; 1438 uint64_t aSig64; 1439 1440 a = float32_squash_input_denormal(a STATUS_VAR); 1441 aSig = extractFloat32Frac( a ); 1442 aExp = extractFloat32Exp( a ); 1443 aSign = extractFloat32Sign( a ); 1444 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1445 if ( aExp ) aSig |= 0x00800000; 1446 shiftCount = 0xAF - aExp; 1447 aSig64 = aSig; 1448 aSig64 <<= 32; 1449 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1450 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); 1451 1452 } 1453 1454 /*---------------------------------------------------------------------------- 1455 | Returns the result of converting the single-precision floating-point value 1456 | `a' to the 32-bit two's complement integer format. The conversion is 1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1458 | Arithmetic, except that the conversion is always rounded toward zero. 1459 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1460 | the conversion overflows, the largest integer with the same sign as `a' is 1461 | returned. 1462 *----------------------------------------------------------------------------*/ 1463 1464 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) 1465 { 1466 flag aSign; 1467 int_fast16_t aExp, shiftCount; 1468 uint32_t aSig; 1469 int32_t z; 1470 a = float32_squash_input_denormal(a STATUS_VAR); 1471 1472 aSig = extractFloat32Frac( a ); 1473 aExp = extractFloat32Exp( a ); 1474 aSign = extractFloat32Sign( a ); 1475 shiftCount = aExp - 0x9E; 1476 if ( 0 <= shiftCount ) { 1477 if ( float32_val(a) != 0xCF000000 ) { 1478 float_raise( float_flag_invalid STATUS_VAR); 1479 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1480 } 1481 return (int32_t) 0x80000000; 1482 } 1483 else if ( aExp <= 0x7E ) { 1484 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1485 return 0; 1486 } 1487 aSig = ( aSig | 0x00800000 )<<8; 1488 z = aSig>>( - shiftCount ); 1489 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1490 STATUS(float_exception_flags) |= float_flag_inexact; 1491 } 1492 if ( aSign ) z = - z; 1493 return z; 1494 1495 } 1496 1497 /*---------------------------------------------------------------------------- 1498 | Returns the result of converting the single-precision floating-point value 1499 | `a' to the 16-bit two's complement integer format. The conversion is 1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1501 | Arithmetic, except that the conversion is always rounded toward zero. 1502 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1503 | the conversion overflows, the largest integer with the same sign as `a' is 1504 | returned. 1505 *----------------------------------------------------------------------------*/ 1506 1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM) 1508 { 1509 flag aSign; 1510 int_fast16_t aExp, shiftCount; 1511 uint32_t aSig; 1512 int32 z; 1513 1514 aSig = extractFloat32Frac( a ); 1515 aExp = extractFloat32Exp( a ); 1516 aSign = extractFloat32Sign( a ); 1517 shiftCount = aExp - 0x8E; 1518 if ( 0 <= shiftCount ) { 1519 if ( float32_val(a) != 0xC7000000 ) { 1520 float_raise( float_flag_invalid STATUS_VAR); 1521 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1522 return 0x7FFF; 1523 } 1524 } 1525 return (int32_t) 0xffff8000; 1526 } 1527 else if ( aExp <= 0x7E ) { 1528 if ( aExp | aSig ) { 1529 STATUS(float_exception_flags) |= float_flag_inexact; 1530 } 1531 return 0; 1532 } 1533 shiftCount -= 0x10; 1534 aSig = ( aSig | 0x00800000 )<<8; 1535 z = aSig>>( - shiftCount ); 1536 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1537 STATUS(float_exception_flags) |= float_flag_inexact; 1538 } 1539 if ( aSign ) { 1540 z = - z; 1541 } 1542 return z; 1543 1544 } 1545 1546 /*---------------------------------------------------------------------------- 1547 | Returns the result of converting the single-precision floating-point value 1548 | `a' to the 64-bit two's complement integer format. The conversion is 1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1550 | Arithmetic---which means in particular that the conversion is rounded 1551 | according to the current rounding mode. If `a' is a NaN, the largest 1552 | positive integer is returned. Otherwise, if the conversion overflows, the 1553 | largest integer with the same sign as `a' is returned. 1554 *----------------------------------------------------------------------------*/ 1555 1556 int64 float32_to_int64( float32 a STATUS_PARAM ) 1557 { 1558 flag aSign; 1559 int_fast16_t aExp, shiftCount; 1560 uint32_t aSig; 1561 uint64_t aSig64, aSigExtra; 1562 a = float32_squash_input_denormal(a STATUS_VAR); 1563 1564 aSig = extractFloat32Frac( a ); 1565 aExp = extractFloat32Exp( a ); 1566 aSign = extractFloat32Sign( a ); 1567 shiftCount = 0xBE - aExp; 1568 if ( shiftCount < 0 ) { 1569 float_raise( float_flag_invalid STATUS_VAR); 1570 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1571 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1572 } 1573 return (int64_t) LIT64( 0x8000000000000000 ); 1574 } 1575 if ( aExp ) aSig |= 0x00800000; 1576 aSig64 = aSig; 1577 aSig64 <<= 40; 1578 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1579 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); 1580 1581 } 1582 1583 /*---------------------------------------------------------------------------- 1584 | Returns the result of converting the single-precision floating-point value 1585 | `a' to the 64-bit unsigned integer format. The conversion is 1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1587 | Arithmetic---which means in particular that the conversion is rounded 1588 | according to the current rounding mode. If `a' is a NaN, the largest 1589 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1590 | largest unsigned integer is returned. If the 'a' is negative, the result 1591 | is rounded and zero is returned; values that do not round to zero will 1592 | raise the inexact exception flag. 
1593 *----------------------------------------------------------------------------*/ 1594 1595 uint64 float32_to_uint64(float32 a STATUS_PARAM) 1596 { 1597 flag aSign; 1598 int_fast16_t aExp, shiftCount; 1599 uint32_t aSig; 1600 uint64_t aSig64, aSigExtra; 1601 a = float32_squash_input_denormal(a STATUS_VAR); 1602 1603 aSig = extractFloat32Frac(a); 1604 aExp = extractFloat32Exp(a); 1605 aSign = extractFloat32Sign(a); 1606 if ((aSign) && (aExp > 126)) { 1607 float_raise(float_flag_invalid STATUS_VAR); 1608 if (float32_is_any_nan(a)) { 1609 return LIT64(0xFFFFFFFFFFFFFFFF); 1610 } else { 1611 return 0; 1612 } 1613 } 1614 shiftCount = 0xBE - aExp; 1615 if (aExp) { 1616 aSig |= 0x00800000; 1617 } 1618 if (shiftCount < 0) { 1619 float_raise(float_flag_invalid STATUS_VAR); 1620 return LIT64(0xFFFFFFFFFFFFFFFF); 1621 } 1622 1623 aSig64 = aSig; 1624 aSig64 <<= 40; 1625 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1626 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR); 1627 } 1628 1629 /*---------------------------------------------------------------------------- 1630 | Returns the result of converting the single-precision floating-point value 1631 | `a' to the 64-bit two's complement integer format. The conversion is 1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1633 | Arithmetic, except that the conversion is always rounded toward zero. If 1634 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1635 | conversion overflows, the largest integer with the same sign as `a' is 1636 | returned. 
1637 *----------------------------------------------------------------------------*/ 1638 1639 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) 1640 { 1641 flag aSign; 1642 int_fast16_t aExp, shiftCount; 1643 uint32_t aSig; 1644 uint64_t aSig64; 1645 int64 z; 1646 a = float32_squash_input_denormal(a STATUS_VAR); 1647 1648 aSig = extractFloat32Frac( a ); 1649 aExp = extractFloat32Exp( a ); 1650 aSign = extractFloat32Sign( a ); 1651 shiftCount = aExp - 0xBE; 1652 if ( 0 <= shiftCount ) { 1653 if ( float32_val(a) != 0xDF000000 ) { 1654 float_raise( float_flag_invalid STATUS_VAR); 1655 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1656 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1657 } 1658 } 1659 return (int64_t) LIT64( 0x8000000000000000 ); 1660 } 1661 else if ( aExp <= 0x7E ) { 1662 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 1663 return 0; 1664 } 1665 aSig64 = aSig | 0x00800000; 1666 aSig64 <<= 40; 1667 z = aSig64>>( - shiftCount ); 1668 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1669 STATUS(float_exception_flags) |= float_flag_inexact; 1670 } 1671 if ( aSign ) z = - z; 1672 return z; 1673 1674 } 1675 1676 /*---------------------------------------------------------------------------- 1677 | Returns the result of converting the single-precision floating-point value 1678 | `a' to the double-precision floating-point format. The conversion is 1679 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1680 | Arithmetic. 
1681 *----------------------------------------------------------------------------*/ 1682 1683 float64 float32_to_float64( float32 a STATUS_PARAM ) 1684 { 1685 flag aSign; 1686 int_fast16_t aExp; 1687 uint32_t aSig; 1688 a = float32_squash_input_denormal(a STATUS_VAR); 1689 1690 aSig = extractFloat32Frac( a ); 1691 aExp = extractFloat32Exp( a ); 1692 aSign = extractFloat32Sign( a ); 1693 if ( aExp == 0xFF ) { 1694 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1695 return packFloat64( aSign, 0x7FF, 0 ); 1696 } 1697 if ( aExp == 0 ) { 1698 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1699 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1700 --aExp; 1701 } 1702 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1703 1704 } 1705 1706 /*---------------------------------------------------------------------------- 1707 | Returns the result of converting the single-precision floating-point value 1708 | `a' to the extended double-precision floating-point format. The conversion 1709 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1710 | Arithmetic. 
1711 *----------------------------------------------------------------------------*/ 1712 1713 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) 1714 { 1715 flag aSign; 1716 int_fast16_t aExp; 1717 uint32_t aSig; 1718 1719 a = float32_squash_input_denormal(a STATUS_VAR); 1720 aSig = extractFloat32Frac( a ); 1721 aExp = extractFloat32Exp( a ); 1722 aSign = extractFloat32Sign( a ); 1723 if ( aExp == 0xFF ) { 1724 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1725 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1726 } 1727 if ( aExp == 0 ) { 1728 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1729 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1730 } 1731 aSig |= 0x00800000; 1732 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1733 1734 } 1735 1736 /*---------------------------------------------------------------------------- 1737 | Returns the result of converting the single-precision floating-point value 1738 | `a' to the double-precision floating-point format. The conversion is 1739 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1740 | Arithmetic. 
1741 *----------------------------------------------------------------------------*/ 1742 1743 float128 float32_to_float128( float32 a STATUS_PARAM ) 1744 { 1745 flag aSign; 1746 int_fast16_t aExp; 1747 uint32_t aSig; 1748 1749 a = float32_squash_input_denormal(a STATUS_VAR); 1750 aSig = extractFloat32Frac( a ); 1751 aExp = extractFloat32Exp( a ); 1752 aSign = extractFloat32Sign( a ); 1753 if ( aExp == 0xFF ) { 1754 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 1755 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1756 } 1757 if ( aExp == 0 ) { 1758 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1759 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1760 --aExp; 1761 } 1762 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1763 1764 } 1765 1766 /*---------------------------------------------------------------------------- 1767 | Rounds the single-precision floating-point value `a' to an integer, and 1768 | returns the result as a single-precision floating-point value. The 1769 | operation is performed according to the IEC/IEEE Standard for Binary 1770 | Floating-Point Arithmetic. 
1771 *----------------------------------------------------------------------------*/ 1772 1773 float32 float32_round_to_int( float32 a STATUS_PARAM) 1774 { 1775 flag aSign; 1776 int_fast16_t aExp; 1777 uint32_t lastBitMask, roundBitsMask; 1778 uint32_t z; 1779 a = float32_squash_input_denormal(a STATUS_VAR); 1780 1781 aExp = extractFloat32Exp( a ); 1782 if ( 0x96 <= aExp ) { 1783 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1784 return propagateFloat32NaN( a, a STATUS_VAR ); 1785 } 1786 return a; 1787 } 1788 if ( aExp <= 0x7E ) { 1789 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1790 STATUS(float_exception_flags) |= float_flag_inexact; 1791 aSign = extractFloat32Sign( a ); 1792 switch ( STATUS(float_rounding_mode) ) { 1793 case float_round_nearest_even: 1794 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1795 return packFloat32( aSign, 0x7F, 0 ); 1796 } 1797 break; 1798 case float_round_ties_away: 1799 if (aExp == 0x7E) { 1800 return packFloat32(aSign, 0x7F, 0); 1801 } 1802 break; 1803 case float_round_down: 1804 return make_float32(aSign ? 0xBF800000 : 0); 1805 case float_round_up: 1806 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 1807 } 1808 return packFloat32( aSign, 0, 0 ); 1809 } 1810 lastBitMask = 1; 1811 lastBitMask <<= 0x96 - aExp; 1812 roundBitsMask = lastBitMask - 1; 1813 z = float32_val(a); 1814 switch (STATUS(float_rounding_mode)) { 1815 case float_round_nearest_even: 1816 z += lastBitMask>>1; 1817 if ((z & roundBitsMask) == 0) { 1818 z &= ~lastBitMask; 1819 } 1820 break; 1821 case float_round_ties_away: 1822 z += lastBitMask >> 1; 1823 break; 1824 case float_round_to_zero: 1825 break; 1826 case float_round_up: 1827 if (!extractFloat32Sign(make_float32(z))) { 1828 z += roundBitsMask; 1829 } 1830 break; 1831 case float_round_down: 1832 if (extractFloat32Sign(make_float32(z))) { 1833 z += roundBitsMask; 1834 } 1835 break; 1836 default: 1837 abort(); 1838 } 1839 z &= ~ roundBitsMask; 1840 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; 1841 return make_float32(z); 1842 1843 } 1844 1845 /*---------------------------------------------------------------------------- 1846 | Returns the result of adding the absolute values of the single-precision 1847 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1848 | before being returned. `zSign' is ignored if the result is a NaN. 1849 | The addition is performed according to the IEC/IEEE Standard for Binary 1850 | Floating-Point Arithmetic. 
1851 *----------------------------------------------------------------------------*/ 1852 1853 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1854 { 1855 int_fast16_t aExp, bExp, zExp; 1856 uint32_t aSig, bSig, zSig; 1857 int_fast16_t expDiff; 1858 1859 aSig = extractFloat32Frac( a ); 1860 aExp = extractFloat32Exp( a ); 1861 bSig = extractFloat32Frac( b ); 1862 bExp = extractFloat32Exp( b ); 1863 expDiff = aExp - bExp; 1864 aSig <<= 6; 1865 bSig <<= 6; 1866 if ( 0 < expDiff ) { 1867 if ( aExp == 0xFF ) { 1868 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1869 return a; 1870 } 1871 if ( bExp == 0 ) { 1872 --expDiff; 1873 } 1874 else { 1875 bSig |= 0x20000000; 1876 } 1877 shift32RightJamming( bSig, expDiff, &bSig ); 1878 zExp = aExp; 1879 } 1880 else if ( expDiff < 0 ) { 1881 if ( bExp == 0xFF ) { 1882 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1883 return packFloat32( zSign, 0xFF, 0 ); 1884 } 1885 if ( aExp == 0 ) { 1886 ++expDiff; 1887 } 1888 else { 1889 aSig |= 0x20000000; 1890 } 1891 shift32RightJamming( aSig, - expDiff, &aSig ); 1892 zExp = bExp; 1893 } 1894 else { 1895 if ( aExp == 0xFF ) { 1896 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1897 return a; 1898 } 1899 if ( aExp == 0 ) { 1900 if (STATUS(flush_to_zero)) { 1901 if (aSig | bSig) { 1902 float_raise(float_flag_output_denormal STATUS_VAR); 1903 } 1904 return packFloat32(zSign, 0, 0); 1905 } 1906 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 1907 } 1908 zSig = 0x40000000 + aSig + bSig; 1909 zExp = aExp; 1910 goto roundAndPack; 1911 } 1912 aSig |= 0x20000000; 1913 zSig = ( aSig + bSig )<<1; 1914 --zExp; 1915 if ( (int32_t) zSig < 0 ) { 1916 zSig = aSig + bSig; 1917 ++zExp; 1918 } 1919 roundAndPack: 1920 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1921 1922 } 1923 1924 /*---------------------------------------------------------------------------- 1925 | Returns the result of subtracting the absolute 
values of the single- 1926 | precision floating-point values `a' and `b'. If `zSign' is 1, the 1927 | difference is negated before being returned. `zSign' is ignored if the 1928 | result is a NaN. The subtraction is performed according to the IEC/IEEE 1929 | Standard for Binary Floating-Point Arithmetic. 1930 *----------------------------------------------------------------------------*/ 1931 1932 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) 1933 { 1934 int_fast16_t aExp, bExp, zExp; 1935 uint32_t aSig, bSig, zSig; 1936 int_fast16_t expDiff; 1937 1938 aSig = extractFloat32Frac( a ); 1939 aExp = extractFloat32Exp( a ); 1940 bSig = extractFloat32Frac( b ); 1941 bExp = extractFloat32Exp( b ); 1942 expDiff = aExp - bExp; 1943 aSig <<= 7; 1944 bSig <<= 7; 1945 if ( 0 < expDiff ) goto aExpBigger; 1946 if ( expDiff < 0 ) goto bExpBigger; 1947 if ( aExp == 0xFF ) { 1948 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1949 float_raise( float_flag_invalid STATUS_VAR); 1950 return float32_default_nan; 1951 } 1952 if ( aExp == 0 ) { 1953 aExp = 1; 1954 bExp = 1; 1955 } 1956 if ( bSig < aSig ) goto aBigger; 1957 if ( aSig < bSig ) goto bBigger; 1958 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 1959 bExpBigger: 1960 if ( bExp == 0xFF ) { 1961 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1962 return packFloat32( zSign ^ 1, 0xFF, 0 ); 1963 } 1964 if ( aExp == 0 ) { 1965 ++expDiff; 1966 } 1967 else { 1968 aSig |= 0x40000000; 1969 } 1970 shift32RightJamming( aSig, - expDiff, &aSig ); 1971 bSig |= 0x40000000; 1972 bBigger: 1973 zSig = bSig - aSig; 1974 zExp = bExp; 1975 zSign ^= 1; 1976 goto normalizeRoundAndPack; 1977 aExpBigger: 1978 if ( aExp == 0xFF ) { 1979 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 1980 return a; 1981 } 1982 if ( bExp == 0 ) { 1983 --expDiff; 1984 } 1985 else { 1986 bSig |= 0x40000000; 1987 } 1988 shift32RightJamming( bSig, expDiff, &bSig ); 1989 
aSig |= 0x40000000; 1990 aBigger: 1991 zSig = aSig - bSig; 1992 zExp = aExp; 1993 normalizeRoundAndPack: 1994 --zExp; 1995 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 1996 1997 } 1998 1999 /*---------------------------------------------------------------------------- 2000 | Returns the result of adding the single-precision floating-point values `a' 2001 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2002 | Binary Floating-Point Arithmetic. 2003 *----------------------------------------------------------------------------*/ 2004 2005 float32 float32_add( float32 a, float32 b STATUS_PARAM ) 2006 { 2007 flag aSign, bSign; 2008 a = float32_squash_input_denormal(a STATUS_VAR); 2009 b = float32_squash_input_denormal(b STATUS_VAR); 2010 2011 aSign = extractFloat32Sign( a ); 2012 bSign = extractFloat32Sign( b ); 2013 if ( aSign == bSign ) { 2014 return addFloat32Sigs( a, b, aSign STATUS_VAR); 2015 } 2016 else { 2017 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 2018 } 2019 2020 } 2021 2022 /*---------------------------------------------------------------------------- 2023 | Returns the result of subtracting the single-precision floating-point values 2024 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2025 | for Binary Floating-Point Arithmetic. 
2026 *----------------------------------------------------------------------------*/ 2027 2028 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) 2029 { 2030 flag aSign, bSign; 2031 a = float32_squash_input_denormal(a STATUS_VAR); 2032 b = float32_squash_input_denormal(b STATUS_VAR); 2033 2034 aSign = extractFloat32Sign( a ); 2035 bSign = extractFloat32Sign( b ); 2036 if ( aSign == bSign ) { 2037 return subFloat32Sigs( a, b, aSign STATUS_VAR ); 2038 } 2039 else { 2040 return addFloat32Sigs( a, b, aSign STATUS_VAR ); 2041 } 2042 2043 } 2044 2045 /*---------------------------------------------------------------------------- 2046 | Returns the result of multiplying the single-precision floating-point values 2047 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2048 | for Binary Floating-Point Arithmetic. 2049 *----------------------------------------------------------------------------*/ 2050 2051 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) 2052 { 2053 flag aSign, bSign, zSign; 2054 int_fast16_t aExp, bExp, zExp; 2055 uint32_t aSig, bSig; 2056 uint64_t zSig64; 2057 uint32_t zSig; 2058 2059 a = float32_squash_input_denormal(a STATUS_VAR); 2060 b = float32_squash_input_denormal(b STATUS_VAR); 2061 2062 aSig = extractFloat32Frac( a ); 2063 aExp = extractFloat32Exp( a ); 2064 aSign = extractFloat32Sign( a ); 2065 bSig = extractFloat32Frac( b ); 2066 bExp = extractFloat32Exp( b ); 2067 bSign = extractFloat32Sign( b ); 2068 zSign = aSign ^ bSign; 2069 if ( aExp == 0xFF ) { 2070 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2071 return propagateFloat32NaN( a, b STATUS_VAR ); 2072 } 2073 if ( ( bExp | bSig ) == 0 ) { 2074 float_raise( float_flag_invalid STATUS_VAR); 2075 return float32_default_nan; 2076 } 2077 return packFloat32( zSign, 0xFF, 0 ); 2078 } 2079 if ( bExp == 0xFF ) { 2080 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2081 if ( ( aExp | aSig ) == 0 ) { 2082 float_raise( float_flag_invalid STATUS_VAR); 
2083 return float32_default_nan; 2084 } 2085 return packFloat32( zSign, 0xFF, 0 ); 2086 } 2087 if ( aExp == 0 ) { 2088 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2089 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2090 } 2091 if ( bExp == 0 ) { 2092 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2093 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2094 } 2095 zExp = aExp + bExp - 0x7F; 2096 aSig = ( aSig | 0x00800000 )<<7; 2097 bSig = ( bSig | 0x00800000 )<<8; 2098 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2099 zSig = zSig64; 2100 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2101 zSig <<= 1; 2102 --zExp; 2103 } 2104 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 2105 2106 } 2107 2108 /*---------------------------------------------------------------------------- 2109 | Returns the result of dividing the single-precision floating-point value `a' 2110 | by the corresponding value `b'. The operation is performed according to the 2111 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2112 *----------------------------------------------------------------------------*/ 2113 2114 float32 float32_div( float32 a, float32 b STATUS_PARAM ) 2115 { 2116 flag aSign, bSign, zSign; 2117 int_fast16_t aExp, bExp, zExp; 2118 uint32_t aSig, bSig, zSig; 2119 a = float32_squash_input_denormal(a STATUS_VAR); 2120 b = float32_squash_input_denormal(b STATUS_VAR); 2121 2122 aSig = extractFloat32Frac( a ); 2123 aExp = extractFloat32Exp( a ); 2124 aSign = extractFloat32Sign( a ); 2125 bSig = extractFloat32Frac( b ); 2126 bExp = extractFloat32Exp( b ); 2127 bSign = extractFloat32Sign( b ); 2128 zSign = aSign ^ bSign; 2129 if ( aExp == 0xFF ) { 2130 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2131 if ( bExp == 0xFF ) { 2132 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2133 float_raise( float_flag_invalid STATUS_VAR); 2134 return float32_default_nan; 2135 } 2136 return packFloat32( zSign, 0xFF, 0 ); 2137 } 2138 if ( bExp == 0xFF ) { 2139 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2140 return packFloat32( zSign, 0, 0 ); 2141 } 2142 if ( bExp == 0 ) { 2143 if ( bSig == 0 ) { 2144 if ( ( aExp | aSig ) == 0 ) { 2145 float_raise( float_flag_invalid STATUS_VAR); 2146 return float32_default_nan; 2147 } 2148 float_raise( float_flag_divbyzero STATUS_VAR); 2149 return packFloat32( zSign, 0xFF, 0 ); 2150 } 2151 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2152 } 2153 if ( aExp == 0 ) { 2154 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2155 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2156 } 2157 zExp = aExp - bExp + 0x7D; 2158 aSig = ( aSig | 0x00800000 )<<7; 2159 bSig = ( bSig | 0x00800000 )<<8; 2160 if ( bSig <= ( aSig + aSig ) ) { 2161 aSig >>= 1; 2162 ++zExp; 2163 } 2164 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2165 if ( ( zSig & 0x3F ) == 0 ) { 2166 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2167 } 2168 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); 2169 2170 } 2171 2172 
/*---------------------------------------------------------------------------- 2173 | Returns the remainder of the single-precision floating-point value `a' 2174 | with respect to the corresponding value `b'. The operation is performed 2175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2176 *----------------------------------------------------------------------------*/ 2177 2178 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) 2179 { 2180 flag aSign, zSign; 2181 int_fast16_t aExp, bExp, expDiff; 2182 uint32_t aSig, bSig; 2183 uint32_t q; 2184 uint64_t aSig64, bSig64, q64; 2185 uint32_t alternateASig; 2186 int32_t sigMean; 2187 a = float32_squash_input_denormal(a STATUS_VAR); 2188 b = float32_squash_input_denormal(b STATUS_VAR); 2189 2190 aSig = extractFloat32Frac( a ); 2191 aExp = extractFloat32Exp( a ); 2192 aSign = extractFloat32Sign( a ); 2193 bSig = extractFloat32Frac( b ); 2194 bExp = extractFloat32Exp( b ); 2195 if ( aExp == 0xFF ) { 2196 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2197 return propagateFloat32NaN( a, b STATUS_VAR ); 2198 } 2199 float_raise( float_flag_invalid STATUS_VAR); 2200 return float32_default_nan; 2201 } 2202 if ( bExp == 0xFF ) { 2203 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); 2204 return a; 2205 } 2206 if ( bExp == 0 ) { 2207 if ( bSig == 0 ) { 2208 float_raise( float_flag_invalid STATUS_VAR); 2209 return float32_default_nan; 2210 } 2211 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2212 } 2213 if ( aExp == 0 ) { 2214 if ( aSig == 0 ) return a; 2215 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2216 } 2217 expDiff = aExp - bExp; 2218 aSig |= 0x00800000; 2219 bSig |= 0x00800000; 2220 if ( expDiff < 32 ) { 2221 aSig <<= 8; 2222 bSig <<= 8; 2223 if ( expDiff < 0 ) { 2224 if ( expDiff < -1 ) return a; 2225 aSig >>= 1; 2226 } 2227 q = ( bSig <= aSig ); 2228 if ( q ) aSig -= bSig; 2229 if ( 0 < expDiff ) { 2230 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2231 q >>= 32 - expDiff; 2232 
bSig >>= 2; 2233 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2234 } 2235 else { 2236 aSig >>= 2; 2237 bSig >>= 2; 2238 } 2239 } 2240 else { 2241 if ( bSig <= aSig ) aSig -= bSig; 2242 aSig64 = ( (uint64_t) aSig )<<40; 2243 bSig64 = ( (uint64_t) bSig )<<40; 2244 expDiff -= 64; 2245 while ( 0 < expDiff ) { 2246 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2247 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2248 aSig64 = - ( ( bSig * q64 )<<38 ); 2249 expDiff -= 62; 2250 } 2251 expDiff += 64; 2252 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2253 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2254 q = q64>>( 64 - expDiff ); 2255 bSig <<= 6; 2256 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2257 } 2258 do { 2259 alternateASig = aSig; 2260 ++q; 2261 aSig -= bSig; 2262 } while ( 0 <= (int32_t) aSig ); 2263 sigMean = aSig + alternateASig; 2264 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2265 aSig = alternateASig; 2266 } 2267 zSign = ( (int32_t) aSig < 0 ); 2268 if ( zSign ) aSig = - aSig; 2269 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); 2270 2271 } 2272 2273 /*---------------------------------------------------------------------------- 2274 | Returns the result of multiplying the single-precision floating-point values 2275 | `a' and `b' then adding 'c', with no intermediate rounding step after the 2276 | multiplication. The operation is performed according to the IEC/IEEE 2277 | Standard for Binary Floating-Point Arithmetic 754-2008. 2278 | The flags argument allows the caller to select negation of the 2279 | addend, the intermediate product, or the final result. (The difference 2280 | between this and having the caller do a separate negation is that negating 2281 | externally will flip the sign bit on NaNs.) 
2282 *----------------------------------------------------------------------------*/ 2283 2284 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM) 2285 { 2286 flag aSign, bSign, cSign, zSign; 2287 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff; 2288 uint32_t aSig, bSig, cSig; 2289 flag pInf, pZero, pSign; 2290 uint64_t pSig64, cSig64, zSig64; 2291 uint32_t pSig; 2292 int shiftcount; 2293 flag signflip, infzero; 2294 2295 a = float32_squash_input_denormal(a STATUS_VAR); 2296 b = float32_squash_input_denormal(b STATUS_VAR); 2297 c = float32_squash_input_denormal(c STATUS_VAR); 2298 aSig = extractFloat32Frac(a); 2299 aExp = extractFloat32Exp(a); 2300 aSign = extractFloat32Sign(a); 2301 bSig = extractFloat32Frac(b); 2302 bExp = extractFloat32Exp(b); 2303 bSign = extractFloat32Sign(b); 2304 cSig = extractFloat32Frac(c); 2305 cExp = extractFloat32Exp(c); 2306 cSign = extractFloat32Sign(c); 2307 2308 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || 2309 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); 2310 2311 /* It is implementation-defined whether the cases of (0,inf,qnan) 2312 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 2313 * they return if they do), so we have to hand this information 2314 * off to the target-specific pick-a-NaN routine. 2315 */ 2316 if (((aExp == 0xff) && aSig) || 2317 ((bExp == 0xff) && bSig) || 2318 ((cExp == 0xff) && cSig)) { 2319 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR); 2320 } 2321 2322 if (infzero) { 2323 float_raise(float_flag_invalid STATUS_VAR); 2324 return float32_default_nan; 2325 } 2326 2327 if (flags & float_muladd_negate_c) { 2328 cSign ^= 1; 2329 } 2330 2331 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 2332 2333 /* Work out the sign and type of the product */ 2334 pSign = aSign ^ bSign; 2335 if (flags & float_muladd_negate_product) { 2336 pSign ^= 1; 2337 } 2338 pInf = (aExp == 0xff) || (bExp == 0xff); 2339 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 2340 2341 if (cExp == 0xff) { 2342 if (pInf && (pSign ^ cSign)) { 2343 /* addition of opposite-signed infinities => InvalidOperation */ 2344 float_raise(float_flag_invalid STATUS_VAR); 2345 return float32_default_nan; 2346 } 2347 /* Otherwise generate an infinity of the same sign */ 2348 return packFloat32(cSign ^ signflip, 0xff, 0); 2349 } 2350 2351 if (pInf) { 2352 return packFloat32(pSign ^ signflip, 0xff, 0); 2353 } 2354 2355 if (pZero) { 2356 if (cExp == 0) { 2357 if (cSig == 0) { 2358 /* Adding two exact zeroes */ 2359 if (pSign == cSign) { 2360 zSign = pSign; 2361 } else if (STATUS(float_rounding_mode) == float_round_down) { 2362 zSign = 1; 2363 } else { 2364 zSign = 0; 2365 } 2366 return packFloat32(zSign ^ signflip, 0, 0); 2367 } 2368 /* Exact zero plus a denorm */ 2369 if (STATUS(flush_to_zero)) { 2370 float_raise(float_flag_output_denormal STATUS_VAR); 2371 return packFloat32(cSign ^ signflip, 0, 0); 2372 } 2373 } 2374 /* Zero plus something non-zero : just return the something */ 2375 return packFloat32(cSign ^ signflip, cExp, cSig); 2376 } 2377 2378 if (aExp == 0) { 2379 normalizeFloat32Subnormal(aSig, &aExp, &aSig); 2380 } 2381 if (bExp == 0) { 2382 normalizeFloat32Subnormal(bSig, &bExp, &bSig); 2383 } 2384 2385 /* Calculate the actual result a * b + c */ 2386 2387 /* Multiply first; this is easy. */ 2388 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f 2389 * because we want the true exponent, not the "one-less-than" 2390 * flavour that roundAndPackFloat32() takes. 
2391 */ 2392 pExp = aExp + bExp - 0x7e; 2393 aSig = (aSig | 0x00800000) << 7; 2394 bSig = (bSig | 0x00800000) << 8; 2395 pSig64 = (uint64_t)aSig * bSig; 2396 if ((int64_t)(pSig64 << 1) >= 0) { 2397 pSig64 <<= 1; 2398 pExp--; 2399 } 2400 2401 zSign = pSign ^ signflip; 2402 2403 /* Now pSig64 is the significand of the multiply, with the explicit bit in 2404 * position 62. 2405 */ 2406 if (cExp == 0) { 2407 if (!cSig) { 2408 /* Throw out the special case of c being an exact zero now */ 2409 shift64RightJamming(pSig64, 32, &pSig64); 2410 pSig = pSig64; 2411 return roundAndPackFloat32(zSign, pExp - 1, 2412 pSig STATUS_VAR); 2413 } 2414 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2415 } 2416 2417 cSig64 = (uint64_t)cSig << (62 - 23); 2418 cSig64 |= LIT64(0x4000000000000000); 2419 expDiff = pExp - cExp; 2420 2421 if (pSign == cSign) { 2422 /* Addition */ 2423 if (expDiff > 0) { 2424 /* scale c to match p */ 2425 shift64RightJamming(cSig64, expDiff, &cSig64); 2426 zExp = pExp; 2427 } else if (expDiff < 0) { 2428 /* scale p to match c */ 2429 shift64RightJamming(pSig64, -expDiff, &pSig64); 2430 zExp = cExp; 2431 } else { 2432 /* no scaling needed */ 2433 zExp = cExp; 2434 } 2435 /* Add significands and make sure explicit bit ends up in posn 62 */ 2436 zSig64 = pSig64 + cSig64; 2437 if ((int64_t)zSig64 < 0) { 2438 shift64RightJamming(zSig64, 1, &zSig64); 2439 } else { 2440 zExp--; 2441 } 2442 } else { 2443 /* Subtraction */ 2444 if (expDiff > 0) { 2445 shift64RightJamming(cSig64, expDiff, &cSig64); 2446 zSig64 = pSig64 - cSig64; 2447 zExp = pExp; 2448 } else if (expDiff < 0) { 2449 shift64RightJamming(pSig64, -expDiff, &pSig64); 2450 zSig64 = cSig64 - pSig64; 2451 zExp = cExp; 2452 zSign ^= 1; 2453 } else { 2454 zExp = pExp; 2455 if (cSig64 < pSig64) { 2456 zSig64 = pSig64 - cSig64; 2457 } else if (pSig64 < cSig64) { 2458 zSig64 = cSig64 - pSig64; 2459 zSign ^= 1; 2460 } else { 2461 /* Exact zero */ 2462 zSign = signflip; 2463 if (STATUS(float_rounding_mode) == 
float_round_down) { 2464 zSign ^= 1; 2465 } 2466 return packFloat32(zSign, 0, 0); 2467 } 2468 } 2469 --zExp; 2470 /* Normalize to put the explicit bit back into bit 62. */ 2471 shiftcount = countLeadingZeros64(zSig64) - 1; 2472 zSig64 <<= shiftcount; 2473 zExp -= shiftcount; 2474 } 2475 shift64RightJamming(zSig64, 32, &zSig64); 2476 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR); 2477 } 2478 2479 2480 /*---------------------------------------------------------------------------- 2481 | Returns the square root of the single-precision floating-point value `a'. 2482 | The operation is performed according to the IEC/IEEE Standard for Binary 2483 | Floating-Point Arithmetic. 2484 *----------------------------------------------------------------------------*/ 2485 2486 float32 float32_sqrt( float32 a STATUS_PARAM ) 2487 { 2488 flag aSign; 2489 int_fast16_t aExp, zExp; 2490 uint32_t aSig, zSig; 2491 uint64_t rem, term; 2492 a = float32_squash_input_denormal(a STATUS_VAR); 2493 2494 aSig = extractFloat32Frac( a ); 2495 aExp = extractFloat32Exp( a ); 2496 aSign = extractFloat32Sign( a ); 2497 if ( aExp == 0xFF ) { 2498 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2499 if ( ! 
aSign ) return a; 2500 float_raise( float_flag_invalid STATUS_VAR); 2501 return float32_default_nan; 2502 } 2503 if ( aSign ) { 2504 if ( ( aExp | aSig ) == 0 ) return a; 2505 float_raise( float_flag_invalid STATUS_VAR); 2506 return float32_default_nan; 2507 } 2508 if ( aExp == 0 ) { 2509 if ( aSig == 0 ) return float32_zero; 2510 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2511 } 2512 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2513 aSig = ( aSig | 0x00800000 )<<8; 2514 zSig = estimateSqrt32( aExp, aSig ) + 2; 2515 if ( ( zSig & 0x7F ) <= 5 ) { 2516 if ( zSig < 2 ) { 2517 zSig = 0x7FFFFFFF; 2518 goto roundAndPack; 2519 } 2520 aSig >>= aExp & 1; 2521 term = ( (uint64_t) zSig ) * zSig; 2522 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2523 while ( (int64_t) rem < 0 ) { 2524 --zSig; 2525 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2526 } 2527 zSig |= ( rem != 0 ); 2528 } 2529 shift32RightJamming( zSig, 1, &zSig ); 2530 roundAndPack: 2531 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); 2532 2533 } 2534 2535 /*---------------------------------------------------------------------------- 2536 | Returns the binary exponential of the single-precision floating-point value 2537 | `a'. The operation is performed according to the IEC/IEEE Standard for 2538 | Binary Floating-Point Arithmetic. 2539 | 2540 | Uses the following identities: 2541 | 2542 | 1. ------------------------------------------------------------------------- 2543 | x x*ln(2) 2544 | 2 = e 2545 | 2546 | 2. ------------------------------------------------------------------------- 2547 | 2 3 4 5 n 2548 | x x x x x x x 2549 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2550 | 1! 2! 3! 4! 5! n! 
2551 *----------------------------------------------------------------------------*/ 2552 2553 static const float64 float32_exp2_coefficients[15] = 2554 { 2555 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2556 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2557 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2558 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2559 const_float64( 0x3f81111111111111ll ), /* 5 */ 2560 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2561 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2562 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2563 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2564 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2565 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2566 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2567 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2568 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2569 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2570 }; 2571 2572 float32 float32_exp2( float32 a STATUS_PARAM ) 2573 { 2574 flag aSign; 2575 int_fast16_t aExp; 2576 uint32_t aSig; 2577 float64 r, x, xn; 2578 int i; 2579 a = float32_squash_input_denormal(a STATUS_VAR); 2580 2581 aSig = extractFloat32Frac( a ); 2582 aExp = extractFloat32Exp( a ); 2583 aSign = extractFloat32Sign( a ); 2584 2585 if ( aExp == 0xFF) { 2586 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2587 return (aSign) ? 
float32_zero : a; 2588 } 2589 if (aExp == 0) { 2590 if (aSig == 0) return float32_one; 2591 } 2592 2593 float_raise( float_flag_inexact STATUS_VAR); 2594 2595 /* ******************************* */ 2596 /* using float64 for approximation */ 2597 /* ******************************* */ 2598 x = float32_to_float64(a STATUS_VAR); 2599 x = float64_mul(x, float64_ln2 STATUS_VAR); 2600 2601 xn = x; 2602 r = float64_one; 2603 for (i = 0 ; i < 15 ; i++) { 2604 float64 f; 2605 2606 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR); 2607 r = float64_add(r, f STATUS_VAR); 2608 2609 xn = float64_mul(xn, x STATUS_VAR); 2610 } 2611 2612 return float64_to_float32(r, status); 2613 } 2614 2615 /*---------------------------------------------------------------------------- 2616 | Returns the binary log of the single-precision floating-point value `a'. 2617 | The operation is performed according to the IEC/IEEE Standard for Binary 2618 | Floating-Point Arithmetic. 2619 *----------------------------------------------------------------------------*/ 2620 float32 float32_log2( float32 a STATUS_PARAM ) 2621 { 2622 flag aSign, zSign; 2623 int_fast16_t aExp; 2624 uint32_t aSig, zSig, i; 2625 2626 a = float32_squash_input_denormal(a STATUS_VAR); 2627 aSig = extractFloat32Frac( a ); 2628 aExp = extractFloat32Exp( a ); 2629 aSign = extractFloat32Sign( a ); 2630 2631 if ( aExp == 0 ) { 2632 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2633 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2634 } 2635 if ( aSign ) { 2636 float_raise( float_flag_invalid STATUS_VAR); 2637 return float32_default_nan; 2638 } 2639 if ( aExp == 0xFF ) { 2640 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); 2641 return a; 2642 } 2643 2644 aExp -= 0x7F; 2645 aSig |= 0x00800000; 2646 zSign = aExp < 0; 2647 zSig = aExp << 23; 2648 2649 for (i = 1 << 22; i > 0; i >>= 1) { 2650 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2651 if ( aSig & 0x01000000 ) { 2652 aSig >>= 1; 2653 zSig |= i; 2654 } 
2655 } 2656 2657 if ( zSign ) 2658 zSig = -zSig; 2659 2660 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR ); 2661 } 2662 2663 /*---------------------------------------------------------------------------- 2664 | Returns 1 if the single-precision floating-point value `a' is equal to 2665 | the corresponding value `b', and 0 otherwise. The invalid exception is 2666 | raised if either operand is a NaN. Otherwise, the comparison is performed 2667 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2668 *----------------------------------------------------------------------------*/ 2669 2670 int float32_eq( float32 a, float32 b STATUS_PARAM ) 2671 { 2672 uint32_t av, bv; 2673 a = float32_squash_input_denormal(a STATUS_VAR); 2674 b = float32_squash_input_denormal(b STATUS_VAR); 2675 2676 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2677 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2678 ) { 2679 float_raise( float_flag_invalid STATUS_VAR); 2680 return 0; 2681 } 2682 av = float32_val(a); 2683 bv = float32_val(b); 2684 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2685 } 2686 2687 /*---------------------------------------------------------------------------- 2688 | Returns 1 if the single-precision floating-point value `a' is less than 2689 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2690 | exception is raised if either operand is a NaN. The comparison is performed 2691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2692 *----------------------------------------------------------------------------*/ 2693 2694 int float32_le( float32 a, float32 b STATUS_PARAM ) 2695 { 2696 flag aSign, bSign; 2697 uint32_t av, bv; 2698 a = float32_squash_input_denormal(a STATUS_VAR); 2699 b = float32_squash_input_denormal(b STATUS_VAR); 2700 2701 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2702 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2703 ) { 2704 float_raise( float_flag_invalid STATUS_VAR); 2705 return 0; 2706 } 2707 aSign = extractFloat32Sign( a ); 2708 bSign = extractFloat32Sign( b ); 2709 av = float32_val(a); 2710 bv = float32_val(b); 2711 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2712 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2713 2714 } 2715 2716 /*---------------------------------------------------------------------------- 2717 | Returns 1 if the single-precision floating-point value `a' is less than 2718 | the corresponding value `b', and 0 otherwise. The invalid exception is 2719 | raised if either operand is a NaN. The comparison is performed according 2720 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2721 *----------------------------------------------------------------------------*/ 2722 2723 int float32_lt( float32 a, float32 b STATUS_PARAM ) 2724 { 2725 flag aSign, bSign; 2726 uint32_t av, bv; 2727 a = float32_squash_input_denormal(a STATUS_VAR); 2728 b = float32_squash_input_denormal(b STATUS_VAR); 2729 2730 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2731 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2732 ) { 2733 float_raise( float_flag_invalid STATUS_VAR); 2734 return 0; 2735 } 2736 aSign = extractFloat32Sign( a ); 2737 bSign = extractFloat32Sign( b ); 2738 av = float32_val(a); 2739 bv = float32_val(b); 2740 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2741 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2742 2743 } 2744 2745 /*---------------------------------------------------------------------------- 2746 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2747 | be compared, and 0 otherwise. The invalid exception is raised if either 2748 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2749 | Standard for Binary Floating-Point Arithmetic. 2750 *----------------------------------------------------------------------------*/ 2751 2752 int float32_unordered( float32 a, float32 b STATUS_PARAM ) 2753 { 2754 a = float32_squash_input_denormal(a STATUS_VAR); 2755 b = float32_squash_input_denormal(b STATUS_VAR); 2756 2757 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2758 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2759 ) { 2760 float_raise( float_flag_invalid STATUS_VAR); 2761 return 1; 2762 } 2763 return 0; 2764 } 2765 2766 /*---------------------------------------------------------------------------- 2767 | Returns 1 if the single-precision floating-point value `a' is equal to 2768 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2769 | exception. 
The comparison is performed according to the IEC/IEEE Standard 2770 | for Binary Floating-Point Arithmetic. 2771 *----------------------------------------------------------------------------*/ 2772 2773 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM ) 2774 { 2775 a = float32_squash_input_denormal(a STATUS_VAR); 2776 b = float32_squash_input_denormal(b STATUS_VAR); 2777 2778 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2779 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2780 ) { 2781 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2782 float_raise( float_flag_invalid STATUS_VAR); 2783 } 2784 return 0; 2785 } 2786 return ( float32_val(a) == float32_val(b) ) || 2787 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2788 } 2789 2790 /*---------------------------------------------------------------------------- 2791 | Returns 1 if the single-precision floating-point value `a' is less than or 2792 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2793 | cause an exception. Otherwise, the comparison is performed according to the 2794 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2795 *----------------------------------------------------------------------------*/ 2796 2797 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) 2798 { 2799 flag aSign, bSign; 2800 uint32_t av, bv; 2801 a = float32_squash_input_denormal(a STATUS_VAR); 2802 b = float32_squash_input_denormal(b STATUS_VAR); 2803 2804 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2805 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2806 ) { 2807 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2808 float_raise( float_flag_invalid STATUS_VAR); 2809 } 2810 return 0; 2811 } 2812 aSign = extractFloat32Sign( a ); 2813 bSign = extractFloat32Sign( b ); 2814 av = float32_val(a); 2815 bv = float32_val(b); 2816 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2817 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2818 2819 } 2820 2821 /*---------------------------------------------------------------------------- 2822 | Returns 1 if the single-precision floating-point value `a' is less than 2823 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2824 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 2825 | Standard for Binary Floating-Point Arithmetic. 
2826 *----------------------------------------------------------------------------*/ 2827 2828 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) 2829 { 2830 flag aSign, bSign; 2831 uint32_t av, bv; 2832 a = float32_squash_input_denormal(a STATUS_VAR); 2833 b = float32_squash_input_denormal(b STATUS_VAR); 2834 2835 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2836 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2837 ) { 2838 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2839 float_raise( float_flag_invalid STATUS_VAR); 2840 } 2841 return 0; 2842 } 2843 aSign = extractFloat32Sign( a ); 2844 bSign = extractFloat32Sign( b ); 2845 av = float32_val(a); 2846 bv = float32_val(b); 2847 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2848 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2849 2850 } 2851 2852 /*---------------------------------------------------------------------------- 2853 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2854 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 2855 | comparison is performed according to the IEC/IEEE Standard for Binary 2856 | Floating-Point Arithmetic. 
2857 *----------------------------------------------------------------------------*/ 2858 2859 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM ) 2860 { 2861 a = float32_squash_input_denormal(a STATUS_VAR); 2862 b = float32_squash_input_denormal(b STATUS_VAR); 2863 2864 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2865 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2866 ) { 2867 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2868 float_raise( float_flag_invalid STATUS_VAR); 2869 } 2870 return 1; 2871 } 2872 return 0; 2873 } 2874 2875 /*---------------------------------------------------------------------------- 2876 | Returns the result of converting the double-precision floating-point value 2877 | `a' to the 32-bit two's complement integer format. The conversion is 2878 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2879 | Arithmetic---which means in particular that the conversion is rounded 2880 | according to the current rounding mode. If `a' is a NaN, the largest 2881 | positive integer is returned. Otherwise, if the conversion overflows, the 2882 | largest integer with the same sign as `a' is returned. 
2883 *----------------------------------------------------------------------------*/ 2884 2885 int32 float64_to_int32( float64 a STATUS_PARAM ) 2886 { 2887 flag aSign; 2888 int_fast16_t aExp, shiftCount; 2889 uint64_t aSig; 2890 a = float64_squash_input_denormal(a STATUS_VAR); 2891 2892 aSig = extractFloat64Frac( a ); 2893 aExp = extractFloat64Exp( a ); 2894 aSign = extractFloat64Sign( a ); 2895 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2896 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 2897 shiftCount = 0x42C - aExp; 2898 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 2899 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 2900 2901 } 2902 2903 /*---------------------------------------------------------------------------- 2904 | Returns the result of converting the double-precision floating-point value 2905 | `a' to the 32-bit two's complement integer format. The conversion is 2906 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2907 | Arithmetic, except that the conversion is always rounded toward zero. 2908 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2909 | the conversion overflows, the largest integer with the same sign as `a' is 2910 | returned. 
2911 *----------------------------------------------------------------------------*/ 2912 2913 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) 2914 { 2915 flag aSign; 2916 int_fast16_t aExp, shiftCount; 2917 uint64_t aSig, savedASig; 2918 int32_t z; 2919 a = float64_squash_input_denormal(a STATUS_VAR); 2920 2921 aSig = extractFloat64Frac( a ); 2922 aExp = extractFloat64Exp( a ); 2923 aSign = extractFloat64Sign( a ); 2924 if ( 0x41E < aExp ) { 2925 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 2926 goto invalid; 2927 } 2928 else if ( aExp < 0x3FF ) { 2929 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 2930 return 0; 2931 } 2932 aSig |= LIT64( 0x0010000000000000 ); 2933 shiftCount = 0x433 - aExp; 2934 savedASig = aSig; 2935 aSig >>= shiftCount; 2936 z = aSig; 2937 if ( aSign ) z = - z; 2938 if ( ( z < 0 ) ^ aSign ) { 2939 invalid: 2940 float_raise( float_flag_invalid STATUS_VAR); 2941 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2942 } 2943 if ( ( aSig<<shiftCount ) != savedASig ) { 2944 STATUS(float_exception_flags) |= float_flag_inexact; 2945 } 2946 return z; 2947 2948 } 2949 2950 /*---------------------------------------------------------------------------- 2951 | Returns the result of converting the double-precision floating-point value 2952 | `a' to the 16-bit two's complement integer format. The conversion is 2953 | performed according to the IEC/IEEE Standard for Binary Floating-Point 2954 | Arithmetic, except that the conversion is always rounded toward zero. 2955 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 2956 | the conversion overflows, the largest integer with the same sign as `a' is 2957 | returned. 
2958 *----------------------------------------------------------------------------*/ 2959 2960 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM) 2961 { 2962 flag aSign; 2963 int_fast16_t aExp, shiftCount; 2964 uint64_t aSig, savedASig; 2965 int32 z; 2966 2967 aSig = extractFloat64Frac( a ); 2968 aExp = extractFloat64Exp( a ); 2969 aSign = extractFloat64Sign( a ); 2970 if ( 0x40E < aExp ) { 2971 if ( ( aExp == 0x7FF ) && aSig ) { 2972 aSign = 0; 2973 } 2974 goto invalid; 2975 } 2976 else if ( aExp < 0x3FF ) { 2977 if ( aExp || aSig ) { 2978 STATUS(float_exception_flags) |= float_flag_inexact; 2979 } 2980 return 0; 2981 } 2982 aSig |= LIT64( 0x0010000000000000 ); 2983 shiftCount = 0x433 - aExp; 2984 savedASig = aSig; 2985 aSig >>= shiftCount; 2986 z = aSig; 2987 if ( aSign ) { 2988 z = - z; 2989 } 2990 if ( ( (int16_t)z < 0 ) ^ aSign ) { 2991 invalid: 2992 float_raise( float_flag_invalid STATUS_VAR); 2993 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 2994 } 2995 if ( ( aSig<<shiftCount ) != savedASig ) { 2996 STATUS(float_exception_flags) |= float_flag_inexact; 2997 } 2998 return z; 2999 } 3000 3001 /*---------------------------------------------------------------------------- 3002 | Returns the result of converting the double-precision floating-point value 3003 | `a' to the 64-bit two's complement integer format. The conversion is 3004 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3005 | Arithmetic---which means in particular that the conversion is rounded 3006 | according to the current rounding mode. If `a' is a NaN, the largest 3007 | positive integer is returned. Otherwise, if the conversion overflows, the 3008 | largest integer with the same sign as `a' is returned. 
3009 *----------------------------------------------------------------------------*/ 3010 3011 int64 float64_to_int64( float64 a STATUS_PARAM ) 3012 { 3013 flag aSign; 3014 int_fast16_t aExp, shiftCount; 3015 uint64_t aSig, aSigExtra; 3016 a = float64_squash_input_denormal(a STATUS_VAR); 3017 3018 aSig = extractFloat64Frac( a ); 3019 aExp = extractFloat64Exp( a ); 3020 aSign = extractFloat64Sign( a ); 3021 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3022 shiftCount = 0x433 - aExp; 3023 if ( shiftCount <= 0 ) { 3024 if ( 0x43E < aExp ) { 3025 float_raise( float_flag_invalid STATUS_VAR); 3026 if ( ! aSign 3027 || ( ( aExp == 0x7FF ) 3028 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3029 ) { 3030 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3031 } 3032 return (int64_t) LIT64( 0x8000000000000000 ); 3033 } 3034 aSigExtra = 0; 3035 aSig <<= - shiftCount; 3036 } 3037 else { 3038 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3039 } 3040 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 3041 3042 } 3043 3044 /*---------------------------------------------------------------------------- 3045 | Returns the result of converting the double-precision floating-point value 3046 | `a' to the 64-bit two's complement integer format. The conversion is 3047 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3048 | Arithmetic, except that the conversion is always rounded toward zero. 3049 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3050 | the conversion overflows, the largest integer with the same sign as `a' is 3051 | returned. 
3052 *----------------------------------------------------------------------------*/ 3053 3054 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) 3055 { 3056 flag aSign; 3057 int_fast16_t aExp, shiftCount; 3058 uint64_t aSig; 3059 int64 z; 3060 a = float64_squash_input_denormal(a STATUS_VAR); 3061 3062 aSig = extractFloat64Frac( a ); 3063 aExp = extractFloat64Exp( a ); 3064 aSign = extractFloat64Sign( a ); 3065 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3066 shiftCount = aExp - 0x433; 3067 if ( 0 <= shiftCount ) { 3068 if ( 0x43E <= aExp ) { 3069 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3070 float_raise( float_flag_invalid STATUS_VAR); 3071 if ( ! aSign 3072 || ( ( aExp == 0x7FF ) 3073 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3074 ) { 3075 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3076 } 3077 } 3078 return (int64_t) LIT64( 0x8000000000000000 ); 3079 } 3080 z = aSig<<shiftCount; 3081 } 3082 else { 3083 if ( aExp < 0x3FE ) { 3084 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 3085 return 0; 3086 } 3087 z = aSig>>( - shiftCount ); 3088 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3089 STATUS(float_exception_flags) |= float_flag_inexact; 3090 } 3091 } 3092 if ( aSign ) z = - z; 3093 return z; 3094 3095 } 3096 3097 /*---------------------------------------------------------------------------- 3098 | Returns the result of converting the double-precision floating-point value 3099 | `a' to the single-precision floating-point format. The conversion is 3100 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3101 | Arithmetic. 
3102 *----------------------------------------------------------------------------*/ 3103 3104 float32 float64_to_float32( float64 a STATUS_PARAM ) 3105 { 3106 flag aSign; 3107 int_fast16_t aExp; 3108 uint64_t aSig; 3109 uint32_t zSig; 3110 a = float64_squash_input_denormal(a STATUS_VAR); 3111 3112 aSig = extractFloat64Frac( a ); 3113 aExp = extractFloat64Exp( a ); 3114 aSign = extractFloat64Sign( a ); 3115 if ( aExp == 0x7FF ) { 3116 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3117 return packFloat32( aSign, 0xFF, 0 ); 3118 } 3119 shift64RightJamming( aSig, 22, &aSig ); 3120 zSig = aSig; 3121 if ( aExp || zSig ) { 3122 zSig |= 0x40000000; 3123 aExp -= 0x381; 3124 } 3125 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 3126 3127 } 3128 3129 3130 /*---------------------------------------------------------------------------- 3131 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3132 | half-precision floating-point value, returning the result. After being 3133 | shifted into the proper positions, the three fields are simply added 3134 | together to form the result. This means that any integer portion of `zSig' 3135 | will be added into the exponent. Since a properly normalized significand 3136 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3137 | than the desired result exponent whenever `zSig' is a complete, normalized 3138 | significand. 
3139 *----------------------------------------------------------------------------*/ 3140 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig) 3141 { 3142 return make_float16( 3143 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3144 } 3145 3146 /*---------------------------------------------------------------------------- 3147 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3148 | and significand `zSig', and returns the proper half-precision floating- 3149 | point value corresponding to the abstract input. Ordinarily, the abstract 3150 | value is simply rounded and packed into the half-precision format, with 3151 | the inexact exception raised if the abstract input cannot be represented 3152 | exactly. However, if the abstract value is too large, the overflow and 3153 | inexact exceptions are raised and an infinity or maximal finite value is 3154 | returned. If the abstract value is too small, the input value is rounded to 3155 | a subnormal number, and the underflow and inexact exceptions are raised if 3156 | the abstract input cannot be represented exactly as a subnormal half- 3157 | precision floating-point number. 3158 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3159 | ARM-style "alternative representation", which omits the NaN and Inf 3160 | encodings in order to raise the maximum representable exponent by one. 3161 | The input significand `zSig' has its binary point between bits 22 3162 | and 23, which is 13 bits to the left of the usual location. This shifted 3163 | significand must be normalized or smaller. If `zSig' is not normalized, 3164 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3165 | and it must not require rounding. In the usual case that `zSig' is 3166 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3167 | Note the slightly odd position of the binary point in zSig compared with the 3168 | other roundAndPackFloat functions. This should probably be fixed if we 3169 | need to implement more float16 routines than just conversion. 3170 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3171 | Binary Floating-Point Arithmetic. 3172 *----------------------------------------------------------------------------*/ 3173 3174 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp, 3175 uint32_t zSig, flag ieee STATUS_PARAM) 3176 { 3177 int maxexp = ieee ? 29 : 30; 3178 uint32_t mask; 3179 uint32_t increment; 3180 bool rounding_bumps_exp; 3181 bool is_tiny = false; 3182 3183 /* Calculate the mask of bits of the mantissa which are not 3184 * representable in half-precision and will be lost. 3185 */ 3186 if (zExp < 1) { 3187 /* Will be denormal in halfprec */ 3188 mask = 0x00ffffff; 3189 if (zExp >= -11) { 3190 mask >>= 11 + zExp; 3191 } 3192 } else { 3193 /* Normal number in halfprec */ 3194 mask = 0x00001fff; 3195 } 3196 3197 switch (STATUS(float_rounding_mode)) { 3198 case float_round_nearest_even: 3199 increment = (mask + 1) >> 1; 3200 if ((zSig & mask) == increment) { 3201 increment = zSig & (increment << 1); 3202 } 3203 break; 3204 case float_round_ties_away: 3205 increment = (mask + 1) >> 1; 3206 break; 3207 case float_round_up: 3208 increment = zSign ? 0 : mask; 3209 break; 3210 case float_round_down: 3211 increment = zSign ? 
mask : 0; 3212 break; 3213 default: /* round_to_zero */ 3214 increment = 0; 3215 break; 3216 } 3217 3218 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3219 3220 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3221 if (ieee) { 3222 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR); 3223 return packFloat16(zSign, 0x1f, 0); 3224 } else { 3225 float_raise(float_flag_invalid STATUS_VAR); 3226 return packFloat16(zSign, 0x1f, 0x3ff); 3227 } 3228 } 3229 3230 if (zExp < 0) { 3231 /* Note that flush-to-zero does not affect half-precision results */ 3232 is_tiny = 3233 (STATUS(float_detect_tininess) == float_tininess_before_rounding) 3234 || (zExp < -1) 3235 || (!rounding_bumps_exp); 3236 } 3237 if (zSig & mask) { 3238 float_raise(float_flag_inexact STATUS_VAR); 3239 if (is_tiny) { 3240 float_raise(float_flag_underflow STATUS_VAR); 3241 } 3242 } 3243 3244 zSig += increment; 3245 if (rounding_bumps_exp) { 3246 zSig >>= 1; 3247 zExp++; 3248 } 3249 3250 if (zExp < -10) { 3251 return packFloat16(zSign, 0, 0); 3252 } 3253 if (zExp < 0) { 3254 zSig >>= -zExp; 3255 zExp = 0; 3256 } 3257 return packFloat16(zSign, zExp, zSig >> 13); 3258 } 3259 3260 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, 3261 uint32_t *zSigPtr) 3262 { 3263 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3264 *zSigPtr = aSig << shiftCount; 3265 *zExpPtr = 1 - shiftCount; 3266 } 3267 3268 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3269 The latter gains extra exponent range by omitting the NaN/Inf encodings. 
*/ 3270 3271 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM) 3272 { 3273 flag aSign; 3274 int_fast16_t aExp; 3275 uint32_t aSig; 3276 3277 aSign = extractFloat16Sign(a); 3278 aExp = extractFloat16Exp(a); 3279 aSig = extractFloat16Frac(a); 3280 3281 if (aExp == 0x1f && ieee) { 3282 if (aSig) { 3283 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR); 3284 } 3285 return packFloat32(aSign, 0xff, 0); 3286 } 3287 if (aExp == 0) { 3288 if (aSig == 0) { 3289 return packFloat32(aSign, 0, 0); 3290 } 3291 3292 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3293 aExp--; 3294 } 3295 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3296 } 3297 3298 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM) 3299 { 3300 flag aSign; 3301 int_fast16_t aExp; 3302 uint32_t aSig; 3303 3304 a = float32_squash_input_denormal(a STATUS_VAR); 3305 3306 aSig = extractFloat32Frac( a ); 3307 aExp = extractFloat32Exp( a ); 3308 aSign = extractFloat32Sign( a ); 3309 if ( aExp == 0xFF ) { 3310 if (aSig) { 3311 /* Input is a NaN */ 3312 if (!ieee) { 3313 float_raise(float_flag_invalid STATUS_VAR); 3314 return packFloat16(aSign, 0, 0); 3315 } 3316 return commonNaNToFloat16( 3317 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR); 3318 } 3319 /* Infinity */ 3320 if (!ieee) { 3321 float_raise(float_flag_invalid STATUS_VAR); 3322 return packFloat16(aSign, 0x1f, 0x3ff); 3323 } 3324 return packFloat16(aSign, 0x1f, 0); 3325 } 3326 if (aExp == 0 && aSig == 0) { 3327 return packFloat16(aSign, 0, 0); 3328 } 3329 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3330 * even if the input is denormal; however this is harmless because 3331 * the largest possible single-precision denormal is still smaller 3332 * than the smallest representable half-precision denormal, and so we 3333 * will end up ignoring aSig and returning via the "always return zero" 3334 * codepath. 
3335 */ 3336 aSig |= 0x00800000; 3337 aExp -= 0x71; 3338 3339 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR); 3340 } 3341 3342 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM) 3343 { 3344 flag aSign; 3345 int_fast16_t aExp; 3346 uint32_t aSig; 3347 3348 aSign = extractFloat16Sign(a); 3349 aExp = extractFloat16Exp(a); 3350 aSig = extractFloat16Frac(a); 3351 3352 if (aExp == 0x1f && ieee) { 3353 if (aSig) { 3354 return commonNaNToFloat64( 3355 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR); 3356 } 3357 return packFloat64(aSign, 0x7ff, 0); 3358 } 3359 if (aExp == 0) { 3360 if (aSig == 0) { 3361 return packFloat64(aSign, 0, 0); 3362 } 3363 3364 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3365 aExp--; 3366 } 3367 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3368 } 3369 3370 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM) 3371 { 3372 flag aSign; 3373 int_fast16_t aExp; 3374 uint64_t aSig; 3375 uint32_t zSig; 3376 3377 a = float64_squash_input_denormal(a STATUS_VAR); 3378 3379 aSig = extractFloat64Frac(a); 3380 aExp = extractFloat64Exp(a); 3381 aSign = extractFloat64Sign(a); 3382 if (aExp == 0x7FF) { 3383 if (aSig) { 3384 /* Input is a NaN */ 3385 if (!ieee) { 3386 float_raise(float_flag_invalid STATUS_VAR); 3387 return packFloat16(aSign, 0, 0); 3388 } 3389 return commonNaNToFloat16( 3390 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR); 3391 } 3392 /* Infinity */ 3393 if (!ieee) { 3394 float_raise(float_flag_invalid STATUS_VAR); 3395 return packFloat16(aSign, 0x1f, 0x3ff); 3396 } 3397 return packFloat16(aSign, 0x1f, 0); 3398 } 3399 shift64RightJamming(aSig, 29, &aSig); 3400 zSig = aSig; 3401 if (aExp == 0 && zSig == 0) { 3402 return packFloat16(aSign, 0, 0); 3403 } 3404 /* Decimal point between bits 22 and 23. 
Note that we add the 1 bit 3405 * even if the input is denormal; however this is harmless because 3406 * the largest possible single-precision denormal is still smaller 3407 * than the smallest representable half-precision denormal, and so we 3408 * will end up ignoring aSig and returning via the "always return zero" 3409 * codepath. 3410 */ 3411 zSig |= 0x00800000; 3412 aExp -= 0x3F1; 3413 3414 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR); 3415 } 3416 3417 /*---------------------------------------------------------------------------- 3418 | Returns the result of converting the double-precision floating-point value 3419 | `a' to the extended double-precision floating-point format. The conversion 3420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3421 | Arithmetic. 3422 *----------------------------------------------------------------------------*/ 3423 3424 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) 3425 { 3426 flag aSign; 3427 int_fast16_t aExp; 3428 uint64_t aSig; 3429 3430 a = float64_squash_input_denormal(a STATUS_VAR); 3431 aSig = extractFloat64Frac( a ); 3432 aExp = extractFloat64Exp( a ); 3433 aSign = extractFloat64Sign( a ); 3434 if ( aExp == 0x7FF ) { 3435 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3436 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3437 } 3438 if ( aExp == 0 ) { 3439 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3440 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3441 } 3442 return 3443 packFloatx80( 3444 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3445 3446 } 3447 3448 /*---------------------------------------------------------------------------- 3449 | Returns the result of converting the double-precision floating-point value 3450 | `a' to the quadruple-precision floating-point format. 
The conversion is 3451 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3452 | Arithmetic. 3453 *----------------------------------------------------------------------------*/ 3454 3455 float128 float64_to_float128( float64 a STATUS_PARAM ) 3456 { 3457 flag aSign; 3458 int_fast16_t aExp; 3459 uint64_t aSig, zSig0, zSig1; 3460 3461 a = float64_squash_input_denormal(a STATUS_VAR); 3462 aSig = extractFloat64Frac( a ); 3463 aExp = extractFloat64Exp( a ); 3464 aSign = extractFloat64Sign( a ); 3465 if ( aExp == 0x7FF ) { 3466 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 3467 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3468 } 3469 if ( aExp == 0 ) { 3470 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3471 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3472 --aExp; 3473 } 3474 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3475 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3476 3477 } 3478 3479 /*---------------------------------------------------------------------------- 3480 | Rounds the double-precision floating-point value `a' to an integer, and 3481 | returns the result as a double-precision floating-point value. The 3482 | operation is performed according to the IEC/IEEE Standard for Binary 3483 | Floating-Point Arithmetic. 
3484 *----------------------------------------------------------------------------*/ 3485 3486 float64 float64_round_to_int( float64 a STATUS_PARAM ) 3487 { 3488 flag aSign; 3489 int_fast16_t aExp; 3490 uint64_t lastBitMask, roundBitsMask; 3491 uint64_t z; 3492 a = float64_squash_input_denormal(a STATUS_VAR); 3493 3494 aExp = extractFloat64Exp( a ); 3495 if ( 0x433 <= aExp ) { 3496 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3497 return propagateFloat64NaN( a, a STATUS_VAR ); 3498 } 3499 return a; 3500 } 3501 if ( aExp < 0x3FF ) { 3502 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3503 STATUS(float_exception_flags) |= float_flag_inexact; 3504 aSign = extractFloat64Sign( a ); 3505 switch ( STATUS(float_rounding_mode) ) { 3506 case float_round_nearest_even: 3507 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3508 return packFloat64( aSign, 0x3FF, 0 ); 3509 } 3510 break; 3511 case float_round_ties_away: 3512 if (aExp == 0x3FE) { 3513 return packFloat64(aSign, 0x3ff, 0); 3514 } 3515 break; 3516 case float_round_down: 3517 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3518 case float_round_up: 3519 return make_float64( 3520 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3521 } 3522 return packFloat64( aSign, 0, 0 ); 3523 } 3524 lastBitMask = 1; 3525 lastBitMask <<= 0x433 - aExp; 3526 roundBitsMask = lastBitMask - 1; 3527 z = float64_val(a); 3528 switch (STATUS(float_rounding_mode)) { 3529 case float_round_nearest_even: 3530 z += lastBitMask >> 1; 3531 if ((z & roundBitsMask) == 0) { 3532 z &= ~lastBitMask; 3533 } 3534 break; 3535 case float_round_ties_away: 3536 z += lastBitMask >> 1; 3537 break; 3538 case float_round_to_zero: 3539 break; 3540 case float_round_up: 3541 if (!extractFloat64Sign(make_float64(z))) { 3542 z += roundBitsMask; 3543 } 3544 break; 3545 case float_round_down: 3546 if (extractFloat64Sign(make_float64(z))) { 3547 z += roundBitsMask; 3548 } 3549 break; 3550 default: 3551 abort(); 3552 } 3553 z &= ~ roundBitsMask; 3554 if ( z != float64_val(a) ) 3555 STATUS(float_exception_flags) |= float_flag_inexact; 3556 return make_float64(z); 3557 3558 } 3559 3560 float64 float64_trunc_to_int( float64 a STATUS_PARAM) 3561 { 3562 int oldmode; 3563 float64 res; 3564 oldmode = STATUS(float_rounding_mode); 3565 STATUS(float_rounding_mode) = float_round_to_zero; 3566 res = float64_round_to_int(a STATUS_VAR); 3567 STATUS(float_rounding_mode) = oldmode; 3568 return res; 3569 } 3570 3571 /*---------------------------------------------------------------------------- 3572 | Returns the result of adding the absolute values of the double-precision 3573 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3574 | before being returned. `zSign' is ignored if the result is a NaN. 3575 | The addition is performed according to the IEC/IEEE Standard for Binary 3576 | Floating-Point Arithmetic. 
3577 *----------------------------------------------------------------------------*/ 3578 3579 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3580 { 3581 int_fast16_t aExp, bExp, zExp; 3582 uint64_t aSig, bSig, zSig; 3583 int_fast16_t expDiff; 3584 3585 aSig = extractFloat64Frac( a ); 3586 aExp = extractFloat64Exp( a ); 3587 bSig = extractFloat64Frac( b ); 3588 bExp = extractFloat64Exp( b ); 3589 expDiff = aExp - bExp; 3590 aSig <<= 9; 3591 bSig <<= 9; 3592 if ( 0 < expDiff ) { 3593 if ( aExp == 0x7FF ) { 3594 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3595 return a; 3596 } 3597 if ( bExp == 0 ) { 3598 --expDiff; 3599 } 3600 else { 3601 bSig |= LIT64( 0x2000000000000000 ); 3602 } 3603 shift64RightJamming( bSig, expDiff, &bSig ); 3604 zExp = aExp; 3605 } 3606 else if ( expDiff < 0 ) { 3607 if ( bExp == 0x7FF ) { 3608 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3609 return packFloat64( zSign, 0x7FF, 0 ); 3610 } 3611 if ( aExp == 0 ) { 3612 ++expDiff; 3613 } 3614 else { 3615 aSig |= LIT64( 0x2000000000000000 ); 3616 } 3617 shift64RightJamming( aSig, - expDiff, &aSig ); 3618 zExp = bExp; 3619 } 3620 else { 3621 if ( aExp == 0x7FF ) { 3622 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3623 return a; 3624 } 3625 if ( aExp == 0 ) { 3626 if (STATUS(flush_to_zero)) { 3627 if (aSig | bSig) { 3628 float_raise(float_flag_output_denormal STATUS_VAR); 3629 } 3630 return packFloat64(zSign, 0, 0); 3631 } 3632 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3633 } 3634 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3635 zExp = aExp; 3636 goto roundAndPack; 3637 } 3638 aSig |= LIT64( 0x2000000000000000 ); 3639 zSig = ( aSig + bSig )<<1; 3640 --zExp; 3641 if ( (int64_t) zSig < 0 ) { 3642 zSig = aSig + bSig; 3643 ++zExp; 3644 } 3645 roundAndPack: 3646 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3647 3648 } 3649 3650 
/*---------------------------------------------------------------------------- 3651 | Returns the result of subtracting the absolute values of the double- 3652 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3653 | difference is negated before being returned. `zSign' is ignored if the 3654 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3655 | Standard for Binary Floating-Point Arithmetic. 3656 *----------------------------------------------------------------------------*/ 3657 3658 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) 3659 { 3660 int_fast16_t aExp, bExp, zExp; 3661 uint64_t aSig, bSig, zSig; 3662 int_fast16_t expDiff; 3663 3664 aSig = extractFloat64Frac( a ); 3665 aExp = extractFloat64Exp( a ); 3666 bSig = extractFloat64Frac( b ); 3667 bExp = extractFloat64Exp( b ); 3668 expDiff = aExp - bExp; 3669 aSig <<= 10; 3670 bSig <<= 10; 3671 if ( 0 < expDiff ) goto aExpBigger; 3672 if ( expDiff < 0 ) goto bExpBigger; 3673 if ( aExp == 0x7FF ) { 3674 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3675 float_raise( float_flag_invalid STATUS_VAR); 3676 return float64_default_nan; 3677 } 3678 if ( aExp == 0 ) { 3679 aExp = 1; 3680 bExp = 1; 3681 } 3682 if ( bSig < aSig ) goto aBigger; 3683 if ( aSig < bSig ) goto bBigger; 3684 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 3685 bExpBigger: 3686 if ( bExp == 0x7FF ) { 3687 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3688 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3689 } 3690 if ( aExp == 0 ) { 3691 ++expDiff; 3692 } 3693 else { 3694 aSig |= LIT64( 0x4000000000000000 ); 3695 } 3696 shift64RightJamming( aSig, - expDiff, &aSig ); 3697 bSig |= LIT64( 0x4000000000000000 ); 3698 bBigger: 3699 zSig = bSig - aSig; 3700 zExp = bExp; 3701 zSign ^= 1; 3702 goto normalizeRoundAndPack; 3703 aExpBigger: 3704 if ( aExp == 0x7FF ) { 3705 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR 
); 3706 return a; 3707 } 3708 if ( bExp == 0 ) { 3709 --expDiff; 3710 } 3711 else { 3712 bSig |= LIT64( 0x4000000000000000 ); 3713 } 3714 shift64RightJamming( bSig, expDiff, &bSig ); 3715 aSig |= LIT64( 0x4000000000000000 ); 3716 aBigger: 3717 zSig = aSig - bSig; 3718 zExp = aExp; 3719 normalizeRoundAndPack: 3720 --zExp; 3721 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3722 3723 } 3724 3725 /*---------------------------------------------------------------------------- 3726 | Returns the result of adding the double-precision floating-point values `a' 3727 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3728 | Binary Floating-Point Arithmetic. 3729 *----------------------------------------------------------------------------*/ 3730 3731 float64 float64_add( float64 a, float64 b STATUS_PARAM ) 3732 { 3733 flag aSign, bSign; 3734 a = float64_squash_input_denormal(a STATUS_VAR); 3735 b = float64_squash_input_denormal(b STATUS_VAR); 3736 3737 aSign = extractFloat64Sign( a ); 3738 bSign = extractFloat64Sign( b ); 3739 if ( aSign == bSign ) { 3740 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3741 } 3742 else { 3743 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3744 } 3745 3746 } 3747 3748 /*---------------------------------------------------------------------------- 3749 | Returns the result of subtracting the double-precision floating-point values 3750 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3751 | for Binary Floating-Point Arithmetic. 
3752 *----------------------------------------------------------------------------*/ 3753 3754 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) 3755 { 3756 flag aSign, bSign; 3757 a = float64_squash_input_denormal(a STATUS_VAR); 3758 b = float64_squash_input_denormal(b STATUS_VAR); 3759 3760 aSign = extractFloat64Sign( a ); 3761 bSign = extractFloat64Sign( b ); 3762 if ( aSign == bSign ) { 3763 return subFloat64Sigs( a, b, aSign STATUS_VAR ); 3764 } 3765 else { 3766 return addFloat64Sigs( a, b, aSign STATUS_VAR ); 3767 } 3768 3769 } 3770 3771 /*---------------------------------------------------------------------------- 3772 | Returns the result of multiplying the double-precision floating-point values 3773 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3774 | for Binary Floating-Point Arithmetic. 3775 *----------------------------------------------------------------------------*/ 3776 3777 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) 3778 { 3779 flag aSign, bSign, zSign; 3780 int_fast16_t aExp, bExp, zExp; 3781 uint64_t aSig, bSig, zSig0, zSig1; 3782 3783 a = float64_squash_input_denormal(a STATUS_VAR); 3784 b = float64_squash_input_denormal(b STATUS_VAR); 3785 3786 aSig = extractFloat64Frac( a ); 3787 aExp = extractFloat64Exp( a ); 3788 aSign = extractFloat64Sign( a ); 3789 bSig = extractFloat64Frac( b ); 3790 bExp = extractFloat64Exp( b ); 3791 bSign = extractFloat64Sign( b ); 3792 zSign = aSign ^ bSign; 3793 if ( aExp == 0x7FF ) { 3794 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3795 return propagateFloat64NaN( a, b STATUS_VAR ); 3796 } 3797 if ( ( bExp | bSig ) == 0 ) { 3798 float_raise( float_flag_invalid STATUS_VAR); 3799 return float64_default_nan; 3800 } 3801 return packFloat64( zSign, 0x7FF, 0 ); 3802 } 3803 if ( bExp == 0x7FF ) { 3804 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3805 if ( ( aExp | aSig ) == 0 ) { 3806 float_raise( float_flag_invalid STATUS_VAR); 3807 return 
float64_default_nan; 3808 } 3809 return packFloat64( zSign, 0x7FF, 0 ); 3810 } 3811 if ( aExp == 0 ) { 3812 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3813 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3814 } 3815 if ( bExp == 0 ) { 3816 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 3817 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3818 } 3819 zExp = aExp + bExp - 0x3FF; 3820 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3821 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3822 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 3823 zSig0 |= ( zSig1 != 0 ); 3824 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 3825 zSig0 <<= 1; 3826 --zExp; 3827 } 3828 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); 3829 3830 } 3831 3832 /*---------------------------------------------------------------------------- 3833 | Returns the result of dividing the double-precision floating-point value `a' 3834 | by the corresponding value `b'. The operation is performed according to 3835 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3836 *----------------------------------------------------------------------------*/ 3837 3838 float64 float64_div( float64 a, float64 b STATUS_PARAM ) 3839 { 3840 flag aSign, bSign, zSign; 3841 int_fast16_t aExp, bExp, zExp; 3842 uint64_t aSig, bSig, zSig; 3843 uint64_t rem0, rem1; 3844 uint64_t term0, term1; 3845 a = float64_squash_input_denormal(a STATUS_VAR); 3846 b = float64_squash_input_denormal(b STATUS_VAR); 3847 3848 aSig = extractFloat64Frac( a ); 3849 aExp = extractFloat64Exp( a ); 3850 aSign = extractFloat64Sign( a ); 3851 bSig = extractFloat64Frac( b ); 3852 bExp = extractFloat64Exp( b ); 3853 bSign = extractFloat64Sign( b ); 3854 zSign = aSign ^ bSign; 3855 if ( aExp == 0x7FF ) { 3856 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3857 if ( bExp == 0x7FF ) { 3858 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3859 float_raise( float_flag_invalid STATUS_VAR); 3860 return float64_default_nan; 3861 } 3862 return packFloat64( zSign, 0x7FF, 0 ); 3863 } 3864 if ( bExp == 0x7FF ) { 3865 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3866 return packFloat64( zSign, 0, 0 ); 3867 } 3868 if ( bExp == 0 ) { 3869 if ( bSig == 0 ) { 3870 if ( ( aExp | aSig ) == 0 ) { 3871 float_raise( float_flag_invalid STATUS_VAR); 3872 return float64_default_nan; 3873 } 3874 float_raise( float_flag_divbyzero STATUS_VAR); 3875 return packFloat64( zSign, 0x7FF, 0 ); 3876 } 3877 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3878 } 3879 if ( aExp == 0 ) { 3880 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 3881 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3882 } 3883 zExp = aExp - bExp + 0x3FD; 3884 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 3885 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3886 if ( bSig <= ( aSig + aSig ) ) { 3887 aSig >>= 1; 3888 ++zExp; 3889 } 3890 zSig = estimateDiv128To64( aSig, 0, bSig ); 3891 if ( ( zSig & 0x1FF ) <= 2 ) { 3892 mul64To128( bSig, zSig, &term0, &term1 ); 3893 sub128( aSig, 0, term0, 
term1, &rem0, &rem1 ); 3894 while ( (int64_t) rem0 < 0 ) { 3895 --zSig; 3896 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 3897 } 3898 zSig |= ( rem1 != 0 ); 3899 } 3900 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); 3901 3902 } 3903 3904 /*---------------------------------------------------------------------------- 3905 | Returns the remainder of the double-precision floating-point value `a' 3906 | with respect to the corresponding value `b'. The operation is performed 3907 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3908 *----------------------------------------------------------------------------*/ 3909 3910 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) 3911 { 3912 flag aSign, zSign; 3913 int_fast16_t aExp, bExp, expDiff; 3914 uint64_t aSig, bSig; 3915 uint64_t q, alternateASig; 3916 int64_t sigMean; 3917 3918 a = float64_squash_input_denormal(a STATUS_VAR); 3919 b = float64_squash_input_denormal(b STATUS_VAR); 3920 aSig = extractFloat64Frac( a ); 3921 aExp = extractFloat64Exp( a ); 3922 aSign = extractFloat64Sign( a ); 3923 bSig = extractFloat64Frac( b ); 3924 bExp = extractFloat64Exp( b ); 3925 if ( aExp == 0x7FF ) { 3926 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3927 return propagateFloat64NaN( a, b STATUS_VAR ); 3928 } 3929 float_raise( float_flag_invalid STATUS_VAR); 3930 return float64_default_nan; 3931 } 3932 if ( bExp == 0x7FF ) { 3933 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); 3934 return a; 3935 } 3936 if ( bExp == 0 ) { 3937 if ( bSig == 0 ) { 3938 float_raise( float_flag_invalid STATUS_VAR); 3939 return float64_default_nan; 3940 } 3941 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3942 } 3943 if ( aExp == 0 ) { 3944 if ( aSig == 0 ) return a; 3945 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3946 } 3947 expDiff = aExp - bExp; 3948 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3949 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3950 if ( expDiff < 0 ) { 3951 if 
( expDiff < -1 ) return a; 3952 aSig >>= 1; 3953 } 3954 q = ( bSig <= aSig ); 3955 if ( q ) aSig -= bSig; 3956 expDiff -= 64; 3957 while ( 0 < expDiff ) { 3958 q = estimateDiv128To64( aSig, 0, bSig ); 3959 q = ( 2 < q ) ? q - 2 : 0; 3960 aSig = - ( ( bSig>>2 ) * q ); 3961 expDiff -= 62; 3962 } 3963 expDiff += 64; 3964 if ( 0 < expDiff ) { 3965 q = estimateDiv128To64( aSig, 0, bSig ); 3966 q = ( 2 < q ) ? q - 2 : 0; 3967 q >>= 64 - expDiff; 3968 bSig >>= 2; 3969 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3970 } 3971 else { 3972 aSig >>= 2; 3973 bSig >>= 2; 3974 } 3975 do { 3976 alternateASig = aSig; 3977 ++q; 3978 aSig -= bSig; 3979 } while ( 0 <= (int64_t) aSig ); 3980 sigMean = aSig + alternateASig; 3981 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3982 aSig = alternateASig; 3983 } 3984 zSign = ( (int64_t) aSig < 0 ); 3985 if ( zSign ) aSig = - aSig; 3986 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); 3987 3988 } 3989 3990 /*---------------------------------------------------------------------------- 3991 | Returns the result of multiplying the double-precision floating-point values 3992 | `a' and `b' then adding 'c', with no intermediate rounding step after the 3993 | multiplication. The operation is performed according to the IEC/IEEE 3994 | Standard for Binary Floating-Point Arithmetic 754-2008. 3995 | The flags argument allows the caller to select negation of the 3996 | addend, the intermediate product, or the final result. (The difference 3997 | between this and having the caller do a separate negation is that negating 3998 | externally will flip the sign bit on NaNs.) 
*----------------------------------------------------------------------------*/

float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
{
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    /* pInf/pZero/pSign describe the product a * b before c is added */
    flag pInf, pZero, pSign;
    /* 128-bit significands held as [high:low] pairs of 64-bit words */
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);
    c = float64_squash_input_denormal(c STATUS_VAR);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the product is (zero * infinity) in either operand order */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
    }

    /* No NaN operand, so zero * infinity is an invalid operation */
    if (infzero) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every non-NaN result below */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    /* NaNs were handled above, so cExp == 0x7ff means c is an infinity */
    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    /* Infinite product plus finite c */
    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* a and b are non-zero here, so a zero exponent means a subnormal */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    /* If bit 126 of the product is clear, shift left by one to set it */
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            /* The jamming shift by 64 leaves the high word, with sticky
             * information from the low word, in pSig1.
             */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1 STATUS_VAR);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            /* Sum carried into bit 127: shift it back down */
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            /* No carry: switch zExp to the "one-less-than" convention */
            zExp--;
        }
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
    } else {
        /* Subtraction */
        /* The smaller magnitude is always subtracted from the larger;
         * zSign is flipped when c is the larger one.
         */
        if (expDiff > 0) {
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            /* Leave one zero bit above the leading one in zSig0 */
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                /* Remaining low-word bits become the sticky bit */
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            /* High word is zero: normalize from the low word only */
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                /* Shift right by one, keeping the lost bit as sticky */
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
    }
}

/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
4220 *----------------------------------------------------------------------------*/ 4221 4222 float64 float64_sqrt( float64 a STATUS_PARAM ) 4223 { 4224 flag aSign; 4225 int_fast16_t aExp, zExp; 4226 uint64_t aSig, zSig, doubleZSig; 4227 uint64_t rem0, rem1, term0, term1; 4228 a = float64_squash_input_denormal(a STATUS_VAR); 4229 4230 aSig = extractFloat64Frac( a ); 4231 aExp = extractFloat64Exp( a ); 4232 aSign = extractFloat64Sign( a ); 4233 if ( aExp == 0x7FF ) { 4234 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); 4235 if ( ! aSign ) return a; 4236 float_raise( float_flag_invalid STATUS_VAR); 4237 return float64_default_nan; 4238 } 4239 if ( aSign ) { 4240 if ( ( aExp | aSig ) == 0 ) return a; 4241 float_raise( float_flag_invalid STATUS_VAR); 4242 return float64_default_nan; 4243 } 4244 if ( aExp == 0 ) { 4245 if ( aSig == 0 ) return float64_zero; 4246 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4247 } 4248 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4249 aSig |= LIT64( 0x0010000000000000 ); 4250 zSig = estimateSqrt32( aExp, aSig>>21 ); 4251 aSig <<= 9 - ( aExp & 1 ); 4252 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4253 if ( ( zSig & 0x1FF ) <= 5 ) { 4254 doubleZSig = zSig<<1; 4255 mul64To128( zSig, zSig, &term0, &term1 ); 4256 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4257 while ( (int64_t) rem0 < 0 ) { 4258 --zSig; 4259 doubleZSig -= 2; 4260 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4261 } 4262 zSig |= ( ( rem0 | rem1 ) != 0 ); 4263 } 4264 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); 4265 4266 } 4267 4268 /*---------------------------------------------------------------------------- 4269 | Returns the binary log of the double-precision floating-point value `a'. 4270 | The operation is performed according to the IEC/IEEE Standard for Binary 4271 | Floating-Point Arithmetic. 
4272 *----------------------------------------------------------------------------*/ 4273 float64 float64_log2( float64 a STATUS_PARAM ) 4274 { 4275 flag aSign, zSign; 4276 int_fast16_t aExp; 4277 uint64_t aSig, aSig0, aSig1, zSig, i; 4278 a = float64_squash_input_denormal(a STATUS_VAR); 4279 4280 aSig = extractFloat64Frac( a ); 4281 aExp = extractFloat64Exp( a ); 4282 aSign = extractFloat64Sign( a ); 4283 4284 if ( aExp == 0 ) { 4285 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4286 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4287 } 4288 if ( aSign ) { 4289 float_raise( float_flag_invalid STATUS_VAR); 4290 return float64_default_nan; 4291 } 4292 if ( aExp == 0x7FF ) { 4293 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR ); 4294 return a; 4295 } 4296 4297 aExp -= 0x3FF; 4298 aSig |= LIT64( 0x0010000000000000 ); 4299 zSign = aExp < 0; 4300 zSig = (uint64_t)aExp << 52; 4301 for (i = 1LL << 51; i > 0; i >>= 1) { 4302 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4303 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4304 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4305 aSig >>= 1; 4306 zSig |= i; 4307 } 4308 } 4309 4310 if ( zSign ) 4311 zSig = -zSig; 4312 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR ); 4313 } 4314 4315 /*---------------------------------------------------------------------------- 4316 | Returns 1 if the double-precision floating-point value `a' is equal to the 4317 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4318 | if either operand is a NaN. Otherwise, the comparison is performed 4319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4320 *----------------------------------------------------------------------------*/ 4321 4322 int float64_eq( float64 a, float64 b STATUS_PARAM ) 4323 { 4324 uint64_t av, bv; 4325 a = float64_squash_input_denormal(a STATUS_VAR); 4326 b = float64_squash_input_denormal(b STATUS_VAR); 4327 4328 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4329 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4330 ) { 4331 float_raise( float_flag_invalid STATUS_VAR); 4332 return 0; 4333 } 4334 av = float64_val(a); 4335 bv = float64_val(b); 4336 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4337 4338 } 4339 4340 /*---------------------------------------------------------------------------- 4341 | Returns 1 if the double-precision floating-point value `a' is less than or 4342 | equal to the corresponding value `b', and 0 otherwise. The invalid 4343 | exception is raised if either operand is a NaN. The comparison is performed 4344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4345 *----------------------------------------------------------------------------*/ 4346 4347 int float64_le( float64 a, float64 b STATUS_PARAM ) 4348 { 4349 flag aSign, bSign; 4350 uint64_t av, bv; 4351 a = float64_squash_input_denormal(a STATUS_VAR); 4352 b = float64_squash_input_denormal(b STATUS_VAR); 4353 4354 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4355 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4356 ) { 4357 float_raise( float_flag_invalid STATUS_VAR); 4358 return 0; 4359 } 4360 aSign = extractFloat64Sign( a ); 4361 bSign = extractFloat64Sign( b ); 4362 av = float64_val(a); 4363 bv = float64_val(b); 4364 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4365 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4366 4367 } 4368 4369 /*---------------------------------------------------------------------------- 4370 | Returns 1 if the double-precision floating-point value `a' is less than 4371 | the corresponding value `b', and 0 otherwise. The invalid exception is 4372 | raised if either operand is a NaN. The comparison is performed according 4373 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4374 *----------------------------------------------------------------------------*/ 4375 4376 int float64_lt( float64 a, float64 b STATUS_PARAM ) 4377 { 4378 flag aSign, bSign; 4379 uint64_t av, bv; 4380 4381 a = float64_squash_input_denormal(a STATUS_VAR); 4382 b = float64_squash_input_denormal(b STATUS_VAR); 4383 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4384 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4385 ) { 4386 float_raise( float_flag_invalid STATUS_VAR); 4387 return 0; 4388 } 4389 aSign = extractFloat64Sign( a ); 4390 bSign = extractFloat64Sign( b ); 4391 av = float64_val(a); 4392 bv = float64_val(b); 4393 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4394 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4395 4396 } 4397 4398 /*---------------------------------------------------------------------------- 4399 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4400 | be compared, and 0 otherwise. The invalid exception is raised if either 4401 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4402 | Standard for Binary Floating-Point Arithmetic. 4403 *----------------------------------------------------------------------------*/ 4404 4405 int float64_unordered( float64 a, float64 b STATUS_PARAM ) 4406 { 4407 a = float64_squash_input_denormal(a STATUS_VAR); 4408 b = float64_squash_input_denormal(b STATUS_VAR); 4409 4410 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4411 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4412 ) { 4413 float_raise( float_flag_invalid STATUS_VAR); 4414 return 1; 4415 } 4416 return 0; 4417 } 4418 4419 /*---------------------------------------------------------------------------- 4420 | Returns 1 if the double-precision floating-point value `a' is equal to the 4421 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an
| exception. The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float64_eq_quiet(float64 a, float64 b STATUS_PARAM)
{
    uint64_t aBits, bBits;
    int aIsNaN, bIsNaN;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    aIsNaN = (extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a);
    bIsNaN = (extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b);
    if (aIsNaN || bIsNaN) {
        /* Only a signaling NaN raises invalid in the quiet variant */
        if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b)) {
            float_raise(float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    aBits = float64_val(a);
    bBits = float64_val(b);
    if (aBits == bBits) {
        return 1;
    }
    /* Different encodings are equal only when both are zeros (+0 / -0) */
    return (uint64_t)((aBits | bBits) << 1) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
| cause an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4451 *----------------------------------------------------------------------------*/ 4452 4453 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) 4454 { 4455 flag aSign, bSign; 4456 uint64_t av, bv; 4457 a = float64_squash_input_denormal(a STATUS_VAR); 4458 b = float64_squash_input_denormal(b STATUS_VAR); 4459 4460 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4461 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4462 ) { 4463 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4464 float_raise( float_flag_invalid STATUS_VAR); 4465 } 4466 return 0; 4467 } 4468 aSign = extractFloat64Sign( a ); 4469 bSign = extractFloat64Sign( b ); 4470 av = float64_val(a); 4471 bv = float64_val(b); 4472 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4473 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4474 4475 } 4476 4477 /*---------------------------------------------------------------------------- 4478 | Returns 1 if the double-precision floating-point value `a' is less than 4479 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4480 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4481 | Standard for Binary Floating-Point Arithmetic. 
4482 *----------------------------------------------------------------------------*/ 4483 4484 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) 4485 { 4486 flag aSign, bSign; 4487 uint64_t av, bv; 4488 a = float64_squash_input_denormal(a STATUS_VAR); 4489 b = float64_squash_input_denormal(b STATUS_VAR); 4490 4491 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4492 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4493 ) { 4494 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4495 float_raise( float_flag_invalid STATUS_VAR); 4496 } 4497 return 0; 4498 } 4499 aSign = extractFloat64Sign( a ); 4500 bSign = extractFloat64Sign( b ); 4501 av = float64_val(a); 4502 bv = float64_val(b); 4503 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4504 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4505 4506 } 4507 4508 /*---------------------------------------------------------------------------- 4509 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4510 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4511 | comparison is performed according to the IEC/IEEE Standard for Binary 4512 | Floating-Point Arithmetic. 
4513 *----------------------------------------------------------------------------*/ 4514 4515 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM ) 4516 { 4517 a = float64_squash_input_denormal(a STATUS_VAR); 4518 b = float64_squash_input_denormal(b STATUS_VAR); 4519 4520 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4521 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4522 ) { 4523 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4524 float_raise( float_flag_invalid STATUS_VAR); 4525 } 4526 return 1; 4527 } 4528 return 0; 4529 } 4530 4531 /*---------------------------------------------------------------------------- 4532 | Returns the result of converting the extended double-precision floating- 4533 | point value `a' to the 32-bit two's complement integer format. The 4534 | conversion is performed according to the IEC/IEEE Standard for Binary 4535 | Floating-Point Arithmetic---which means in particular that the conversion 4536 | is rounded according to the current rounding mode. If `a' is a NaN, the 4537 | largest positive integer is returned. Otherwise, if the conversion 4538 | overflows, the largest integer with the same sign as `a' is returned. 
4539 *----------------------------------------------------------------------------*/ 4540 4541 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) 4542 { 4543 flag aSign; 4544 int32 aExp, shiftCount; 4545 uint64_t aSig; 4546 4547 aSig = extractFloatx80Frac( a ); 4548 aExp = extractFloatx80Exp( a ); 4549 aSign = extractFloatx80Sign( a ); 4550 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4551 shiftCount = 0x4037 - aExp; 4552 if ( shiftCount <= 0 ) shiftCount = 1; 4553 shift64RightJamming( aSig, shiftCount, &aSig ); 4554 return roundAndPackInt32( aSign, aSig STATUS_VAR ); 4555 4556 } 4557 4558 /*---------------------------------------------------------------------------- 4559 | Returns the result of converting the extended double-precision floating- 4560 | point value `a' to the 32-bit two's complement integer format. The 4561 | conversion is performed according to the IEC/IEEE Standard for Binary 4562 | Floating-Point Arithmetic, except that the conversion is always rounded 4563 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4564 | Otherwise, if the conversion overflows, the largest integer with the same 4565 | sign as `a' is returned. 
4566 *----------------------------------------------------------------------------*/ 4567 4568 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) 4569 { 4570 flag aSign; 4571 int32 aExp, shiftCount; 4572 uint64_t aSig, savedASig; 4573 int32_t z; 4574 4575 aSig = extractFloatx80Frac( a ); 4576 aExp = extractFloatx80Exp( a ); 4577 aSign = extractFloatx80Sign( a ); 4578 if ( 0x401E < aExp ) { 4579 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4580 goto invalid; 4581 } 4582 else if ( aExp < 0x3FFF ) { 4583 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 4584 return 0; 4585 } 4586 shiftCount = 0x403E - aExp; 4587 savedASig = aSig; 4588 aSig >>= shiftCount; 4589 z = aSig; 4590 if ( aSign ) z = - z; 4591 if ( ( z < 0 ) ^ aSign ) { 4592 invalid: 4593 float_raise( float_flag_invalid STATUS_VAR); 4594 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4595 } 4596 if ( ( aSig<<shiftCount ) != savedASig ) { 4597 STATUS(float_exception_flags) |= float_flag_inexact; 4598 } 4599 return z; 4600 4601 } 4602 4603 /*---------------------------------------------------------------------------- 4604 | Returns the result of converting the extended double-precision floating- 4605 | point value `a' to the 64-bit two's complement integer format. The 4606 | conversion is performed according to the IEC/IEEE Standard for Binary 4607 | Floating-Point Arithmetic---which means in particular that the conversion 4608 | is rounded according to the current rounding mode. If `a' is a NaN, 4609 | the largest positive integer is returned. Otherwise, if the conversion 4610 | overflows, the largest integer with the same sign as `a' is returned. 
4611 *----------------------------------------------------------------------------*/ 4612 4613 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) 4614 { 4615 flag aSign; 4616 int32 aExp, shiftCount; 4617 uint64_t aSig, aSigExtra; 4618 4619 aSig = extractFloatx80Frac( a ); 4620 aExp = extractFloatx80Exp( a ); 4621 aSign = extractFloatx80Sign( a ); 4622 shiftCount = 0x403E - aExp; 4623 if ( shiftCount <= 0 ) { 4624 if ( shiftCount ) { 4625 float_raise( float_flag_invalid STATUS_VAR); 4626 if ( ! aSign 4627 || ( ( aExp == 0x7FFF ) 4628 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4629 ) { 4630 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4631 } 4632 return (int64_t) LIT64( 0x8000000000000000 ); 4633 } 4634 aSigExtra = 0; 4635 } 4636 else { 4637 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4638 } 4639 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); 4640 4641 } 4642 4643 /*---------------------------------------------------------------------------- 4644 | Returns the result of converting the extended double-precision floating- 4645 | point value `a' to the 64-bit two's complement integer format. The 4646 | conversion is performed according to the IEC/IEEE Standard for Binary 4647 | Floating-Point Arithmetic, except that the conversion is always rounded 4648 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4649 | Otherwise, if the conversion overflows, the largest integer with the same 4650 | sign as `a' is returned. 
4651 *----------------------------------------------------------------------------*/ 4652 4653 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) 4654 { 4655 flag aSign; 4656 int32 aExp, shiftCount; 4657 uint64_t aSig; 4658 int64 z; 4659 4660 aSig = extractFloatx80Frac( a ); 4661 aExp = extractFloatx80Exp( a ); 4662 aSign = extractFloatx80Sign( a ); 4663 shiftCount = aExp - 0x403E; 4664 if ( 0 <= shiftCount ) { 4665 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4666 if ( ( a.high != 0xC03E ) || aSig ) { 4667 float_raise( float_flag_invalid STATUS_VAR); 4668 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4669 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4670 } 4671 } 4672 return (int64_t) LIT64( 0x8000000000000000 ); 4673 } 4674 else if ( aExp < 0x3FFF ) { 4675 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; 4676 return 0; 4677 } 4678 z = aSig>>( - shiftCount ); 4679 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4680 STATUS(float_exception_flags) |= float_flag_inexact; 4681 } 4682 if ( aSign ) z = - z; 4683 return z; 4684 4685 } 4686 4687 /*---------------------------------------------------------------------------- 4688 | Returns the result of converting the extended double-precision floating- 4689 | point value `a' to the single-precision floating-point format. The 4690 | conversion is performed according to the IEC/IEEE Standard for Binary 4691 | Floating-Point Arithmetic. 
4692 *----------------------------------------------------------------------------*/ 4693 4694 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) 4695 { 4696 flag aSign; 4697 int32 aExp; 4698 uint64_t aSig; 4699 4700 aSig = extractFloatx80Frac( a ); 4701 aExp = extractFloatx80Exp( a ); 4702 aSign = extractFloatx80Sign( a ); 4703 if ( aExp == 0x7FFF ) { 4704 if ( (uint64_t) ( aSig<<1 ) ) { 4705 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4706 } 4707 return packFloat32( aSign, 0xFF, 0 ); 4708 } 4709 shift64RightJamming( aSig, 33, &aSig ); 4710 if ( aExp || aSig ) aExp -= 0x3F81; 4711 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 4712 4713 } 4714 4715 /*---------------------------------------------------------------------------- 4716 | Returns the result of converting the extended double-precision floating- 4717 | point value `a' to the double-precision floating-point format. The 4718 | conversion is performed according to the IEC/IEEE Standard for Binary 4719 | Floating-Point Arithmetic. 4720 *----------------------------------------------------------------------------*/ 4721 4722 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) 4723 { 4724 flag aSign; 4725 int32 aExp; 4726 uint64_t aSig, zSig; 4727 4728 aSig = extractFloatx80Frac( a ); 4729 aExp = extractFloatx80Exp( a ); 4730 aSign = extractFloatx80Sign( a ); 4731 if ( aExp == 0x7FFF ) { 4732 if ( (uint64_t) ( aSig<<1 ) ) { 4733 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4734 } 4735 return packFloat64( aSign, 0x7FF, 0 ); 4736 } 4737 shift64RightJamming( aSig, 1, &zSig ); 4738 if ( aExp || aSig ) aExp -= 0x3C01; 4739 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); 4740 4741 } 4742 4743 /*---------------------------------------------------------------------------- 4744 | Returns the result of converting the extended double-precision floating- 4745 | point value `a' to the quadruple-precision floating-point format. 
The 4746 | conversion is performed according to the IEC/IEEE Standard for Binary 4747 | Floating-Point Arithmetic. 4748 *----------------------------------------------------------------------------*/ 4749 4750 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) 4751 { 4752 flag aSign; 4753 int_fast16_t aExp; 4754 uint64_t aSig, zSig0, zSig1; 4755 4756 aSig = extractFloatx80Frac( a ); 4757 aExp = extractFloatx80Exp( a ); 4758 aSign = extractFloatx80Sign( a ); 4759 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4760 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 4761 } 4762 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4763 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4764 4765 } 4766 4767 /*---------------------------------------------------------------------------- 4768 | Rounds the extended double-precision floating-point value `a' to an integer, 4769 | and returns the result as an extended quadruple-precision floating-point 4770 | value. The operation is performed according to the IEC/IEEE Standard for 4771 | Binary Floating-Point Arithmetic. 
4772 *----------------------------------------------------------------------------*/ 4773 4774 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) 4775 { 4776 flag aSign; 4777 int32 aExp; 4778 uint64_t lastBitMask, roundBitsMask; 4779 floatx80 z; 4780 4781 aExp = extractFloatx80Exp( a ); 4782 if ( 0x403E <= aExp ) { 4783 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4784 return propagateFloatx80NaN( a, a STATUS_VAR ); 4785 } 4786 return a; 4787 } 4788 if ( aExp < 0x3FFF ) { 4789 if ( ( aExp == 0 ) 4790 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4791 return a; 4792 } 4793 STATUS(float_exception_flags) |= float_flag_inexact; 4794 aSign = extractFloatx80Sign( a ); 4795 switch ( STATUS(float_rounding_mode) ) { 4796 case float_round_nearest_even: 4797 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4798 ) { 4799 return 4800 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4801 } 4802 break; 4803 case float_round_ties_away: 4804 if (aExp == 0x3FFE) { 4805 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4806 } 4807 break; 4808 case float_round_down: 4809 return 4810 aSign ? 4811 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4812 : packFloatx80( 0, 0, 0 ); 4813 case float_round_up: 4814 return 4815 aSign ? 
packFloatx80( 1, 0, 0 ) 4816 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4817 } 4818 return packFloatx80( aSign, 0, 0 ); 4819 } 4820 lastBitMask = 1; 4821 lastBitMask <<= 0x403E - aExp; 4822 roundBitsMask = lastBitMask - 1; 4823 z = a; 4824 switch (STATUS(float_rounding_mode)) { 4825 case float_round_nearest_even: 4826 z.low += lastBitMask>>1; 4827 if ((z.low & roundBitsMask) == 0) { 4828 z.low &= ~lastBitMask; 4829 } 4830 break; 4831 case float_round_ties_away: 4832 z.low += lastBitMask >> 1; 4833 break; 4834 case float_round_to_zero: 4835 break; 4836 case float_round_up: 4837 if (!extractFloatx80Sign(z)) { 4838 z.low += roundBitsMask; 4839 } 4840 break; 4841 case float_round_down: 4842 if (extractFloatx80Sign(z)) { 4843 z.low += roundBitsMask; 4844 } 4845 break; 4846 default: 4847 abort(); 4848 } 4849 z.low &= ~ roundBitsMask; 4850 if ( z.low == 0 ) { 4851 ++z.high; 4852 z.low = LIT64( 0x8000000000000000 ); 4853 } 4854 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; 4855 return z; 4856 4857 } 4858 4859 /*---------------------------------------------------------------------------- 4860 | Returns the result of adding the absolute values of the extended double- 4861 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4862 | negated before being returned. `zSign' is ignored if the result is a NaN. 4863 | The addition is performed according to the IEC/IEEE Standard for Binary 4864 | Floating-Point Arithmetic. 
4865 *----------------------------------------------------------------------------*/ 4866 4867 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) 4868 { 4869 int32 aExp, bExp, zExp; 4870 uint64_t aSig, bSig, zSig0, zSig1; 4871 int32 expDiff; 4872 4873 aSig = extractFloatx80Frac( a ); 4874 aExp = extractFloatx80Exp( a ); 4875 bSig = extractFloatx80Frac( b ); 4876 bExp = extractFloatx80Exp( b ); 4877 expDiff = aExp - bExp; 4878 if ( 0 < expDiff ) { 4879 if ( aExp == 0x7FFF ) { 4880 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4881 return a; 4882 } 4883 if ( bExp == 0 ) --expDiff; 4884 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4885 zExp = aExp; 4886 } 4887 else if ( expDiff < 0 ) { 4888 if ( bExp == 0x7FFF ) { 4889 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4890 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4891 } 4892 if ( aExp == 0 ) ++expDiff; 4893 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4894 zExp = bExp; 4895 } 4896 else { 4897 if ( aExp == 0x7FFF ) { 4898 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4899 return propagateFloatx80NaN( a, b STATUS_VAR ); 4900 } 4901 return a; 4902 } 4903 zSig1 = 0; 4904 zSig0 = aSig + bSig; 4905 if ( aExp == 0 ) { 4906 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4907 goto roundAndPack; 4908 } 4909 zExp = aExp; 4910 goto shiftRight1; 4911 } 4912 zSig0 = aSig + bSig; 4913 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4914 shiftRight1: 4915 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4916 zSig0 |= LIT64( 0x8000000000000000 ); 4917 ++zExp; 4918 roundAndPack: 4919 return 4920 roundAndPackFloatx80( 4921 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4922 4923 } 4924 4925 /*---------------------------------------------------------------------------- 4926 | Returns the result of subtracting the absolute values of the extended 
4927 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4928 | difference is negated before being returned. `zSign' is ignored if the 4929 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4930 | Standard for Binary Floating-Point Arithmetic. 4931 *----------------------------------------------------------------------------*/ 4932 4933 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) 4934 { 4935 int32 aExp, bExp, zExp; 4936 uint64_t aSig, bSig, zSig0, zSig1; 4937 int32 expDiff; 4938 floatx80 z; 4939 4940 aSig = extractFloatx80Frac( a ); 4941 aExp = extractFloatx80Exp( a ); 4942 bSig = extractFloatx80Frac( b ); 4943 bExp = extractFloatx80Exp( b ); 4944 expDiff = aExp - bExp; 4945 if ( 0 < expDiff ) goto aExpBigger; 4946 if ( expDiff < 0 ) goto bExpBigger; 4947 if ( aExp == 0x7FFF ) { 4948 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4949 return propagateFloatx80NaN( a, b STATUS_VAR ); 4950 } 4951 float_raise( float_flag_invalid STATUS_VAR); 4952 z.low = floatx80_default_nan_low; 4953 z.high = floatx80_default_nan_high; 4954 return z; 4955 } 4956 if ( aExp == 0 ) { 4957 aExp = 1; 4958 bExp = 1; 4959 } 4960 zSig1 = 0; 4961 if ( bSig < aSig ) goto aBigger; 4962 if ( aSig < bSig ) goto bBigger; 4963 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); 4964 bExpBigger: 4965 if ( bExp == 0x7FFF ) { 4966 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4967 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4968 } 4969 if ( aExp == 0 ) ++expDiff; 4970 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4971 bBigger: 4972 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4973 zExp = bExp; 4974 zSign ^= 1; 4975 goto normalizeRoundAndPack; 4976 aExpBigger: 4977 if ( aExp == 0x7FFF ) { 4978 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 4979 return a; 4980 } 4981 if ( bExp == 0 ) --expDiff; 
4982 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4983 aBigger: 4984 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4985 zExp = aExp; 4986 normalizeRoundAndPack: 4987 return 4988 normalizeRoundAndPackFloatx80( 4989 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 4990 4991 } 4992 4993 /*---------------------------------------------------------------------------- 4994 | Returns the result of adding the extended double-precision floating-point 4995 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4996 | Standard for Binary Floating-Point Arithmetic. 4997 *----------------------------------------------------------------------------*/ 4998 4999 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) 5000 { 5001 flag aSign, bSign; 5002 5003 aSign = extractFloatx80Sign( a ); 5004 bSign = extractFloatx80Sign( b ); 5005 if ( aSign == bSign ) { 5006 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 5007 } 5008 else { 5009 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 5010 } 5011 5012 } 5013 5014 /*---------------------------------------------------------------------------- 5015 | Returns the result of subtracting the extended double-precision floating- 5016 | point values `a' and `b'. The operation is performed according to the 5017 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5018 *----------------------------------------------------------------------------*/ 5019 5020 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) 5021 { 5022 flag aSign, bSign; 5023 5024 aSign = extractFloatx80Sign( a ); 5025 bSign = extractFloatx80Sign( b ); 5026 if ( aSign == bSign ) { 5027 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); 5028 } 5029 else { 5030 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); 5031 } 5032 5033 } 5034 5035 /*---------------------------------------------------------------------------- 5036 | Returns the result of multiplying the extended double-precision floating- 5037 | point values `a' and `b'. The operation is performed according to the 5038 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5039 *----------------------------------------------------------------------------*/ 5040 5041 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) 5042 { 5043 flag aSign, bSign, zSign; 5044 int32 aExp, bExp, zExp; 5045 uint64_t aSig, bSig, zSig0, zSig1; 5046 floatx80 z; 5047 5048 aSig = extractFloatx80Frac( a ); 5049 aExp = extractFloatx80Exp( a ); 5050 aSign = extractFloatx80Sign( a ); 5051 bSig = extractFloatx80Frac( b ); 5052 bExp = extractFloatx80Exp( b ); 5053 bSign = extractFloatx80Sign( b ); 5054 zSign = aSign ^ bSign; 5055 if ( aExp == 0x7FFF ) { 5056 if ( (uint64_t) ( aSig<<1 ) 5057 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5058 return propagateFloatx80NaN( a, b STATUS_VAR ); 5059 } 5060 if ( ( bExp | bSig ) == 0 ) goto invalid; 5061 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5062 } 5063 if ( bExp == 0x7FFF ) { 5064 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 5065 if ( ( aExp | aSig ) == 0 ) { 5066 invalid: 5067 float_raise( float_flag_invalid STATUS_VAR); 5068 z.low = floatx80_default_nan_low; 5069 z.high = floatx80_default_nan_high; 5070 return z; 5071 } 5072 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5073 } 
5074 if ( aExp == 0 ) { 5075 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5076 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5077 } 5078 if ( bExp == 0 ) { 5079 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5080 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5081 } 5082 zExp = aExp + bExp - 0x3FFE; 5083 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5084 if ( 0 < (int64_t) zSig0 ) { 5085 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5086 --zExp; 5087 } 5088 return 5089 roundAndPackFloatx80( 5090 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 5091 5092 } 5093 5094 /*---------------------------------------------------------------------------- 5095 | Returns the result of dividing the extended double-precision floating-point 5096 | value `a' by the corresponding value `b'. The operation is performed 5097 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5098 *----------------------------------------------------------------------------*/ 5099 5100 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) 5101 { 5102 flag aSign, bSign, zSign; 5103 int32 aExp, bExp, zExp; 5104 uint64_t aSig, bSig, zSig0, zSig1; 5105 uint64_t rem0, rem1, rem2, term0, term1, term2; 5106 floatx80 z; 5107 5108 aSig = extractFloatx80Frac( a ); 5109 aExp = extractFloatx80Exp( a ); 5110 aSign = extractFloatx80Sign( a ); 5111 bSig = extractFloatx80Frac( b ); 5112 bExp = extractFloatx80Exp( b ); 5113 bSign = extractFloatx80Sign( b ); 5114 zSign = aSign ^ bSign; 5115 if ( aExp == 0x7FFF ) { 5116 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 5117 if ( bExp == 0x7FFF ) { 5118 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 5119 goto invalid; 5120 } 5121 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5122 } 5123 if ( bExp == 0x7FFF ) { 5124 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 5125 return 
packFloatx80( zSign, 0, 0 ); 5126 } 5127 if ( bExp == 0 ) { 5128 if ( bSig == 0 ) { 5129 if ( ( aExp | aSig ) == 0 ) { 5130 invalid: 5131 float_raise( float_flag_invalid STATUS_VAR); 5132 z.low = floatx80_default_nan_low; 5133 z.high = floatx80_default_nan_high; 5134 return z; 5135 } 5136 float_raise( float_flag_divbyzero STATUS_VAR); 5137 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5138 } 5139 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5140 } 5141 if ( aExp == 0 ) { 5142 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5143 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5144 } 5145 zExp = aExp - bExp + 0x3FFE; 5146 rem1 = 0; 5147 if ( bSig <= aSig ) { 5148 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5149 ++zExp; 5150 } 5151 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5152 mul64To128( bSig, zSig0, &term0, &term1 ); 5153 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5154 while ( (int64_t) rem0 < 0 ) { 5155 --zSig0; 5156 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5157 } 5158 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5159 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5160 mul64To128( bSig, zSig1, &term1, &term2 ); 5161 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5162 while ( (int64_t) rem1 < 0 ) { 5163 --zSig1; 5164 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5165 } 5166 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5167 } 5168 return 5169 roundAndPackFloatx80( 5170 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); 5171 5172 } 5173 5174 /*---------------------------------------------------------------------------- 5175 | Returns the remainder of the extended double-precision floating-point value 5176 | `a' with respect to the corresponding value `b'. The operation is performed 5177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5178 *----------------------------------------------------------------------------*/ 5179 5180 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) 5181 { 5182 flag aSign, zSign; 5183 int32 aExp, bExp, expDiff; 5184 uint64_t aSig0, aSig1, bSig; 5185 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5186 floatx80 z; 5187 5188 aSig0 = extractFloatx80Frac( a ); 5189 aExp = extractFloatx80Exp( a ); 5190 aSign = extractFloatx80Sign( a ); 5191 bSig = extractFloatx80Frac( b ); 5192 bExp = extractFloatx80Exp( b ); 5193 if ( aExp == 0x7FFF ) { 5194 if ( (uint64_t) ( aSig0<<1 ) 5195 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5196 return propagateFloatx80NaN( a, b STATUS_VAR ); 5197 } 5198 goto invalid; 5199 } 5200 if ( bExp == 0x7FFF ) { 5201 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); 5202 return a; 5203 } 5204 if ( bExp == 0 ) { 5205 if ( bSig == 0 ) { 5206 invalid: 5207 float_raise( float_flag_invalid STATUS_VAR); 5208 z.low = floatx80_default_nan_low; 5209 z.high = floatx80_default_nan_high; 5210 return z; 5211 } 5212 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5213 } 5214 if ( aExp == 0 ) { 5215 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5216 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5217 } 5218 bSig |= LIT64( 0x8000000000000000 ); 5219 zSign = aSign; 5220 expDiff = aExp - bExp; 5221 aSig1 = 0; 5222 if ( expDiff < 0 ) { 5223 if ( expDiff < -1 ) return a; 5224 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5225 expDiff = 0; 5226 } 5227 q = ( bSig <= aSig0 ); 5228 if ( q ) aSig0 -= bSig; 5229 expDiff -= 64; 5230 while ( 0 < expDiff ) { 5231 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5232 q = ( 2 < q ) ? 
q - 2 : 0; 5233 mul64To128( bSig, q, &term0, &term1 ); 5234 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5235 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5236 expDiff -= 62; 5237 } 5238 expDiff += 64; 5239 if ( 0 < expDiff ) { 5240 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5241 q = ( 2 < q ) ? q - 2 : 0; 5242 q >>= 64 - expDiff; 5243 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5244 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5245 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5246 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5247 ++q; 5248 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5249 } 5250 } 5251 else { 5252 term1 = 0; 5253 term0 = bSig; 5254 } 5255 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5256 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5257 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5258 && ( q & 1 ) ) 5259 ) { 5260 aSig0 = alternateASig0; 5261 aSig1 = alternateASig1; 5262 zSign = ! zSign; 5263 } 5264 return 5265 normalizeRoundAndPackFloatx80( 5266 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); 5267 5268 } 5269 5270 /*---------------------------------------------------------------------------- 5271 | Returns the square root of the extended double-precision floating-point 5272 | value `a'. The operation is performed according to the IEC/IEEE Standard 5273 | for Binary Floating-Point Arithmetic. 
5274 *----------------------------------------------------------------------------*/ 5275 5276 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) 5277 { 5278 flag aSign; 5279 int32 aExp, zExp; 5280 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5281 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5282 floatx80 z; 5283 5284 aSig0 = extractFloatx80Frac( a ); 5285 aExp = extractFloatx80Exp( a ); 5286 aSign = extractFloatx80Sign( a ); 5287 if ( aExp == 0x7FFF ) { 5288 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); 5289 if ( ! aSign ) return a; 5290 goto invalid; 5291 } 5292 if ( aSign ) { 5293 if ( ( aExp | aSig0 ) == 0 ) return a; 5294 invalid: 5295 float_raise( float_flag_invalid STATUS_VAR); 5296 z.low = floatx80_default_nan_low; 5297 z.high = floatx80_default_nan_high; 5298 return z; 5299 } 5300 if ( aExp == 0 ) { 5301 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5302 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5303 } 5304 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5305 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5306 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5307 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5308 doubleZSig0 = zSig0<<1; 5309 mul64To128( zSig0, zSig0, &term0, &term1 ); 5310 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5311 while ( (int64_t) rem0 < 0 ) { 5312 --zSig0; 5313 doubleZSig0 -= 2; 5314 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5315 } 5316 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5317 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5318 if ( zSig1 == 0 ) zSig1 = 1; 5319 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5320 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5321 mul64To128( zSig1, zSig1, &term2, &term3 ); 5322 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5323 while ( (int64_t) rem1 < 0 ) { 5324 --zSig1; 5325 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5326 
term3 |= 1; 5327 term2 |= doubleZSig0; 5328 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5329 } 5330 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5331 } 5332 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5333 zSig0 |= doubleZSig0; 5334 return 5335 roundAndPackFloatx80( 5336 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); 5337 5338 } 5339 5340 /*---------------------------------------------------------------------------- 5341 | Returns 1 if the extended double-precision floating-point value `a' is equal 5342 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5343 | raised if either operand is a NaN. Otherwise, the comparison is performed 5344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5345 *----------------------------------------------------------------------------*/ 5346 5347 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) 5348 { 5349 5350 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5351 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5352 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5353 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5354 ) { 5355 float_raise( float_flag_invalid STATUS_VAR); 5356 return 0; 5357 } 5358 return 5359 ( a.low == b.low ) 5360 && ( ( a.high == b.high ) 5361 || ( ( a.low == 0 ) 5362 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5363 ); 5364 5365 } 5366 5367 /*---------------------------------------------------------------------------- 5368 | Returns 1 if the extended double-precision floating-point value `a' is 5369 | less than or equal to the corresponding value `b', and 0 otherwise. The 5370 | invalid exception is raised if either operand is a NaN. The comparison is 5371 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5372 | Arithmetic. 
5373 *----------------------------------------------------------------------------*/ 5374 5375 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) 5376 { 5377 flag aSign, bSign; 5378 5379 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5380 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5381 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5382 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5383 ) { 5384 float_raise( float_flag_invalid STATUS_VAR); 5385 return 0; 5386 } 5387 aSign = extractFloatx80Sign( a ); 5388 bSign = extractFloatx80Sign( b ); 5389 if ( aSign != bSign ) { 5390 return 5391 aSign 5392 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5393 == 0 ); 5394 } 5395 return 5396 aSign ? le128( b.high, b.low, a.high, a.low ) 5397 : le128( a.high, a.low, b.high, b.low ); 5398 5399 } 5400 5401 /*---------------------------------------------------------------------------- 5402 | Returns 1 if the extended double-precision floating-point value `a' is 5403 | less than the corresponding value `b', and 0 otherwise. The invalid 5404 | exception is raised if either operand is a NaN. The comparison is performed 5405 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5406 *----------------------------------------------------------------------------*/ 5407 5408 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) 5409 { 5410 flag aSign, bSign; 5411 5412 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5413 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5414 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5415 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5416 ) { 5417 float_raise( float_flag_invalid STATUS_VAR); 5418 return 0; 5419 } 5420 aSign = extractFloatx80Sign( a ); 5421 bSign = extractFloatx80Sign( b ); 5422 if ( aSign != bSign ) { 5423 return 5424 aSign 5425 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5426 != 0 ); 5427 } 5428 return 5429 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 5430 : lt128( a.high, a.low, b.high, b.low ); 5431 5432 } 5433 5434 /*---------------------------------------------------------------------------- 5435 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5436 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5437 | either operand is a NaN. The comparison is performed according to the 5438 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5439 *----------------------------------------------------------------------------*/ 5440 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM ) 5441 { 5442 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5443 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5444 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5445 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5446 ) { 5447 float_raise( float_flag_invalid STATUS_VAR); 5448 return 1; 5449 } 5450 return 0; 5451 } 5452 5453 /*---------------------------------------------------------------------------- 5454 | Returns 1 if the extended double-precision floating-point value `a' is 5455 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5456 | cause an exception. The comparison is performed according to the IEC/IEEE 5457 | Standard for Binary Floating-Point Arithmetic. 
5458 *----------------------------------------------------------------------------*/ 5459 5460 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 5461 { 5462 5463 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5464 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5465 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5466 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5467 ) { 5468 if ( floatx80_is_signaling_nan( a ) 5469 || floatx80_is_signaling_nan( b ) ) { 5470 float_raise( float_flag_invalid STATUS_VAR); 5471 } 5472 return 0; 5473 } 5474 return 5475 ( a.low == b.low ) 5476 && ( ( a.high == b.high ) 5477 || ( ( a.low == 0 ) 5478 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5479 ); 5480 5481 } 5482 5483 /*---------------------------------------------------------------------------- 5484 | Returns 1 if the extended double-precision floating-point value `a' is less 5485 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5486 | do not cause an exception. Otherwise, the comparison is performed according 5487 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5488 *----------------------------------------------------------------------------*/ 5489 5490 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 5491 { 5492 flag aSign, bSign; 5493 5494 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5495 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5496 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5497 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5498 ) { 5499 if ( floatx80_is_signaling_nan( a ) 5500 || floatx80_is_signaling_nan( b ) ) { 5501 float_raise( float_flag_invalid STATUS_VAR); 5502 } 5503 return 0; 5504 } 5505 aSign = extractFloatx80Sign( a ); 5506 bSign = extractFloatx80Sign( b ); 5507 if ( aSign != bSign ) { 5508 return 5509 aSign 5510 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5511 == 0 ); 5512 } 5513 return 5514 aSign ? 
le128( b.high, b.low, a.high, a.low ) 5515 : le128( a.high, a.low, b.high, b.low ); 5516 5517 } 5518 5519 /*---------------------------------------------------------------------------- 5520 | Returns 1 if the extended double-precision floating-point value `a' is less 5521 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5522 | an exception. Otherwise, the comparison is performed according to the 5523 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5524 *----------------------------------------------------------------------------*/ 5525 5526 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 5527 { 5528 flag aSign, bSign; 5529 5530 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5531 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5532 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5533 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5534 ) { 5535 if ( floatx80_is_signaling_nan( a ) 5536 || floatx80_is_signaling_nan( b ) ) { 5537 float_raise( float_flag_invalid STATUS_VAR); 5538 } 5539 return 0; 5540 } 5541 aSign = extractFloatx80Sign( a ); 5542 bSign = extractFloatx80Sign( b ); 5543 if ( aSign != bSign ) { 5544 return 5545 aSign 5546 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5547 != 0 ); 5548 } 5549 return 5550 aSign ? lt128( b.high, b.low, a.high, a.low ) 5551 : lt128( a.high, a.low, b.high, b.low ); 5552 5553 } 5554 5555 /*---------------------------------------------------------------------------- 5556 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5557 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5558 | The comparison is performed according to the IEC/IEEE Standard for Binary 5559 | Floating-Point Arithmetic. 
5560 *----------------------------------------------------------------------------*/ 5561 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 5562 { 5563 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5564 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5565 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5566 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5567 ) { 5568 if ( floatx80_is_signaling_nan( a ) 5569 || floatx80_is_signaling_nan( b ) ) { 5570 float_raise( float_flag_invalid STATUS_VAR); 5571 } 5572 return 1; 5573 } 5574 return 0; 5575 } 5576 5577 /*---------------------------------------------------------------------------- 5578 | Returns the result of converting the quadruple-precision floating-point 5579 | value `a' to the 32-bit two's complement integer format. The conversion 5580 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5581 | Arithmetic---which means in particular that the conversion is rounded 5582 | according to the current rounding mode. If `a' is a NaN, the largest 5583 | positive integer is returned. Otherwise, if the conversion overflows, the 5584 | largest integer with the same sign as `a' is returned. 
5585 *----------------------------------------------------------------------------*/ 5586 5587 int32 float128_to_int32( float128 a STATUS_PARAM ) 5588 { 5589 flag aSign; 5590 int32 aExp, shiftCount; 5591 uint64_t aSig0, aSig1; 5592 5593 aSig1 = extractFloat128Frac1( a ); 5594 aSig0 = extractFloat128Frac0( a ); 5595 aExp = extractFloat128Exp( a ); 5596 aSign = extractFloat128Sign( a ); 5597 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5598 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5599 aSig0 |= ( aSig1 != 0 ); 5600 shiftCount = 0x4028 - aExp; 5601 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5602 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); 5603 5604 } 5605 5606 /*---------------------------------------------------------------------------- 5607 | Returns the result of converting the quadruple-precision floating-point 5608 | value `a' to the 32-bit two's complement integer format. The conversion 5609 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5610 | Arithmetic, except that the conversion is always rounded toward zero. If 5611 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5612 | conversion overflows, the largest integer with the same sign as `a' is 5613 | returned. 
5614 *----------------------------------------------------------------------------*/ 5615 5616 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) 5617 { 5618 flag aSign; 5619 int32 aExp, shiftCount; 5620 uint64_t aSig0, aSig1, savedASig; 5621 int32_t z; 5622 5623 aSig1 = extractFloat128Frac1( a ); 5624 aSig0 = extractFloat128Frac0( a ); 5625 aExp = extractFloat128Exp( a ); 5626 aSign = extractFloat128Sign( a ); 5627 aSig0 |= ( aSig1 != 0 ); 5628 if ( 0x401E < aExp ) { 5629 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5630 goto invalid; 5631 } 5632 else if ( aExp < 0x3FFF ) { 5633 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; 5634 return 0; 5635 } 5636 aSig0 |= LIT64( 0x0001000000000000 ); 5637 shiftCount = 0x402F - aExp; 5638 savedASig = aSig0; 5639 aSig0 >>= shiftCount; 5640 z = aSig0; 5641 if ( aSign ) z = - z; 5642 if ( ( z < 0 ) ^ aSign ) { 5643 invalid: 5644 float_raise( float_flag_invalid STATUS_VAR); 5645 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5646 } 5647 if ( ( aSig0<<shiftCount ) != savedASig ) { 5648 STATUS(float_exception_flags) |= float_flag_inexact; 5649 } 5650 return z; 5651 5652 } 5653 5654 /*---------------------------------------------------------------------------- 5655 | Returns the result of converting the quadruple-precision floating-point 5656 | value `a' to the 64-bit two's complement integer format. The conversion 5657 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5658 | Arithmetic---which means in particular that the conversion is rounded 5659 | according to the current rounding mode. If `a' is a NaN, the largest 5660 | positive integer is returned. Otherwise, if the conversion overflows, the 5661 | largest integer with the same sign as `a' is returned. 
5662 *----------------------------------------------------------------------------*/ 5663 5664 int64 float128_to_int64( float128 a STATUS_PARAM ) 5665 { 5666 flag aSign; 5667 int32 aExp, shiftCount; 5668 uint64_t aSig0, aSig1; 5669 5670 aSig1 = extractFloat128Frac1( a ); 5671 aSig0 = extractFloat128Frac0( a ); 5672 aExp = extractFloat128Exp( a ); 5673 aSign = extractFloat128Sign( a ); 5674 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5675 shiftCount = 0x402F - aExp; 5676 if ( shiftCount <= 0 ) { 5677 if ( 0x403E < aExp ) { 5678 float_raise( float_flag_invalid STATUS_VAR); 5679 if ( ! aSign 5680 || ( ( aExp == 0x7FFF ) 5681 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5682 ) 5683 ) { 5684 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5685 } 5686 return (int64_t) LIT64( 0x8000000000000000 ); 5687 } 5688 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5689 } 5690 else { 5691 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5692 } 5693 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); 5694 5695 } 5696 5697 /*---------------------------------------------------------------------------- 5698 | Returns the result of converting the quadruple-precision floating-point 5699 | value `a' to the 64-bit two's complement integer format. The conversion 5700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5701 | Arithmetic, except that the conversion is always rounded toward zero. 5702 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5703 | the conversion overflows, the largest integer with the same sign as `a' is 5704 | returned. 
5705 *----------------------------------------------------------------------------*/ 5706 5707 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) 5708 { 5709 flag aSign; 5710 int32 aExp, shiftCount; 5711 uint64_t aSig0, aSig1; 5712 int64 z; 5713 5714 aSig1 = extractFloat128Frac1( a ); 5715 aSig0 = extractFloat128Frac0( a ); 5716 aExp = extractFloat128Exp( a ); 5717 aSign = extractFloat128Sign( a ); 5718 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5719 shiftCount = aExp - 0x402F; 5720 if ( 0 < shiftCount ) { 5721 if ( 0x403E <= aExp ) { 5722 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5723 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5724 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5725 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; 5726 } 5727 else { 5728 float_raise( float_flag_invalid STATUS_VAR); 5729 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5730 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5731 } 5732 } 5733 return (int64_t) LIT64( 0x8000000000000000 ); 5734 } 5735 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5736 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5737 STATUS(float_exception_flags) |= float_flag_inexact; 5738 } 5739 } 5740 else { 5741 if ( aExp < 0x3FFF ) { 5742 if ( aExp | aSig0 | aSig1 ) { 5743 STATUS(float_exception_flags) |= float_flag_inexact; 5744 } 5745 return 0; 5746 } 5747 z = aSig0>>( - shiftCount ); 5748 if ( aSig1 5749 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5750 STATUS(float_exception_flags) |= float_flag_inexact; 5751 } 5752 } 5753 if ( aSign ) z = - z; 5754 return z; 5755 5756 } 5757 5758 /*---------------------------------------------------------------------------- 5759 | Returns the result of converting the quadruple-precision floating-point 5760 | value `a' to the single-precision floating-point format. The conversion 5761 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5762 | Arithmetic. 
5763 *----------------------------------------------------------------------------*/ 5764 5765 float32 float128_to_float32( float128 a STATUS_PARAM ) 5766 { 5767 flag aSign; 5768 int32 aExp; 5769 uint64_t aSig0, aSig1; 5770 uint32_t zSig; 5771 5772 aSig1 = extractFloat128Frac1( a ); 5773 aSig0 = extractFloat128Frac0( a ); 5774 aExp = extractFloat128Exp( a ); 5775 aSign = extractFloat128Sign( a ); 5776 if ( aExp == 0x7FFF ) { 5777 if ( aSig0 | aSig1 ) { 5778 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5779 } 5780 return packFloat32( aSign, 0xFF, 0 ); 5781 } 5782 aSig0 |= ( aSig1 != 0 ); 5783 shift64RightJamming( aSig0, 18, &aSig0 ); 5784 zSig = aSig0; 5785 if ( aExp || zSig ) { 5786 zSig |= 0x40000000; 5787 aExp -= 0x3F81; 5788 } 5789 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); 5790 5791 } 5792 5793 /*---------------------------------------------------------------------------- 5794 | Returns the result of converting the quadruple-precision floating-point 5795 | value `a' to the double-precision floating-point format. The conversion 5796 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5797 | Arithmetic. 
5798 *----------------------------------------------------------------------------*/ 5799 5800 float64 float128_to_float64( float128 a STATUS_PARAM ) 5801 { 5802 flag aSign; 5803 int32 aExp; 5804 uint64_t aSig0, aSig1; 5805 5806 aSig1 = extractFloat128Frac1( a ); 5807 aSig0 = extractFloat128Frac0( a ); 5808 aExp = extractFloat128Exp( a ); 5809 aSign = extractFloat128Sign( a ); 5810 if ( aExp == 0x7FFF ) { 5811 if ( aSig0 | aSig1 ) { 5812 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5813 } 5814 return packFloat64( aSign, 0x7FF, 0 ); 5815 } 5816 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5817 aSig0 |= ( aSig1 != 0 ); 5818 if ( aExp || aSig0 ) { 5819 aSig0 |= LIT64( 0x4000000000000000 ); 5820 aExp -= 0x3C01; 5821 } 5822 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); 5823 5824 } 5825 5826 /*---------------------------------------------------------------------------- 5827 | Returns the result of converting the quadruple-precision floating-point 5828 | value `a' to the extended double-precision floating-point format. The 5829 | conversion is performed according to the IEC/IEEE Standard for Binary 5830 | Floating-Point Arithmetic. 
5831 *----------------------------------------------------------------------------*/ 5832 5833 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) 5834 { 5835 flag aSign; 5836 int32 aExp; 5837 uint64_t aSig0, aSig1; 5838 5839 aSig1 = extractFloat128Frac1( a ); 5840 aSig0 = extractFloat128Frac0( a ); 5841 aExp = extractFloat128Exp( a ); 5842 aSign = extractFloat128Sign( a ); 5843 if ( aExp == 0x7FFF ) { 5844 if ( aSig0 | aSig1 ) { 5845 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR ); 5846 } 5847 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5848 } 5849 if ( aExp == 0 ) { 5850 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5851 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5852 } 5853 else { 5854 aSig0 |= LIT64( 0x0001000000000000 ); 5855 } 5856 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5857 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); 5858 5859 } 5860 5861 /*---------------------------------------------------------------------------- 5862 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5863 | returns the result as a quadruple-precision floating-point value. The 5864 | operation is performed according to the IEC/IEEE Standard for Binary 5865 | Floating-Point Arithmetic. 
5866 *----------------------------------------------------------------------------*/ 5867 5868 float128 float128_round_to_int( float128 a STATUS_PARAM ) 5869 { 5870 flag aSign; 5871 int32 aExp; 5872 uint64_t lastBitMask, roundBitsMask; 5873 float128 z; 5874 5875 aExp = extractFloat128Exp( a ); 5876 if ( 0x402F <= aExp ) { 5877 if ( 0x406F <= aExp ) { 5878 if ( ( aExp == 0x7FFF ) 5879 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5880 ) { 5881 return propagateFloat128NaN( a, a STATUS_VAR ); 5882 } 5883 return a; 5884 } 5885 lastBitMask = 1; 5886 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5887 roundBitsMask = lastBitMask - 1; 5888 z = a; 5889 switch (STATUS(float_rounding_mode)) { 5890 case float_round_nearest_even: 5891 if ( lastBitMask ) { 5892 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5893 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5894 } 5895 else { 5896 if ( (int64_t) z.low < 0 ) { 5897 ++z.high; 5898 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 5899 } 5900 } 5901 break; 5902 case float_round_ties_away: 5903 if (lastBitMask) { 5904 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 5905 } else { 5906 if ((int64_t) z.low < 0) { 5907 ++z.high; 5908 } 5909 } 5910 break; 5911 case float_round_to_zero: 5912 break; 5913 case float_round_up: 5914 if (!extractFloat128Sign(z)) { 5915 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 5916 } 5917 break; 5918 case float_round_down: 5919 if (extractFloat128Sign(z)) { 5920 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 5921 } 5922 break; 5923 default: 5924 abort(); 5925 } 5926 z.low &= ~ roundBitsMask; 5927 } 5928 else { 5929 if ( aExp < 0x3FFF ) { 5930 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 5931 STATUS(float_exception_flags) |= float_flag_inexact; 5932 aSign = extractFloat128Sign( a ); 5933 switch ( STATUS(float_rounding_mode) ) { 5934 case float_round_nearest_even: 5935 if ( ( aExp == 0x3FFE ) 5936 && ( 
extractFloat128Frac0( a ) 5937 | extractFloat128Frac1( a ) ) 5938 ) { 5939 return packFloat128( aSign, 0x3FFF, 0, 0 ); 5940 } 5941 break; 5942 case float_round_ties_away: 5943 if (aExp == 0x3FFE) { 5944 return packFloat128(aSign, 0x3FFF, 0, 0); 5945 } 5946 break; 5947 case float_round_down: 5948 return 5949 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 5950 : packFloat128( 0, 0, 0, 0 ); 5951 case float_round_up: 5952 return 5953 aSign ? packFloat128( 1, 0, 0, 0 ) 5954 : packFloat128( 0, 0x3FFF, 0, 0 ); 5955 } 5956 return packFloat128( aSign, 0, 0, 0 ); 5957 } 5958 lastBitMask = 1; 5959 lastBitMask <<= 0x402F - aExp; 5960 roundBitsMask = lastBitMask - 1; 5961 z.low = 0; 5962 z.high = a.high; 5963 switch (STATUS(float_rounding_mode)) { 5964 case float_round_nearest_even: 5965 z.high += lastBitMask>>1; 5966 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 5967 z.high &= ~ lastBitMask; 5968 } 5969 break; 5970 case float_round_ties_away: 5971 z.high += lastBitMask>>1; 5972 break; 5973 case float_round_to_zero: 5974 break; 5975 case float_round_up: 5976 if (!extractFloat128Sign(z)) { 5977 z.high |= ( a.low != 0 ); 5978 z.high += roundBitsMask; 5979 } 5980 break; 5981 case float_round_down: 5982 if (extractFloat128Sign(z)) { 5983 z.high |= (a.low != 0); 5984 z.high += roundBitsMask; 5985 } 5986 break; 5987 default: 5988 abort(); 5989 } 5990 z.high &= ~ roundBitsMask; 5991 } 5992 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 5993 STATUS(float_exception_flags) |= float_flag_inexact; 5994 } 5995 return z; 5996 5997 } 5998 5999 /*---------------------------------------------------------------------------- 6000 | Returns the result of adding the absolute values of the quadruple-precision 6001 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6002 | before being returned. `zSign' is ignored if the result is a NaN. 6003 | The addition is performed according to the IEC/IEEE Standard for Binary 6004 | Floating-Point Arithmetic. 
6005 *----------------------------------------------------------------------------*/ 6006 6007 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 6008 { 6009 int32 aExp, bExp, zExp; 6010 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6011 int32 expDiff; 6012 6013 aSig1 = extractFloat128Frac1( a ); 6014 aSig0 = extractFloat128Frac0( a ); 6015 aExp = extractFloat128Exp( a ); 6016 bSig1 = extractFloat128Frac1( b ); 6017 bSig0 = extractFloat128Frac0( b ); 6018 bExp = extractFloat128Exp( b ); 6019 expDiff = aExp - bExp; 6020 if ( 0 < expDiff ) { 6021 if ( aExp == 0x7FFF ) { 6022 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6023 return a; 6024 } 6025 if ( bExp == 0 ) { 6026 --expDiff; 6027 } 6028 else { 6029 bSig0 |= LIT64( 0x0001000000000000 ); 6030 } 6031 shift128ExtraRightJamming( 6032 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6033 zExp = aExp; 6034 } 6035 else if ( expDiff < 0 ) { 6036 if ( bExp == 0x7FFF ) { 6037 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6038 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6039 } 6040 if ( aExp == 0 ) { 6041 ++expDiff; 6042 } 6043 else { 6044 aSig0 |= LIT64( 0x0001000000000000 ); 6045 } 6046 shift128ExtraRightJamming( 6047 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6048 zExp = bExp; 6049 } 6050 else { 6051 if ( aExp == 0x7FFF ) { 6052 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6053 return propagateFloat128NaN( a, b STATUS_VAR ); 6054 } 6055 return a; 6056 } 6057 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6058 if ( aExp == 0 ) { 6059 if (STATUS(flush_to_zero)) { 6060 if (zSig0 | zSig1) { 6061 float_raise(float_flag_output_denormal STATUS_VAR); 6062 } 6063 return packFloat128(zSign, 0, 0, 0); 6064 } 6065 return packFloat128( zSign, 0, zSig0, zSig1 ); 6066 } 6067 zSig2 = 0; 6068 zSig0 |= LIT64( 0x0002000000000000 ); 6069 zExp = aExp; 6070 goto shiftRight1; 6071 } 6072 aSig0 |= LIT64( 0x0001000000000000 ); 6073 add128( 
aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6074 --zExp; 6075 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6076 ++zExp; 6077 shiftRight1: 6078 shift128ExtraRightJamming( 6079 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6080 roundAndPack: 6081 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 6082 6083 } 6084 6085 /*---------------------------------------------------------------------------- 6086 | Returns the result of subtracting the absolute values of the quadruple- 6087 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6088 | difference is negated before being returned. `zSign' is ignored if the 6089 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6090 | Standard for Binary Floating-Point Arithmetic. 6091 *----------------------------------------------------------------------------*/ 6092 6093 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) 6094 { 6095 int32 aExp, bExp, zExp; 6096 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6097 int32 expDiff; 6098 float128 z; 6099 6100 aSig1 = extractFloat128Frac1( a ); 6101 aSig0 = extractFloat128Frac0( a ); 6102 aExp = extractFloat128Exp( a ); 6103 bSig1 = extractFloat128Frac1( b ); 6104 bSig0 = extractFloat128Frac0( b ); 6105 bExp = extractFloat128Exp( b ); 6106 expDiff = aExp - bExp; 6107 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6108 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6109 if ( 0 < expDiff ) goto aExpBigger; 6110 if ( expDiff < 0 ) goto bExpBigger; 6111 if ( aExp == 0x7FFF ) { 6112 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6113 return propagateFloat128NaN( a, b STATUS_VAR ); 6114 } 6115 float_raise( float_flag_invalid STATUS_VAR); 6116 z.low = float128_default_nan_low; 6117 z.high = float128_default_nan_high; 6118 return z; 6119 } 6120 if ( aExp == 0 ) { 6121 aExp = 1; 6122 bExp = 1; 6123 } 6124 if ( bSig0 < aSig0 ) goto aBigger; 6125 if ( aSig0 < bSig0 ) goto 
bBigger; 6126 if ( bSig1 < aSig1 ) goto aBigger; 6127 if ( aSig1 < bSig1 ) goto bBigger; 6128 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); 6129 bExpBigger: 6130 if ( bExp == 0x7FFF ) { 6131 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6132 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6133 } 6134 if ( aExp == 0 ) { 6135 ++expDiff; 6136 } 6137 else { 6138 aSig0 |= LIT64( 0x4000000000000000 ); 6139 } 6140 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6141 bSig0 |= LIT64( 0x4000000000000000 ); 6142 bBigger: 6143 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6144 zExp = bExp; 6145 zSign ^= 1; 6146 goto normalizeRoundAndPack; 6147 aExpBigger: 6148 if ( aExp == 0x7FFF ) { 6149 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6150 return a; 6151 } 6152 if ( bExp == 0 ) { 6153 --expDiff; 6154 } 6155 else { 6156 bSig0 |= LIT64( 0x4000000000000000 ); 6157 } 6158 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6159 aSig0 |= LIT64( 0x4000000000000000 ); 6160 aBigger: 6161 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6162 zExp = aExp; 6163 normalizeRoundAndPack: 6164 --zExp; 6165 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); 6166 6167 } 6168 6169 /*---------------------------------------------------------------------------- 6170 | Returns the result of adding the quadruple-precision floating-point values 6171 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6172 | for Binary Floating-Point Arithmetic. 
6173 *----------------------------------------------------------------------------*/ 6174 6175 float128 float128_add( float128 a, float128 b STATUS_PARAM ) 6176 { 6177 flag aSign, bSign; 6178 6179 aSign = extractFloat128Sign( a ); 6180 bSign = extractFloat128Sign( b ); 6181 if ( aSign == bSign ) { 6182 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 6183 } 6184 else { 6185 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 6186 } 6187 6188 } 6189 6190 /*---------------------------------------------------------------------------- 6191 | Returns the result of subtracting the quadruple-precision floating-point 6192 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6193 | Standard for Binary Floating-Point Arithmetic. 6194 *----------------------------------------------------------------------------*/ 6195 6196 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) 6197 { 6198 flag aSign, bSign; 6199 6200 aSign = extractFloat128Sign( a ); 6201 bSign = extractFloat128Sign( b ); 6202 if ( aSign == bSign ) { 6203 return subFloat128Sigs( a, b, aSign STATUS_VAR ); 6204 } 6205 else { 6206 return addFloat128Sigs( a, b, aSign STATUS_VAR ); 6207 } 6208 6209 } 6210 6211 /*---------------------------------------------------------------------------- 6212 | Returns the result of multiplying the quadruple-precision floating-point 6213 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6214 | Standard for Binary Floating-Point Arithmetic. 
6215 *----------------------------------------------------------------------------*/ 6216 6217 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) 6218 { 6219 flag aSign, bSign, zSign; 6220 int32 aExp, bExp, zExp; 6221 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6222 float128 z; 6223 6224 aSig1 = extractFloat128Frac1( a ); 6225 aSig0 = extractFloat128Frac0( a ); 6226 aExp = extractFloat128Exp( a ); 6227 aSign = extractFloat128Sign( a ); 6228 bSig1 = extractFloat128Frac1( b ); 6229 bSig0 = extractFloat128Frac0( b ); 6230 bExp = extractFloat128Exp( b ); 6231 bSign = extractFloat128Sign( b ); 6232 zSign = aSign ^ bSign; 6233 if ( aExp == 0x7FFF ) { 6234 if ( ( aSig0 | aSig1 ) 6235 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6236 return propagateFloat128NaN( a, b STATUS_VAR ); 6237 } 6238 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6239 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6240 } 6241 if ( bExp == 0x7FFF ) { 6242 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6243 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6244 invalid: 6245 float_raise( float_flag_invalid STATUS_VAR); 6246 z.low = float128_default_nan_low; 6247 z.high = float128_default_nan_high; 6248 return z; 6249 } 6250 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6251 } 6252 if ( aExp == 0 ) { 6253 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6254 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6255 } 6256 if ( bExp == 0 ) { 6257 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6258 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6259 } 6260 zExp = aExp + bExp - 0x4000; 6261 aSig0 |= LIT64( 0x0001000000000000 ); 6262 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6263 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6264 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6265 zSig2 |= ( zSig3 != 0 ); 6266 if ( LIT64( 
0x0002000000000000 ) <= zSig0 ) { 6267 shift128ExtraRightJamming( 6268 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6269 ++zExp; 6270 } 6271 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 6272 6273 } 6274 6275 /*---------------------------------------------------------------------------- 6276 | Returns the result of dividing the quadruple-precision floating-point value 6277 | `a' by the corresponding value `b'. The operation is performed according to 6278 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6279 *----------------------------------------------------------------------------*/ 6280 6281 float128 float128_div( float128 a, float128 b STATUS_PARAM ) 6282 { 6283 flag aSign, bSign, zSign; 6284 int32 aExp, bExp, zExp; 6285 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6286 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6287 float128 z; 6288 6289 aSig1 = extractFloat128Frac1( a ); 6290 aSig0 = extractFloat128Frac0( a ); 6291 aExp = extractFloat128Exp( a ); 6292 aSign = extractFloat128Sign( a ); 6293 bSig1 = extractFloat128Frac1( b ); 6294 bSig0 = extractFloat128Frac0( b ); 6295 bExp = extractFloat128Exp( b ); 6296 bSign = extractFloat128Sign( b ); 6297 zSign = aSign ^ bSign; 6298 if ( aExp == 0x7FFF ) { 6299 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6300 if ( bExp == 0x7FFF ) { 6301 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6302 goto invalid; 6303 } 6304 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6305 } 6306 if ( bExp == 0x7FFF ) { 6307 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); 6308 return packFloat128( zSign, 0, 0, 0 ); 6309 } 6310 if ( bExp == 0 ) { 6311 if ( ( bSig0 | bSig1 ) == 0 ) { 6312 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6313 invalid: 6314 float_raise( float_flag_invalid STATUS_VAR); 6315 z.low = float128_default_nan_low; 6316 z.high = float128_default_nan_high; 6317 return z; 6318 } 6319 
float_raise( float_flag_divbyzero STATUS_VAR); 6320 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6321 } 6322 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6323 } 6324 if ( aExp == 0 ) { 6325 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6326 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6327 } 6328 zExp = aExp - bExp + 0x3FFD; 6329 shortShift128Left( 6330 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6331 shortShift128Left( 6332 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6333 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6334 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6335 ++zExp; 6336 } 6337 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6338 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6339 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6340 while ( (int64_t) rem0 < 0 ) { 6341 --zSig0; 6342 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6343 } 6344 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6345 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6346 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6347 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6348 while ( (int64_t) rem1 < 0 ) { 6349 --zSig1; 6350 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6351 } 6352 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6353 } 6354 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6355 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); 6356 6357 } 6358 6359 /*---------------------------------------------------------------------------- 6360 | Returns the remainder of the quadruple-precision floating-point value `a' 6361 | with respect to the corresponding value `b'. The operation is performed 6362 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_rem( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, zSign;
    int32 aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;
    float128 z;

    /* Unpack the operands; the sign of `b' is never read. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' is NaN or infinity: propagate a NaN from either operand,
       otherwise (infinite `a') the operation is invalid. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        goto invalid;
    }
    /* `b' is NaN (propagate) or infinity (finite `a' is returned as is). */
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder with respect to zero is invalid. */
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise( float_flag_invalid STATUS_VAR);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        /* A zero `a' is returned unchanged. */
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* `a' is returned unchanged when its exponent is at least two below
       b's. */
    if ( expDiff < -1 ) return a;
    /* Set bit 48 of each high word and left-align: `b' by 15 bits, `a' by
       15 bits, or by 14 when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: 1 if b's significand fits into a's. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Each pass lowers expDiff by 61.  The quotient estimate is reduced by
       4 (clamped at 0) before b*q is subtracted -- presumably so that the
       partial remainder never goes negative; TODO confirm. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits that are still
           needed (q is shifted right by -expDiff), then rescale both
           significands before subtracting b*q. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the remainder turns negative, counting the steps
       in q; alternateASig keeps the last non-negative remainder. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the negative remainder and the last
       non-negative one.  The non-negative one is kept when the sum is
       negative (it has the smaller magnitude), or when the sum is zero and
       q is odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Take the magnitude of the chosen remainder; a negative remainder
       flips the result sign relative to a's sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );

}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaN is propagated, +infinity is returned unchanged, -infinity is
       invalid. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
        if ( ! aSign ) return a;
        goto invalid;
    }
    /* Negative input: -0 is returned unchanged, everything else raises
       float_flag_invalid and yields the default NaN. */
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        /* +0 gives +0; a subnormal is normalized first. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent (bias 0x3FFF removed, 0x3FFE added
       back). */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* 32-bit starting estimate, then widened to 64 bits with one division
       step.  The significand is shifted by 13, or by 12 when the exponent
       is odd. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step the root down while it is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 bits of the root, estimated as remainder / (2 * zSig0). */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Exact correction is only performed when the low 13 bits of the
       estimate are at most 5 (presumably the only case where the estimate's
       error could affect rounding -- TODO confirm). */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero remainder into the lowest root bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift right by 14, collecting shifted-out bits in zSig2; the result
       sign is always 0. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq( float128 a, float128 b STATUS_PARAM )
{

    /* Maximum exponent with a nonzero fraction marks a NaN: raise invalid
       and report "not equal". */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise( float_flag_invalid STATUS_VAR);
        return 0;
    }
    /* Equal when both words match, or when both values are zero regardless
       of sign (low word 0 and the OR of the high words, with the top bit
       shifted out, is 0). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6569 *----------------------------------------------------------------------------*/ 6570 6571 int float128_le( float128 a, float128 b STATUS_PARAM ) 6572 { 6573 flag aSign, bSign; 6574 6575 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6576 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6577 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6578 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6579 ) { 6580 float_raise( float_flag_invalid STATUS_VAR); 6581 return 0; 6582 } 6583 aSign = extractFloat128Sign( a ); 6584 bSign = extractFloat128Sign( b ); 6585 if ( aSign != bSign ) { 6586 return 6587 aSign 6588 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6589 == 0 ); 6590 } 6591 return 6592 aSign ? le128( b.high, b.low, a.high, a.low ) 6593 : le128( a.high, a.low, b.high, b.low ); 6594 6595 } 6596 6597 /*---------------------------------------------------------------------------- 6598 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6599 | the corresponding value `b', and 0 otherwise. The invalid exception is 6600 | raised if either operand is a NaN. The comparison is performed according 6601 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6602 *----------------------------------------------------------------------------*/ 6603 6604 int float128_lt( float128 a, float128 b STATUS_PARAM ) 6605 { 6606 flag aSign, bSign; 6607 6608 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6609 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6610 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6611 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6612 ) { 6613 float_raise( float_flag_invalid STATUS_VAR); 6614 return 0; 6615 } 6616 aSign = extractFloat128Sign( a ); 6617 bSign = extractFloat128Sign( b ); 6618 if ( aSign != bSign ) { 6619 return 6620 aSign 6621 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6622 != 0 ); 6623 } 6624 return 6625 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6626 : lt128( a.high, a.low, b.high, b.low ); 6627 6628 } 6629 6630 /*---------------------------------------------------------------------------- 6631 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6632 | be compared, and 0 otherwise. The invalid exception is raised if either 6633 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6634 | Standard for Binary Floating-Point Arithmetic. 6635 *----------------------------------------------------------------------------*/ 6636 6637 int float128_unordered( float128 a, float128 b STATUS_PARAM ) 6638 { 6639 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6640 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6641 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6642 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6643 ) { 6644 float_raise( float_flag_invalid STATUS_VAR); 6645 return 1; 6646 } 6647 return 0; 6648 } 6649 6650 /*---------------------------------------------------------------------------- 6651 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6652 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6653 | exception. The comparison is performed according to the IEC/IEEE Standard 6654 | for Binary Floating-Point Arithmetic. 
6655 *----------------------------------------------------------------------------*/ 6656 6657 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM ) 6658 { 6659 6660 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6661 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6662 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6663 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6664 ) { 6665 if ( float128_is_signaling_nan( a ) 6666 || float128_is_signaling_nan( b ) ) { 6667 float_raise( float_flag_invalid STATUS_VAR); 6668 } 6669 return 0; 6670 } 6671 return 6672 ( a.low == b.low ) 6673 && ( ( a.high == b.high ) 6674 || ( ( a.low == 0 ) 6675 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6676 ); 6677 6678 } 6679 6680 /*---------------------------------------------------------------------------- 6681 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6682 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6683 | cause an exception. Otherwise, the comparison is performed according to the 6684 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6685 *----------------------------------------------------------------------------*/ 6686 6687 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) 6688 { 6689 flag aSign, bSign; 6690 6691 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6692 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6693 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6694 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6695 ) { 6696 if ( float128_is_signaling_nan( a ) 6697 || float128_is_signaling_nan( b ) ) { 6698 float_raise( float_flag_invalid STATUS_VAR); 6699 } 6700 return 0; 6701 } 6702 aSign = extractFloat128Sign( a ); 6703 bSign = extractFloat128Sign( b ); 6704 if ( aSign != bSign ) { 6705 return 6706 aSign 6707 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6708 == 0 ); 6709 } 6710 return 6711 aSign ? 
le128( b.high, b.low, a.high, a.low ) 6712 : le128( a.high, a.low, b.high, b.low ); 6713 6714 } 6715 6716 /*---------------------------------------------------------------------------- 6717 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6718 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6719 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6720 | Standard for Binary Floating-Point Arithmetic. 6721 *----------------------------------------------------------------------------*/ 6722 6723 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) 6724 { 6725 flag aSign, bSign; 6726 6727 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6728 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6729 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6730 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6731 ) { 6732 if ( float128_is_signaling_nan( a ) 6733 || float128_is_signaling_nan( b ) ) { 6734 float_raise( float_flag_invalid STATUS_VAR); 6735 } 6736 return 0; 6737 } 6738 aSign = extractFloat128Sign( a ); 6739 bSign = extractFloat128Sign( b ); 6740 if ( aSign != bSign ) { 6741 return 6742 aSign 6743 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6744 != 0 ); 6745 } 6746 return 6747 aSign ? lt128( b.high, b.low, a.high, a.low ) 6748 : lt128( a.high, a.low, b.high, b.low ); 6749 6750 } 6751 6752 /*---------------------------------------------------------------------------- 6753 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6754 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6755 | comparison is performed according to the IEC/IEEE Standard for Binary 6756 | Floating-Point Arithmetic. 
6757 *----------------------------------------------------------------------------*/ 6758 6759 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM ) 6760 { 6761 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6762 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6763 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6764 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6765 ) { 6766 if ( float128_is_signaling_nan( a ) 6767 || float128_is_signaling_nan( b ) ) { 6768 float_raise( float_flag_invalid STATUS_VAR); 6769 } 6770 return 1; 6771 } 6772 return 0; 6773 } 6774 6775 /* misc functions */ 6776 float32 uint32_to_float32(uint32_t a STATUS_PARAM) 6777 { 6778 return int64_to_float32(a STATUS_VAR); 6779 } 6780 6781 float64 uint32_to_float64(uint32_t a STATUS_PARAM) 6782 { 6783 return int64_to_float64(a STATUS_VAR); 6784 } 6785 6786 uint32 float32_to_uint32( float32 a STATUS_PARAM ) 6787 { 6788 int64_t v; 6789 uint32 res; 6790 int old_exc_flags = get_float_exception_flags(status); 6791 6792 v = float32_to_int64(a STATUS_VAR); 6793 if (v < 0) { 6794 res = 0; 6795 } else if (v > 0xffffffff) { 6796 res = 0xffffffff; 6797 } else { 6798 return v; 6799 } 6800 set_float_exception_flags(old_exc_flags, status); 6801 float_raise(float_flag_invalid STATUS_VAR); 6802 return res; 6803 } 6804 6805 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) 6806 { 6807 int64_t v; 6808 uint32 res; 6809 int old_exc_flags = get_float_exception_flags(status); 6810 6811 v = float32_to_int64_round_to_zero(a STATUS_VAR); 6812 if (v < 0) { 6813 res = 0; 6814 } else if (v > 0xffffffff) { 6815 res = 0xffffffff; 6816 } else { 6817 return v; 6818 } 6819 set_float_exception_flags(old_exc_flags, status); 6820 float_raise(float_flag_invalid STATUS_VAR); 6821 return res; 6822 } 6823 6824 int_fast16_t float32_to_int16(float32 a STATUS_PARAM) 6825 { 6826 int32_t v; 6827 int_fast16_t res; 6828 int old_exc_flags = get_float_exception_flags(status); 6829 6830 v = 
float32_to_int32(a STATUS_VAR); 6831 if (v < -0x8000) { 6832 res = -0x8000; 6833 } else if (v > 0x7fff) { 6834 res = 0x7fff; 6835 } else { 6836 return v; 6837 } 6838 6839 set_float_exception_flags(old_exc_flags, status); 6840 float_raise(float_flag_invalid STATUS_VAR); 6841 return res; 6842 } 6843 6844 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM) 6845 { 6846 int32_t v; 6847 uint_fast16_t res; 6848 int old_exc_flags = get_float_exception_flags(status); 6849 6850 v = float32_to_int32(a STATUS_VAR); 6851 if (v < 0) { 6852 res = 0; 6853 } else if (v > 0xffff) { 6854 res = 0xffff; 6855 } else { 6856 return v; 6857 } 6858 6859 set_float_exception_flags(old_exc_flags, status); 6860 float_raise(float_flag_invalid STATUS_VAR); 6861 return res; 6862 } 6863 6864 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM) 6865 { 6866 int64_t v; 6867 uint_fast16_t res; 6868 int old_exc_flags = get_float_exception_flags(status); 6869 6870 v = float32_to_int64_round_to_zero(a STATUS_VAR); 6871 if (v < 0) { 6872 res = 0; 6873 } else if (v > 0xffff) { 6874 res = 0xffff; 6875 } else { 6876 return v; 6877 } 6878 set_float_exception_flags(old_exc_flags, status); 6879 float_raise(float_flag_invalid STATUS_VAR); 6880 return res; 6881 } 6882 6883 uint32 float64_to_uint32( float64 a STATUS_PARAM ) 6884 { 6885 uint64_t v; 6886 uint32 res; 6887 int old_exc_flags = get_float_exception_flags(status); 6888 6889 v = float64_to_uint64(a STATUS_VAR); 6890 if (v > 0xffffffff) { 6891 res = 0xffffffff; 6892 } else { 6893 return v; 6894 } 6895 set_float_exception_flags(old_exc_flags, status); 6896 float_raise(float_flag_invalid STATUS_VAR); 6897 return res; 6898 } 6899 6900 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) 6901 { 6902 uint64_t v; 6903 uint32 res; 6904 int old_exc_flags = get_float_exception_flags(status); 6905 6906 v = float64_to_uint64_round_to_zero(a STATUS_VAR); 6907 if (v > 0xffffffff) { 6908 res = 0xffffffff; 6909 } else { 6910 return v; 6911 
} 6912 set_float_exception_flags(old_exc_flags, status); 6913 float_raise(float_flag_invalid STATUS_VAR); 6914 return res; 6915 } 6916 6917 int_fast16_t float64_to_int16(float64 a STATUS_PARAM) 6918 { 6919 int64_t v; 6920 int_fast16_t res; 6921 int old_exc_flags = get_float_exception_flags(status); 6922 6923 v = float64_to_int32(a STATUS_VAR); 6924 if (v < -0x8000) { 6925 res = -0x8000; 6926 } else if (v > 0x7fff) { 6927 res = 0x7fff; 6928 } else { 6929 return v; 6930 } 6931 6932 set_float_exception_flags(old_exc_flags, status); 6933 float_raise(float_flag_invalid STATUS_VAR); 6934 return res; 6935 } 6936 6937 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM) 6938 { 6939 int64_t v; 6940 uint_fast16_t res; 6941 int old_exc_flags = get_float_exception_flags(status); 6942 6943 v = float64_to_int32(a STATUS_VAR); 6944 if (v < 0) { 6945 res = 0; 6946 } else if (v > 0xffff) { 6947 res = 0xffff; 6948 } else { 6949 return v; 6950 } 6951 6952 set_float_exception_flags(old_exc_flags, status); 6953 float_raise(float_flag_invalid STATUS_VAR); 6954 return res; 6955 } 6956 6957 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM) 6958 { 6959 int64_t v; 6960 uint_fast16_t res; 6961 int old_exc_flags = get_float_exception_flags(status); 6962 6963 v = float64_to_int64_round_to_zero(a STATUS_VAR); 6964 if (v < 0) { 6965 res = 0; 6966 } else if (v > 0xffff) { 6967 res = 0xffff; 6968 } else { 6969 return v; 6970 } 6971 set_float_exception_flags(old_exc_flags, status); 6972 float_raise(float_flag_invalid STATUS_VAR); 6973 return res; 6974 } 6975 6976 /*---------------------------------------------------------------------------- 6977 | Returns the result of converting the double-precision floating-point value 6978 | `a' to the 64-bit unsigned integer format. 
The conversion is 6979 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6980 | Arithmetic---which means in particular that the conversion is rounded 6981 | according to the current rounding mode. If `a' is a NaN, the largest 6982 | positive integer is returned. If the conversion overflows, the 6983 | largest unsigned integer is returned. If 'a' is negative, the value is 6984 | rounded and zero is returned; negative values that do not round to zero 6985 | will raise the inexact exception. 6986 *----------------------------------------------------------------------------*/ 6987 6988 uint64_t float64_to_uint64(float64 a STATUS_PARAM) 6989 { 6990 flag aSign; 6991 int_fast16_t aExp, shiftCount; 6992 uint64_t aSig, aSigExtra; 6993 a = float64_squash_input_denormal(a STATUS_VAR); 6994 6995 aSig = extractFloat64Frac(a); 6996 aExp = extractFloat64Exp(a); 6997 aSign = extractFloat64Sign(a); 6998 if (aSign && (aExp > 1022)) { 6999 float_raise(float_flag_invalid STATUS_VAR); 7000 if (float64_is_any_nan(a)) { 7001 return LIT64(0xFFFFFFFFFFFFFFFF); 7002 } else { 7003 return 0; 7004 } 7005 } 7006 if (aExp) { 7007 aSig |= LIT64(0x0010000000000000); 7008 } 7009 shiftCount = 0x433 - aExp; 7010 if (shiftCount <= 0) { 7011 if (0x43E < aExp) { 7012 float_raise(float_flag_invalid STATUS_VAR); 7013 return LIT64(0xFFFFFFFFFFFFFFFF); 7014 } 7015 aSigExtra = 0; 7016 aSig <<= -shiftCount; 7017 } else { 7018 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7019 } 7020 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR); 7021 } 7022 7023 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) 7024 { 7025 signed char current_rounding_mode = STATUS(float_rounding_mode); 7026 set_float_rounding_mode(float_round_to_zero STATUS_VAR); 7027 int64_t v = float64_to_uint64(a STATUS_VAR); 7028 set_float_rounding_mode(current_rounding_mode STATUS_VAR); 7029 return v; 7030 } 7031 7032 #define COMPARE(s, nan_exp) \ 7033 INLINE int float ## s ## 
_compare_internal( float ## s a, float ## s b, \ 7034 int is_quiet STATUS_PARAM ) \ 7035 { \ 7036 flag aSign, bSign; \ 7037 uint ## s ## _t av, bv; \ 7038 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 7039 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 7040 \ 7041 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7042 extractFloat ## s ## Frac( a ) ) || \ 7043 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7044 extractFloat ## s ## Frac( b ) )) { \ 7045 if (!is_quiet || \ 7046 float ## s ## _is_signaling_nan( a ) || \ 7047 float ## s ## _is_signaling_nan( b ) ) { \ 7048 float_raise( float_flag_invalid STATUS_VAR); \ 7049 } \ 7050 return float_relation_unordered; \ 7051 } \ 7052 aSign = extractFloat ## s ## Sign( a ); \ 7053 bSign = extractFloat ## s ## Sign( b ); \ 7054 av = float ## s ## _val(a); \ 7055 bv = float ## s ## _val(b); \ 7056 if ( aSign != bSign ) { \ 7057 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7058 /* zero case */ \ 7059 return float_relation_equal; \ 7060 } else { \ 7061 return 1 - (2 * aSign); \ 7062 } \ 7063 } else { \ 7064 if (av == bv) { \ 7065 return float_relation_equal; \ 7066 } else { \ 7067 return 1 - 2 * (aSign ^ ( av < bv )); \ 7068 } \ 7069 } \ 7070 } \ 7071 \ 7072 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ 7073 { \ 7074 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ 7075 } \ 7076 \ 7077 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ 7078 { \ 7079 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ 7080 } 7081 7082 COMPARE(32, 0xff) 7083 COMPARE(64, 0x7ff) 7084 7085 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b, 7086 int is_quiet STATUS_PARAM ) 7087 { 7088 flag aSign, bSign; 7089 7090 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7091 ( extractFloatx80Frac( a )<<1 ) ) || 7092 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7093 ( extractFloatx80Frac( b )<<1 ) )) { 7094 if (!is_quiet || 7095 
floatx80_is_signaling_nan( a ) || 7096 floatx80_is_signaling_nan( b ) ) { 7097 float_raise( float_flag_invalid STATUS_VAR); 7098 } 7099 return float_relation_unordered; 7100 } 7101 aSign = extractFloatx80Sign( a ); 7102 bSign = extractFloatx80Sign( b ); 7103 if ( aSign != bSign ) { 7104 7105 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7106 ( ( a.low | b.low ) == 0 ) ) { 7107 /* zero case */ 7108 return float_relation_equal; 7109 } else { 7110 return 1 - (2 * aSign); 7111 } 7112 } else { 7113 if (a.low == b.low && a.high == b.high) { 7114 return float_relation_equal; 7115 } else { 7116 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7117 } 7118 } 7119 } 7120 7121 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM ) 7122 { 7123 return floatx80_compare_internal(a, b, 0 STATUS_VAR); 7124 } 7125 7126 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM ) 7127 { 7128 return floatx80_compare_internal(a, b, 1 STATUS_VAR); 7129 } 7130 7131 INLINE int float128_compare_internal( float128 a, float128 b, 7132 int is_quiet STATUS_PARAM ) 7133 { 7134 flag aSign, bSign; 7135 7136 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7137 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7138 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7139 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7140 if (!is_quiet || 7141 float128_is_signaling_nan( a ) || 7142 float128_is_signaling_nan( b ) ) { 7143 float_raise( float_flag_invalid STATUS_VAR); 7144 } 7145 return float_relation_unordered; 7146 } 7147 aSign = extractFloat128Sign( a ); 7148 bSign = extractFloat128Sign( b ); 7149 if ( aSign != bSign ) { 7150 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7151 /* zero case */ 7152 return float_relation_equal; 7153 } else { 7154 return 1 - (2 * aSign); 7155 } 7156 } else { 7157 if (a.low == b.low && a.high == b.high) { 7158 return float_relation_equal; 7159 } else { 7160 return 1 - 2 * (aSign ^ ( lt128( 
a.high, a.low, b.high, b.low ) )); 7161 } 7162 } 7163 } 7164 7165 int float128_compare( float128 a, float128 b STATUS_PARAM ) 7166 { 7167 return float128_compare_internal(a, b, 0 STATUS_VAR); 7168 } 7169 7170 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) 7171 { 7172 return float128_compare_internal(a, b, 1 STATUS_VAR); 7173 } 7174 7175 /* min() and max() functions. These can't be implemented as 7176 * 'compare and pick one input' because that would mishandle 7177 * NaNs and +0 vs -0. 7178 * 7179 * minnum() and maxnum() functions. These are similar to the min() 7180 * and max() functions but if one of the arguments is a QNaN and 7181 * the other is numerical then the numerical argument is returned. 7182 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7183 * and maxNum() operations. min() and max() are the typical min/max 7184 * semantics provided by many CPUs which predate that specification. 7185 */ 7186 #define MINMAX(s) \ 7187 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7188 int ismin, int isieee STATUS_PARAM) \ 7189 { \ 7190 flag aSign, bSign; \ 7191 uint ## s ## _t av, bv; \ 7192 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \ 7193 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \ 7194 if (float ## s ## _is_any_nan(a) || \ 7195 float ## s ## _is_any_nan(b)) { \ 7196 if (isieee) { \ 7197 if (float ## s ## _is_quiet_nan(a) && \ 7198 !float ## s ##_is_any_nan(b)) { \ 7199 return b; \ 7200 } else if (float ## s ## _is_quiet_nan(b) && \ 7201 !float ## s ## _is_any_nan(a)) { \ 7202 return a; \ 7203 } \ 7204 } \ 7205 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \ 7206 } \ 7207 aSign = extractFloat ## s ## Sign(a); \ 7208 bSign = extractFloat ## s ## Sign(b); \ 7209 av = float ## s ## _val(a); \ 7210 bv = float ## s ## _val(b); \ 7211 if (aSign != bSign) { \ 7212 if (ismin) { \ 7213 return aSign ? a : b; \ 7214 } else { \ 7215 return aSign ? 
b : a; \ 7216 } \ 7217 } else { \ 7218 if (ismin) { \ 7219 return (aSign ^ (av < bv)) ? a : b; \ 7220 } else { \ 7221 return (aSign ^ (av < bv)) ? b : a; \ 7222 } \ 7223 } \ 7224 } \ 7225 \ 7226 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \ 7227 { \ 7228 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \ 7229 } \ 7230 \ 7231 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \ 7232 { \ 7233 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \ 7234 } \ 7235 \ 7236 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \ 7237 { \ 7238 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \ 7239 } \ 7240 \ 7241 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \ 7242 { \ 7243 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \ 7244 } 7245 7246 MINMAX(32) 7247 MINMAX(64) 7248 7249 7250 /* Multiply A by 2 raised to the power N. */ 7251 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) 7252 { 7253 flag aSign; 7254 int16_t aExp; 7255 uint32_t aSig; 7256 7257 a = float32_squash_input_denormal(a STATUS_VAR); 7258 aSig = extractFloat32Frac( a ); 7259 aExp = extractFloat32Exp( a ); 7260 aSign = extractFloat32Sign( a ); 7261 7262 if ( aExp == 0xFF ) { 7263 if ( aSig ) { 7264 return propagateFloat32NaN( a, a STATUS_VAR ); 7265 } 7266 return a; 7267 } 7268 if (aExp != 0) { 7269 aSig |= 0x00800000; 7270 } else if (aSig == 0) { 7271 return a; 7272 } else { 7273 aExp++; 7274 } 7275 7276 if (n > 0x200) { 7277 n = 0x200; 7278 } else if (n < -0x200) { 7279 n = -0x200; 7280 } 7281 7282 aExp += n - 1; 7283 aSig <<= 7; 7284 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); 7285 } 7286 7287 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) 7288 { 7289 flag aSign; 7290 int16_t aExp; 7291 uint64_t aSig; 7292 7293 a = float64_squash_input_denormal(a STATUS_VAR); 7294 aSig = extractFloat64Frac( a ); 7295 aExp = extractFloat64Exp( a ); 7296 aSign = 
extractFloat64Sign( a ); 7297 7298 if ( aExp == 0x7FF ) { 7299 if ( aSig ) { 7300 return propagateFloat64NaN( a, a STATUS_VAR ); 7301 } 7302 return a; 7303 } 7304 if (aExp != 0) { 7305 aSig |= LIT64( 0x0010000000000000 ); 7306 } else if (aSig == 0) { 7307 return a; 7308 } else { 7309 aExp++; 7310 } 7311 7312 if (n > 0x1000) { 7313 n = 0x1000; 7314 } else if (n < -0x1000) { 7315 n = -0x1000; 7316 } 7317 7318 aExp += n - 1; 7319 aSig <<= 10; 7320 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); 7321 } 7322 7323 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) 7324 { 7325 flag aSign; 7326 int32_t aExp; 7327 uint64_t aSig; 7328 7329 aSig = extractFloatx80Frac( a ); 7330 aExp = extractFloatx80Exp( a ); 7331 aSign = extractFloatx80Sign( a ); 7332 7333 if ( aExp == 0x7FFF ) { 7334 if ( aSig<<1 ) { 7335 return propagateFloatx80NaN( a, a STATUS_VAR ); 7336 } 7337 return a; 7338 } 7339 7340 if (aExp == 0) { 7341 if (aSig == 0) { 7342 return a; 7343 } 7344 aExp++; 7345 } 7346 7347 if (n > 0x10000) { 7348 n = 0x10000; 7349 } else if (n < -0x10000) { 7350 n = -0x10000; 7351 } 7352 7353 aExp += n; 7354 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), 7355 aSign, aExp, aSig, 0 STATUS_VAR ); 7356 } 7357 7358 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) 7359 { 7360 flag aSign; 7361 int32_t aExp; 7362 uint64_t aSig0, aSig1; 7363 7364 aSig1 = extractFloat128Frac1( a ); 7365 aSig0 = extractFloat128Frac0( a ); 7366 aExp = extractFloat128Exp( a ); 7367 aSign = extractFloat128Sign( a ); 7368 if ( aExp == 0x7FFF ) { 7369 if ( aSig0 | aSig1 ) { 7370 return propagateFloat128NaN( a, a STATUS_VAR ); 7371 } 7372 return a; 7373 } 7374 if (aExp != 0) { 7375 aSig0 |= LIT64( 0x0001000000000000 ); 7376 } else if (aSig0 == 0 && aSig1 == 0) { 7377 return a; 7378 } else { 7379 aExp++; 7380 } 7381 7382 if (n > 0x10000) { 7383 n = 0x10000; 7384 } else if (n < -0x10000) { 7385 n = -0x10000; 7386 } 7387 7388 aExp += n - 1; 7389 return 
normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7390 STATUS_VAR ); 7391 7392 } 7393