1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 137 | and 7, and returns the properly rounded 32-bit integer corresponding to the 138 | input. If `zSign' is 1, the input is negated before being converted to an 139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 140 | is simply rounded to an integer, with the inexact exception raised if the 141 | input cannot be represented exactly as an integer. However, if the fixed- 142 | point input is too large, the invalid exception is raised and the largest 143 | positive or negative integer is returned. 
144 *----------------------------------------------------------------------------*/ 145 146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 147 { 148 int8_t roundingMode; 149 flag roundNearestEven; 150 int8_t roundIncrement, roundBits; 151 int32_t z; 152 153 roundingMode = status->float_rounding_mode; 154 roundNearestEven = ( roundingMode == float_round_nearest_even ); 155 switch (roundingMode) { 156 case float_round_nearest_even: 157 case float_round_ties_away: 158 roundIncrement = 0x40; 159 break; 160 case float_round_to_zero: 161 roundIncrement = 0; 162 break; 163 case float_round_up: 164 roundIncrement = zSign ? 0 : 0x7f; 165 break; 166 case float_round_down: 167 roundIncrement = zSign ? 0x7f : 0; 168 break; 169 default: 170 abort(); 171 } 172 roundBits = absZ & 0x7F; 173 absZ = ( absZ + roundIncrement )>>7; 174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 175 z = absZ; 176 if ( zSign ) z = - z; 177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 178 float_raise(float_flag_invalid, status); 179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 180 } 181 if (roundBits) { 182 status->float_exception_flags |= float_flag_inexact; 183 } 184 return z; 185 186 } 187 188 /*---------------------------------------------------------------------------- 189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 190 | `absZ1', with binary point between bits 63 and 64 (between the input words), 191 | and returns the properly rounded 64-bit integer corresponding to the input. 192 | If `zSign' is 1, the input is negated before being converted to an integer. 193 | Ordinarily, the fixed-point input is simply rounded to an integer, with 194 | the inexact exception raised if the input cannot be represented exactly as 195 | an integer. However, if the fixed-point input is too large, the invalid 196 | exception is raised and the largest positive or negative integer is 197 | returned. 
198 *----------------------------------------------------------------------------*/ 199 200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 201 float_status *status) 202 { 203 int8_t roundingMode; 204 flag roundNearestEven, increment; 205 int64_t z; 206 207 roundingMode = status->float_rounding_mode; 208 roundNearestEven = ( roundingMode == float_round_nearest_even ); 209 switch (roundingMode) { 210 case float_round_nearest_even: 211 case float_round_ties_away: 212 increment = ((int64_t) absZ1 < 0); 213 break; 214 case float_round_to_zero: 215 increment = 0; 216 break; 217 case float_round_up: 218 increment = !zSign && absZ1; 219 break; 220 case float_round_down: 221 increment = zSign && absZ1; 222 break; 223 default: 224 abort(); 225 } 226 if ( increment ) { 227 ++absZ0; 228 if ( absZ0 == 0 ) goto overflow; 229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 230 } 231 z = absZ0; 232 if ( zSign ) z = - z; 233 if ( z && ( ( z < 0 ) ^ zSign ) ) { 234 overflow: 235 float_raise(float_flag_invalid, status); 236 return 237 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 238 : LIT64( 0x7FFFFFFFFFFFFFFF ); 239 } 240 if (absZ1) { 241 status->float_exception_flags |= float_flag_inexact; 242 } 243 return z; 244 245 } 246 247 /*---------------------------------------------------------------------------- 248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 249 | `absZ1', with binary point between bits 63 and 64 (between the input words), 250 | and returns the properly rounded 64-bit unsigned integer corresponding to the 251 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 252 | with the inexact exception raised if the input cannot be represented exactly 253 | as an integer. However, if the fixed-point input is too large, the invalid 254 | exception is raised and the largest unsigned integer is returned. 
255 *----------------------------------------------------------------------------*/ 256 257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 258 uint64_t absZ1, float_status *status) 259 { 260 int8_t roundingMode; 261 flag roundNearestEven, increment; 262 263 roundingMode = status->float_rounding_mode; 264 roundNearestEven = (roundingMode == float_round_nearest_even); 265 switch (roundingMode) { 266 case float_round_nearest_even: 267 case float_round_ties_away: 268 increment = ((int64_t)absZ1 < 0); 269 break; 270 case float_round_to_zero: 271 increment = 0; 272 break; 273 case float_round_up: 274 increment = !zSign && absZ1; 275 break; 276 case float_round_down: 277 increment = zSign && absZ1; 278 break; 279 default: 280 abort(); 281 } 282 if (increment) { 283 ++absZ0; 284 if (absZ0 == 0) { 285 float_raise(float_flag_invalid, status); 286 return LIT64(0xFFFFFFFFFFFFFFFF); 287 } 288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 289 } 290 291 if (zSign && absZ0) { 292 float_raise(float_flag_invalid, status); 293 return 0; 294 } 295 296 if (absZ1) { 297 status->float_exception_flags |= float_flag_inexact; 298 } 299 return absZ0; 300 } 301 302 /*---------------------------------------------------------------------------- 303 | Returns the fraction bits of the single-precision floating-point value `a'. 304 *----------------------------------------------------------------------------*/ 305 306 static inline uint32_t extractFloat32Frac( float32 a ) 307 { 308 309 return float32_val(a) & 0x007FFFFF; 310 311 } 312 313 /*---------------------------------------------------------------------------- 314 | Returns the exponent bits of the single-precision floating-point value `a'. 
315 *----------------------------------------------------------------------------*/ 316 317 static inline int extractFloat32Exp(float32 a) 318 { 319 320 return ( float32_val(a)>>23 ) & 0xFF; 321 322 } 323 324 /*---------------------------------------------------------------------------- 325 | Returns the sign bit of the single-precision floating-point value `a'. 326 *----------------------------------------------------------------------------*/ 327 328 static inline flag extractFloat32Sign( float32 a ) 329 { 330 331 return float32_val(a)>>31; 332 333 } 334 335 /*---------------------------------------------------------------------------- 336 | If `a' is denormal and we are in flush-to-zero mode then set the 337 | input-denormal exception and return zero. Otherwise just return the value. 338 *----------------------------------------------------------------------------*/ 339 float32 float32_squash_input_denormal(float32 a, float_status *status) 340 { 341 if (status->flush_inputs_to_zero) { 342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 343 float_raise(float_flag_input_denormal, status); 344 return make_float32(float32_val(a) & 0x80000000); 345 } 346 } 347 return a; 348 } 349 350 /*---------------------------------------------------------------------------- 351 | Normalizes the subnormal single-precision floating-point value represented 352 | by the denormalized significand `aSig'. The normalized exponent and 353 | significand are stored at the locations pointed to by `zExpPtr' and 354 | `zSigPtr', respectively. 
355 *----------------------------------------------------------------------------*/ 356 357 static void 358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 359 { 360 int8_t shiftCount; 361 362 shiftCount = countLeadingZeros32( aSig ) - 8; 363 *zSigPtr = aSig<<shiftCount; 364 *zExpPtr = 1 - shiftCount; 365 366 } 367 368 /*---------------------------------------------------------------------------- 369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 370 | single-precision floating-point value, returning the result. After being 371 | shifted into the proper positions, the three fields are simply added 372 | together to form the result. This means that any integer portion of `zSig' 373 | will be added into the exponent. Since a properly normalized significand 374 | will have an integer portion equal to 1, the `zExp' input should be 1 less 375 | than the desired result exponent whenever `zSig' is a complete, normalized 376 | significand. 377 *----------------------------------------------------------------------------*/ 378 379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 380 { 381 382 return make_float32( 383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 384 385 } 386 387 /*---------------------------------------------------------------------------- 388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 389 | and significand `zSig', and returns the proper single-precision floating- 390 | point value corresponding to the abstract input. Ordinarily, the abstract 391 | value is simply rounded and packed into the single-precision format, with 392 | the inexact exception raised if the abstract input cannot be represented 393 | exactly. However, if the abstract value is too large, the overflow and 394 | inexact exceptions are raised and an infinity or maximal finite value is 395 | returned. 
If the abstract value is too small, the input value is rounded to 396 | a subnormal number, and the underflow and inexact exceptions are raised if 397 | the abstract input cannot be represented exactly as a subnormal single- 398 | precision floating-point number. 399 | The input significand `zSig' has its binary point between bits 30 400 | and 29, which is 7 bits to the left of the usual location. This shifted 401 | significand must be normalized or smaller. If `zSig' is not normalized, 402 | `zExp' must be 0; in that case, the result returned is a subnormal number, 403 | and it must not require rounding. In the usual case that `zSig' is 404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 405 | The handling of underflow and overflow follows the IEC/IEEE Standard for 406 | Binary Floating-Point Arithmetic. 407 *----------------------------------------------------------------------------*/ 408 409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 410 float_status *status) 411 { 412 int8_t roundingMode; 413 flag roundNearestEven; 414 int8_t roundIncrement, roundBits; 415 flag isTiny; 416 417 roundingMode = status->float_rounding_mode; 418 roundNearestEven = ( roundingMode == float_round_nearest_even ); 419 switch (roundingMode) { 420 case float_round_nearest_even: 421 case float_round_ties_away: 422 roundIncrement = 0x40; 423 break; 424 case float_round_to_zero: 425 roundIncrement = 0; 426 break; 427 case float_round_up: 428 roundIncrement = zSign ? 0 : 0x7f; 429 break; 430 case float_round_down: 431 roundIncrement = zSign ? 
0x7f : 0; 432 break; 433 default: 434 abort(); 435 break; 436 } 437 roundBits = zSig & 0x7F; 438 if ( 0xFD <= (uint16_t) zExp ) { 439 if ( ( 0xFD < zExp ) 440 || ( ( zExp == 0xFD ) 441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 442 ) { 443 float_raise(float_flag_overflow | float_flag_inexact, status); 444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 445 } 446 if ( zExp < 0 ) { 447 if (status->flush_to_zero) { 448 float_raise(float_flag_output_denormal, status); 449 return packFloat32(zSign, 0, 0); 450 } 451 isTiny = 452 (status->float_detect_tininess 453 == float_tininess_before_rounding) 454 || ( zExp < -1 ) 455 || ( zSig + roundIncrement < 0x80000000 ); 456 shift32RightJamming( zSig, - zExp, &zSig ); 457 zExp = 0; 458 roundBits = zSig & 0x7F; 459 if (isTiny && roundBits) { 460 float_raise(float_flag_underflow, status); 461 } 462 } 463 } 464 if (roundBits) { 465 status->float_exception_flags |= float_flag_inexact; 466 } 467 zSig = ( zSig + roundIncrement )>>7; 468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 469 if ( zSig == 0 ) zExp = 0; 470 return packFloat32( zSign, zExp, zSig ); 471 472 } 473 474 /*---------------------------------------------------------------------------- 475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 476 | and significand `zSig', and returns the proper single-precision floating- 477 | point value corresponding to the abstract input. This routine is just like 478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 480 | floating-point exponent. 
481 *----------------------------------------------------------------------------*/ 482 483 static float32 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 485 float_status *status) 486 { 487 int8_t shiftCount; 488 489 shiftCount = countLeadingZeros32( zSig ) - 1; 490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 491 status); 492 493 } 494 495 /*---------------------------------------------------------------------------- 496 | Returns the fraction bits of the double-precision floating-point value `a'. 497 *----------------------------------------------------------------------------*/ 498 499 static inline uint64_t extractFloat64Frac( float64 a ) 500 { 501 502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 503 504 } 505 506 /*---------------------------------------------------------------------------- 507 | Returns the exponent bits of the double-precision floating-point value `a'. 508 *----------------------------------------------------------------------------*/ 509 510 static inline int extractFloat64Exp(float64 a) 511 { 512 513 return ( float64_val(a)>>52 ) & 0x7FF; 514 515 } 516 517 /*---------------------------------------------------------------------------- 518 | Returns the sign bit of the double-precision floating-point value `a'. 519 *----------------------------------------------------------------------------*/ 520 521 static inline flag extractFloat64Sign( float64 a ) 522 { 523 524 return float64_val(a)>>63; 525 526 } 527 528 /*---------------------------------------------------------------------------- 529 | If `a' is denormal and we are in flush-to-zero mode then set the 530 | input-denormal exception and return zero. Otherwise just return the value. 
531 *----------------------------------------------------------------------------*/ 532 float64 float64_squash_input_denormal(float64 a, float_status *status) 533 { 534 if (status->flush_inputs_to_zero) { 535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 536 float_raise(float_flag_input_denormal, status); 537 return make_float64(float64_val(a) & (1ULL << 63)); 538 } 539 } 540 return a; 541 } 542 543 /*---------------------------------------------------------------------------- 544 | Normalizes the subnormal double-precision floating-point value represented 545 | by the denormalized significand `aSig'. The normalized exponent and 546 | significand are stored at the locations pointed to by `zExpPtr' and 547 | `zSigPtr', respectively. 548 *----------------------------------------------------------------------------*/ 549 550 static void 551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 552 { 553 int8_t shiftCount; 554 555 shiftCount = countLeadingZeros64( aSig ) - 11; 556 *zSigPtr = aSig<<shiftCount; 557 *zExpPtr = 1 - shiftCount; 558 559 } 560 561 /*---------------------------------------------------------------------------- 562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 563 | double-precision floating-point value, returning the result. After being 564 | shifted into the proper positions, the three fields are simply added 565 | together to form the result. This means that any integer portion of `zSig' 566 | will be added into the exponent. Since a properly normalized significand 567 | will have an integer portion equal to 1, the `zExp' input should be 1 less 568 | than the desired result exponent whenever `zSig' is a complete, normalized 569 | significand. 
570 *----------------------------------------------------------------------------*/ 571 572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 573 { 574 575 return make_float64( 576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 577 578 } 579 580 /*---------------------------------------------------------------------------- 581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 582 | and significand `zSig', and returns the proper double-precision floating- 583 | point value corresponding to the abstract input. Ordinarily, the abstract 584 | value is simply rounded and packed into the double-precision format, with 585 | the inexact exception raised if the abstract input cannot be represented 586 | exactly. However, if the abstract value is too large, the overflow and 587 | inexact exceptions are raised and an infinity or maximal finite value is 588 | returned. If the abstract value is too small, the input value is rounded to 589 | a subnormal number, and the underflow and inexact exceptions are raised if 590 | the abstract input cannot be represented exactly as a subnormal double- 591 | precision floating-point number. 592 | The input significand `zSig' has its binary point between bits 62 593 | and 61, which is 10 bits to the left of the usual location. This shifted 594 | significand must be normalized or smaller. If `zSig' is not normalized, 595 | `zExp' must be 0; in that case, the result returned is a subnormal number, 596 | and it must not require rounding. In the usual case that `zSig' is 597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 598 | The handling of underflow and overflow follows the IEC/IEEE Standard for 599 | Binary Floating-Point Arithmetic. 
600 *----------------------------------------------------------------------------*/ 601 602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 603 float_status *status) 604 { 605 int8_t roundingMode; 606 flag roundNearestEven; 607 int roundIncrement, roundBits; 608 flag isTiny; 609 610 roundingMode = status->float_rounding_mode; 611 roundNearestEven = ( roundingMode == float_round_nearest_even ); 612 switch (roundingMode) { 613 case float_round_nearest_even: 614 case float_round_ties_away: 615 roundIncrement = 0x200; 616 break; 617 case float_round_to_zero: 618 roundIncrement = 0; 619 break; 620 case float_round_up: 621 roundIncrement = zSign ? 0 : 0x3ff; 622 break; 623 case float_round_down: 624 roundIncrement = zSign ? 0x3ff : 0; 625 break; 626 default: 627 abort(); 628 } 629 roundBits = zSig & 0x3FF; 630 if ( 0x7FD <= (uint16_t) zExp ) { 631 if ( ( 0x7FD < zExp ) 632 || ( ( zExp == 0x7FD ) 633 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 634 ) { 635 float_raise(float_flag_overflow | float_flag_inexact, status); 636 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 637 } 638 if ( zExp < 0 ) { 639 if (status->flush_to_zero) { 640 float_raise(float_flag_output_denormal, status); 641 return packFloat64(zSign, 0, 0); 642 } 643 isTiny = 644 (status->float_detect_tininess 645 == float_tininess_before_rounding) 646 || ( zExp < -1 ) 647 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 648 shift64RightJamming( zSig, - zExp, &zSig ); 649 zExp = 0; 650 roundBits = zSig & 0x3FF; 651 if (isTiny && roundBits) { 652 float_raise(float_flag_underflow, status); 653 } 654 } 655 } 656 if (roundBits) { 657 status->float_exception_flags |= float_flag_inexact; 658 } 659 zSig = ( zSig + roundIncrement )>>10; 660 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 661 if ( zSig == 0 ) zExp = 0; 662 return packFloat64( zSign, zExp, zSig ); 663 664 } 665 666 
/*---------------------------------------------------------------------------- 667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 668 | and significand `zSig', and returns the proper double-precision floating- 669 | point value corresponding to the abstract input. This routine is just like 670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 672 | floating-point exponent. 673 *----------------------------------------------------------------------------*/ 674 675 static float64 676 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 677 float_status *status) 678 { 679 int8_t shiftCount; 680 681 shiftCount = countLeadingZeros64( zSig ) - 1; 682 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 683 status); 684 685 } 686 687 /*---------------------------------------------------------------------------- 688 | Returns the fraction bits of the extended double-precision floating-point 689 | value `a'. 690 *----------------------------------------------------------------------------*/ 691 692 static inline uint64_t extractFloatx80Frac( floatx80 a ) 693 { 694 695 return a.low; 696 697 } 698 699 /*---------------------------------------------------------------------------- 700 | Returns the exponent bits of the extended double-precision floating-point 701 | value `a'. 702 *----------------------------------------------------------------------------*/ 703 704 static inline int32_t extractFloatx80Exp( floatx80 a ) 705 { 706 707 return a.high & 0x7FFF; 708 709 } 710 711 /*---------------------------------------------------------------------------- 712 | Returns the sign bit of the extended double-precision floating-point value 713 | `a'. 
714 *----------------------------------------------------------------------------*/ 715 716 static inline flag extractFloatx80Sign( floatx80 a ) 717 { 718 719 return a.high>>15; 720 721 } 722 723 /*---------------------------------------------------------------------------- 724 | Normalizes the subnormal extended double-precision floating-point value 725 | represented by the denormalized significand `aSig'. The normalized exponent 726 | and significand are stored at the locations pointed to by `zExpPtr' and 727 | `zSigPtr', respectively. 728 *----------------------------------------------------------------------------*/ 729 730 static void 731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 732 { 733 int8_t shiftCount; 734 735 shiftCount = countLeadingZeros64( aSig ); 736 *zSigPtr = aSig<<shiftCount; 737 *zExpPtr = 1 - shiftCount; 738 739 } 740 741 /*---------------------------------------------------------------------------- 742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 743 | extended double-precision floating-point value, returning the result. 744 *----------------------------------------------------------------------------*/ 745 746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 747 { 748 floatx80 z; 749 750 z.low = zSig; 751 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 752 return z; 753 754 } 755 756 /*---------------------------------------------------------------------------- 757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 758 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 759 | and returns the proper extended double-precision floating-point value 760 | corresponding to the abstract input. Ordinarily, the abstract value is 761 | rounded and packed into the extended double-precision format, with the 762 | inexact exception raised if the abstract input cannot be represented 763 | exactly. 
However, if the abstract value is too large, the overflow and 764 | inexact exceptions are raised and an infinity or maximal finite value is 765 | returned. If the abstract value is too small, the input value is rounded to 766 | a subnormal number, and the underflow and inexact exceptions are raised if 767 | the abstract input cannot be represented exactly as a subnormal extended 768 | double-precision floating-point number. 769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 770 | number of bits as single or double precision, respectively. Otherwise, the 771 | result is rounded to the full precision of the extended double-precision 772 | format. 773 | The input significand must be normalized or smaller. If the input 774 | significand is not normalized, `zExp' must be 0; in that case, the result 775 | returned is a subnormal number, and it must not require rounding. The 776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 777 | Floating-Point Arithmetic. 
778 *----------------------------------------------------------------------------*/ 779 780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 781 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 782 float_status *status) 783 { 784 int8_t roundingMode; 785 flag roundNearestEven, increment, isTiny; 786 int64_t roundIncrement, roundMask, roundBits; 787 788 roundingMode = status->float_rounding_mode; 789 roundNearestEven = ( roundingMode == float_round_nearest_even ); 790 if ( roundingPrecision == 80 ) goto precision80; 791 if ( roundingPrecision == 64 ) { 792 roundIncrement = LIT64( 0x0000000000000400 ); 793 roundMask = LIT64( 0x00000000000007FF ); 794 } 795 else if ( roundingPrecision == 32 ) { 796 roundIncrement = LIT64( 0x0000008000000000 ); 797 roundMask = LIT64( 0x000000FFFFFFFFFF ); 798 } 799 else { 800 goto precision80; 801 } 802 zSig0 |= ( zSig1 != 0 ); 803 switch (roundingMode) { 804 case float_round_nearest_even: 805 case float_round_ties_away: 806 break; 807 case float_round_to_zero: 808 roundIncrement = 0; 809 break; 810 case float_round_up: 811 roundIncrement = zSign ? 0 : roundMask; 812 break; 813 case float_round_down: 814 roundIncrement = zSign ? 
roundMask : 0; 815 break; 816 default: 817 abort(); 818 } 819 roundBits = zSig0 & roundMask; 820 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 821 if ( ( 0x7FFE < zExp ) 822 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 823 ) { 824 goto overflow; 825 } 826 if ( zExp <= 0 ) { 827 if (status->flush_to_zero) { 828 float_raise(float_flag_output_denormal, status); 829 return packFloatx80(zSign, 0, 0); 830 } 831 isTiny = 832 (status->float_detect_tininess 833 == float_tininess_before_rounding) 834 || ( zExp < 0 ) 835 || ( zSig0 <= zSig0 + roundIncrement ); 836 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 837 zExp = 0; 838 roundBits = zSig0 & roundMask; 839 if (isTiny && roundBits) { 840 float_raise(float_flag_underflow, status); 841 } 842 if (roundBits) { 843 status->float_exception_flags |= float_flag_inexact; 844 } 845 zSig0 += roundIncrement; 846 if ( (int64_t) zSig0 < 0 ) zExp = 1; 847 roundIncrement = roundMask + 1; 848 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 849 roundMask |= roundIncrement; 850 } 851 zSig0 &= ~ roundMask; 852 return packFloatx80( zSign, zExp, zSig0 ); 853 } 854 } 855 if (roundBits) { 856 status->float_exception_flags |= float_flag_inexact; 857 } 858 zSig0 += roundIncrement; 859 if ( zSig0 < roundIncrement ) { 860 ++zExp; 861 zSig0 = LIT64( 0x8000000000000000 ); 862 } 863 roundIncrement = roundMask + 1; 864 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 865 roundMask |= roundIncrement; 866 } 867 zSig0 &= ~ roundMask; 868 if ( zSig0 == 0 ) zExp = 0; 869 return packFloatx80( zSign, zExp, zSig0 ); 870 precision80: 871 switch (roundingMode) { 872 case float_round_nearest_even: 873 case float_round_ties_away: 874 increment = ((int64_t)zSig1 < 0); 875 break; 876 case float_round_to_zero: 877 increment = 0; 878 break; 879 case float_round_up: 880 increment = !zSign && zSig1; 881 break; 882 case float_round_down: 883 increment = zSign && zSig1; 884 break; 885 default: 886 abort(); 887 } 888 if ( 
0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 889 if ( ( 0x7FFE < zExp ) 890 || ( ( zExp == 0x7FFE ) 891 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 892 && increment 893 ) 894 ) { 895 roundMask = 0; 896 overflow: 897 float_raise(float_flag_overflow | float_flag_inexact, status); 898 if ( ( roundingMode == float_round_to_zero ) 899 || ( zSign && ( roundingMode == float_round_up ) ) 900 || ( ! zSign && ( roundingMode == float_round_down ) ) 901 ) { 902 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 903 } 904 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 905 } 906 if ( zExp <= 0 ) { 907 isTiny = 908 (status->float_detect_tininess 909 == float_tininess_before_rounding) 910 || ( zExp < 0 ) 911 || ! increment 912 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 913 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 914 zExp = 0; 915 if (isTiny && zSig1) { 916 float_raise(float_flag_underflow, status); 917 } 918 if (zSig1) { 919 status->float_exception_flags |= float_flag_inexact; 920 } 921 switch (roundingMode) { 922 case float_round_nearest_even: 923 case float_round_ties_away: 924 increment = ((int64_t)zSig1 < 0); 925 break; 926 case float_round_to_zero: 927 increment = 0; 928 break; 929 case float_round_up: 930 increment = !zSign && zSig1; 931 break; 932 case float_round_down: 933 increment = zSign && zSig1; 934 break; 935 default: 936 abort(); 937 } 938 if ( increment ) { 939 ++zSig0; 940 zSig0 &= 941 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 942 if ( (int64_t) zSig0 < 0 ) zExp = 1; 943 } 944 return packFloatx80( zSign, zExp, zSig0 ); 945 } 946 } 947 if (zSig1) { 948 status->float_exception_flags |= float_flag_inexact; 949 } 950 if ( increment ) { 951 ++zSig0; 952 if ( zSig0 == 0 ) { 953 ++zExp; 954 zSig0 = LIT64( 0x8000000000000000 ); 955 } 956 else { 957 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 958 } 959 } 960 else { 961 if ( zSig0 == 0 ) zExp = 0; 962 } 963 return packFloatx80( zSign, zExp, 
zSig0 ); 964 965 } 966 967 /*---------------------------------------------------------------------------- 968 | Takes an abstract floating-point value having sign `zSign', exponent 969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 970 | and returns the proper extended double-precision floating-point value 971 | corresponding to the abstract input. This routine is just like 972 | `roundAndPackFloatx80' except that the input significand does not have to be 973 | normalized. 974 *----------------------------------------------------------------------------*/ 975 976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 977 flag zSign, int32_t zExp, 978 uint64_t zSig0, uint64_t zSig1, 979 float_status *status) 980 { 981 int8_t shiftCount; 982 983 if ( zSig0 == 0 ) { 984 zSig0 = zSig1; 985 zSig1 = 0; 986 zExp -= 64; 987 } 988 shiftCount = countLeadingZeros64( zSig0 ); 989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 990 zExp -= shiftCount; 991 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 992 zSig0, zSig1, status); 993 994 } 995 996 /*---------------------------------------------------------------------------- 997 | Returns the least-significant 64 fraction bits of the quadruple-precision 998 | floating-point value `a'. 999 *----------------------------------------------------------------------------*/ 1000 1001 static inline uint64_t extractFloat128Frac1( float128 a ) 1002 { 1003 1004 return a.low; 1005 1006 } 1007 1008 /*---------------------------------------------------------------------------- 1009 | Returns the most-significant 48 fraction bits of the quadruple-precision 1010 | floating-point value `a'. 
1011 *----------------------------------------------------------------------------*/ 1012 1013 static inline uint64_t extractFloat128Frac0( float128 a ) 1014 { 1015 1016 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1017 1018 } 1019 1020 /*---------------------------------------------------------------------------- 1021 | Returns the exponent bits of the quadruple-precision floating-point value 1022 | `a'. 1023 *----------------------------------------------------------------------------*/ 1024 1025 static inline int32_t extractFloat128Exp( float128 a ) 1026 { 1027 1028 return ( a.high>>48 ) & 0x7FFF; 1029 1030 } 1031 1032 /*---------------------------------------------------------------------------- 1033 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1034 *----------------------------------------------------------------------------*/ 1035 1036 static inline flag extractFloat128Sign( float128 a ) 1037 { 1038 1039 return a.high>>63; 1040 1041 } 1042 1043 /*---------------------------------------------------------------------------- 1044 | Normalizes the subnormal quadruple-precision floating-point value 1045 | represented by the denormalized significand formed by the concatenation of 1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1048 | significand are stored at the location pointed to by `zSig0Ptr', and the 1049 | least significant 64 bits of the normalized significand are stored at the 1050 | location pointed to by `zSig1Ptr'. 
1051 *----------------------------------------------------------------------------*/ 1052 1053 static void 1054 normalizeFloat128Subnormal( 1055 uint64_t aSig0, 1056 uint64_t aSig1, 1057 int32_t *zExpPtr, 1058 uint64_t *zSig0Ptr, 1059 uint64_t *zSig1Ptr 1060 ) 1061 { 1062 int8_t shiftCount; 1063 1064 if ( aSig0 == 0 ) { 1065 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1066 if ( shiftCount < 0 ) { 1067 *zSig0Ptr = aSig1>>( - shiftCount ); 1068 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1069 } 1070 else { 1071 *zSig0Ptr = aSig1<<shiftCount; 1072 *zSig1Ptr = 0; 1073 } 1074 *zExpPtr = - shiftCount - 63; 1075 } 1076 else { 1077 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1078 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1079 *zExpPtr = 1 - shiftCount; 1080 } 1081 1082 } 1083 1084 /*---------------------------------------------------------------------------- 1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1087 | floating-point value, returning the result. After being shifted into the 1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1089 | added together to form the most significant 32 bits of the result. This 1090 | means that any integer portion of `zSig0' will be added into the exponent. 1091 | Since a properly normalized significand will have an integer portion equal 1092 | to 1, the `zExp' input should be 1 less than the desired result exponent 1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1094 | significand. 
1095 *----------------------------------------------------------------------------*/ 1096 1097 static inline float128 1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1099 { 1100 float128 z; 1101 1102 z.low = zSig1; 1103 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1104 return z; 1105 1106 } 1107 1108 /*---------------------------------------------------------------------------- 1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1110 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1111 | and `zSig2', and returns the proper quadruple-precision floating-point value 1112 | corresponding to the abstract input. Ordinarily, the abstract value is 1113 | simply rounded and packed into the quadruple-precision format, with the 1114 | inexact exception raised if the abstract input cannot be represented 1115 | exactly. However, if the abstract value is too large, the overflow and 1116 | inexact exceptions are raised and an infinity or maximal finite value is 1117 | returned. If the abstract value is too small, the input value is rounded to 1118 | a subnormal number, and the underflow and inexact exceptions are raised if 1119 | the abstract input cannot be represented exactly as a subnormal quadruple- 1120 | precision floating-point number. 1121 | The input significand must be normalized or smaller. If the input 1122 | significand is not normalized, `zExp' must be 0; in that case, the result 1123 | returned is a subnormal number, and it must not require rounding. In the 1124 | usual case that the input significand is normalized, `zExp' must be 1 less 1125 | than the ``true'' floating-point exponent. The handling of underflow and 1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1127 *----------------------------------------------------------------------------*/ 1128 1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 1130 uint64_t zSig0, uint64_t zSig1, 1131 uint64_t zSig2, float_status *status) 1132 { 1133 int8_t roundingMode; 1134 flag roundNearestEven, increment, isTiny; 1135 1136 roundingMode = status->float_rounding_mode; 1137 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1138 switch (roundingMode) { 1139 case float_round_nearest_even: 1140 case float_round_ties_away: 1141 increment = ((int64_t)zSig2 < 0); 1142 break; 1143 case float_round_to_zero: 1144 increment = 0; 1145 break; 1146 case float_round_up: 1147 increment = !zSign && zSig2; 1148 break; 1149 case float_round_down: 1150 increment = zSign && zSig2; 1151 break; 1152 default: 1153 abort(); 1154 } 1155 if ( 0x7FFD <= (uint32_t) zExp ) { 1156 if ( ( 0x7FFD < zExp ) 1157 || ( ( zExp == 0x7FFD ) 1158 && eq128( 1159 LIT64( 0x0001FFFFFFFFFFFF ), 1160 LIT64( 0xFFFFFFFFFFFFFFFF ), 1161 zSig0, 1162 zSig1 1163 ) 1164 && increment 1165 ) 1166 ) { 1167 float_raise(float_flag_overflow | float_flag_inexact, status); 1168 if ( ( roundingMode == float_round_to_zero ) 1169 || ( zSign && ( roundingMode == float_round_up ) ) 1170 || ( ! zSign && ( roundingMode == float_round_down ) ) 1171 ) { 1172 return 1173 packFloat128( 1174 zSign, 1175 0x7FFE, 1176 LIT64( 0x0000FFFFFFFFFFFF ), 1177 LIT64( 0xFFFFFFFFFFFFFFFF ) 1178 ); 1179 } 1180 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1181 } 1182 if ( zExp < 0 ) { 1183 if (status->flush_to_zero) { 1184 float_raise(float_flag_output_denormal, status); 1185 return packFloat128(zSign, 0, 0, 0); 1186 } 1187 isTiny = 1188 (status->float_detect_tininess 1189 == float_tininess_before_rounding) 1190 || ( zExp < -1 ) 1191 || ! 
increment 1192 || lt128( 1193 zSig0, 1194 zSig1, 1195 LIT64( 0x0001FFFFFFFFFFFF ), 1196 LIT64( 0xFFFFFFFFFFFFFFFF ) 1197 ); 1198 shift128ExtraRightJamming( 1199 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1200 zExp = 0; 1201 if (isTiny && zSig2) { 1202 float_raise(float_flag_underflow, status); 1203 } 1204 switch (roundingMode) { 1205 case float_round_nearest_even: 1206 case float_round_ties_away: 1207 increment = ((int64_t)zSig2 < 0); 1208 break; 1209 case float_round_to_zero: 1210 increment = 0; 1211 break; 1212 case float_round_up: 1213 increment = !zSign && zSig2; 1214 break; 1215 case float_round_down: 1216 increment = zSign && zSig2; 1217 break; 1218 default: 1219 abort(); 1220 } 1221 } 1222 } 1223 if (zSig2) { 1224 status->float_exception_flags |= float_flag_inexact; 1225 } 1226 if ( increment ) { 1227 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1228 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1229 } 1230 else { 1231 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1232 } 1233 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1234 1235 } 1236 1237 /*---------------------------------------------------------------------------- 1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1240 | returns the proper quadruple-precision floating-point value corresponding 1241 | to the abstract input. This routine is just like `roundAndPackFloat128' 1242 | except that the input significand has fewer bits and does not have to be 1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1244 | point exponent. 
1245 *----------------------------------------------------------------------------*/ 1246 1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1248 uint64_t zSig0, uint64_t zSig1, 1249 float_status *status) 1250 { 1251 int8_t shiftCount; 1252 uint64_t zSig2; 1253 1254 if ( zSig0 == 0 ) { 1255 zSig0 = zSig1; 1256 zSig1 = 0; 1257 zExp -= 64; 1258 } 1259 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1260 if ( 0 <= shiftCount ) { 1261 zSig2 = 0; 1262 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1263 } 1264 else { 1265 shift128ExtraRightJamming( 1266 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1267 } 1268 zExp -= shiftCount; 1269 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1270 1271 } 1272 1273 /*---------------------------------------------------------------------------- 1274 | Returns the result of converting the 32-bit two's complement integer `a' 1275 | to the single-precision floating-point format. The conversion is performed 1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1277 *----------------------------------------------------------------------------*/ 1278 1279 float32 int32_to_float32(int32_t a, float_status *status) 1280 { 1281 flag zSign; 1282 1283 if ( a == 0 ) return float32_zero; 1284 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1285 zSign = ( a < 0 ); 1286 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1287 } 1288 1289 /*---------------------------------------------------------------------------- 1290 | Returns the result of converting the 32-bit two's complement integer `a' 1291 | to the double-precision floating-point format. The conversion is performed 1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1293 *----------------------------------------------------------------------------*/ 1294 1295 float64 int32_to_float64(int32_t a, float_status *status) 1296 { 1297 flag zSign; 1298 uint32_t absA; 1299 int8_t shiftCount; 1300 uint64_t zSig; 1301 1302 if ( a == 0 ) return float64_zero; 1303 zSign = ( a < 0 ); 1304 absA = zSign ? - a : a; 1305 shiftCount = countLeadingZeros32( absA ) + 21; 1306 zSig = absA; 1307 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1308 1309 } 1310 1311 /*---------------------------------------------------------------------------- 1312 | Returns the result of converting the 32-bit two's complement integer `a' 1313 | to the extended double-precision floating-point format. The conversion 1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1315 | Arithmetic. 1316 *----------------------------------------------------------------------------*/ 1317 1318 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1319 { 1320 flag zSign; 1321 uint32_t absA; 1322 int8_t shiftCount; 1323 uint64_t zSig; 1324 1325 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1326 zSign = ( a < 0 ); 1327 absA = zSign ? - a : a; 1328 shiftCount = countLeadingZeros32( absA ) + 32; 1329 zSig = absA; 1330 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1331 1332 } 1333 1334 /*---------------------------------------------------------------------------- 1335 | Returns the result of converting the 32-bit two's complement integer `a' to 1336 | the quadruple-precision floating-point format. The conversion is performed 1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1338 *----------------------------------------------------------------------------*/ 1339 1340 float128 int32_to_float128(int32_t a, float_status *status) 1341 { 1342 flag zSign; 1343 uint32_t absA; 1344 int8_t shiftCount; 1345 uint64_t zSig0; 1346 1347 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1348 zSign = ( a < 0 ); 1349 absA = zSign ? - a : a; 1350 shiftCount = countLeadingZeros32( absA ) + 17; 1351 zSig0 = absA; 1352 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1353 1354 } 1355 1356 /*---------------------------------------------------------------------------- 1357 | Returns the result of converting the 64-bit two's complement integer `a' 1358 | to the single-precision floating-point format. The conversion is performed 1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1360 *----------------------------------------------------------------------------*/ 1361 1362 float32 int64_to_float32(int64_t a, float_status *status) 1363 { 1364 flag zSign; 1365 uint64_t absA; 1366 int8_t shiftCount; 1367 1368 if ( a == 0 ) return float32_zero; 1369 zSign = ( a < 0 ); 1370 absA = zSign ? - a : a; 1371 shiftCount = countLeadingZeros64( absA ) - 40; 1372 if ( 0 <= shiftCount ) { 1373 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1374 } 1375 else { 1376 shiftCount += 7; 1377 if ( shiftCount < 0 ) { 1378 shift64RightJamming( absA, - shiftCount, &absA ); 1379 } 1380 else { 1381 absA <<= shiftCount; 1382 } 1383 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1384 } 1385 1386 } 1387 1388 /*---------------------------------------------------------------------------- 1389 | Returns the result of converting the 64-bit two's complement integer `a' 1390 | to the double-precision floating-point format. The conversion is performed 1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1392 *----------------------------------------------------------------------------*/ 1393 1394 float64 int64_to_float64(int64_t a, float_status *status) 1395 { 1396 flag zSign; 1397 1398 if ( a == 0 ) return float64_zero; 1399 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1400 return packFloat64( 1, 0x43E, 0 ); 1401 } 1402 zSign = ( a < 0 ); 1403 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1404 } 1405 1406 /*---------------------------------------------------------------------------- 1407 | Returns the result of converting the 64-bit two's complement integer `a' 1408 | to the extended double-precision floating-point format. The conversion 1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1410 | Arithmetic. 1411 *----------------------------------------------------------------------------*/ 1412 1413 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1414 { 1415 flag zSign; 1416 uint64_t absA; 1417 int8_t shiftCount; 1418 1419 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1420 zSign = ( a < 0 ); 1421 absA = zSign ? - a : a; 1422 shiftCount = countLeadingZeros64( absA ); 1423 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1424 1425 } 1426 1427 /*---------------------------------------------------------------------------- 1428 | Returns the result of converting the 64-bit two's complement integer `a' to 1429 | the quadruple-precision floating-point format. The conversion is performed 1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1431 *----------------------------------------------------------------------------*/ 1432 1433 float128 int64_to_float128(int64_t a, float_status *status) 1434 { 1435 flag zSign; 1436 uint64_t absA; 1437 int8_t shiftCount; 1438 int32_t zExp; 1439 uint64_t zSig0, zSig1; 1440 1441 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1442 zSign = ( a < 0 ); 1443 absA = zSign ? 
- a : a; 1444 shiftCount = countLeadingZeros64( absA ) + 49; 1445 zExp = 0x406E - shiftCount; 1446 if ( 64 <= shiftCount ) { 1447 zSig1 = 0; 1448 zSig0 = absA; 1449 shiftCount -= 64; 1450 } 1451 else { 1452 zSig1 = absA; 1453 zSig0 = 0; 1454 } 1455 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1456 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1457 1458 } 1459 1460 /*---------------------------------------------------------------------------- 1461 | Returns the result of converting the 64-bit unsigned integer `a' 1462 | to the single-precision floating-point format. The conversion is performed 1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1464 *----------------------------------------------------------------------------*/ 1465 1466 float32 uint64_to_float32(uint64_t a, float_status *status) 1467 { 1468 int shiftcount; 1469 1470 if (a == 0) { 1471 return float32_zero; 1472 } 1473 1474 /* Determine (left) shift needed to put first set bit into bit posn 23 1475 * (since packFloat32() expects the binary point between bits 23 and 22); 1476 * this is the fast case for smallish numbers. 1477 */ 1478 shiftcount = countLeadingZeros64(a) - 40; 1479 if (shiftcount >= 0) { 1480 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1481 } 1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1483 * expects the binary point between bits 30 and 29, hence the + 7. 1484 */ 1485 shiftcount += 7; 1486 if (shiftcount < 0) { 1487 shift64RightJamming(a, -shiftcount, &a); 1488 } else { 1489 a <<= shiftcount; 1490 } 1491 1492 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1493 } 1494 1495 /*---------------------------------------------------------------------------- 1496 | Returns the result of converting the 64-bit unsigned integer `a' 1497 | to the double-precision floating-point format. The conversion is performed 1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1499 *----------------------------------------------------------------------------*/ 1500 1501 float64 uint64_to_float64(uint64_t a, float_status *status) 1502 { 1503 int exp = 0x43C; 1504 int shiftcount; 1505 1506 if (a == 0) { 1507 return float64_zero; 1508 } 1509 1510 shiftcount = countLeadingZeros64(a) - 1; 1511 if (shiftcount < 0) { 1512 shift64RightJamming(a, -shiftcount, &a); 1513 } else { 1514 a <<= shiftcount; 1515 } 1516 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1517 } 1518 1519 /*---------------------------------------------------------------------------- 1520 | Returns the result of converting the 64-bit unsigned integer `a' 1521 | to the quadruple-precision floating-point format. The conversion is performed 1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1523 *----------------------------------------------------------------------------*/ 1524 1525 float128 uint64_to_float128(uint64_t a, float_status *status) 1526 { 1527 if (a == 0) { 1528 return float128_zero; 1529 } 1530 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1531 } 1532 1533 /*---------------------------------------------------------------------------- 1534 | Returns the result of converting the single-precision floating-point value 1535 | `a' to the 32-bit two's complement integer format. The conversion is 1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1537 | Arithmetic---which means in particular that the conversion is rounded 1538 | according to the current rounding mode. If `a' is a NaN, the largest 1539 | positive integer is returned. Otherwise, if the conversion overflows, the 1540 | largest integer with the same sign as `a' is returned. 
1541 *----------------------------------------------------------------------------*/ 1542 1543 int32_t float32_to_int32(float32 a, float_status *status) 1544 { 1545 flag aSign; 1546 int aExp; 1547 int shiftCount; 1548 uint32_t aSig; 1549 uint64_t aSig64; 1550 1551 a = float32_squash_input_denormal(a, status); 1552 aSig = extractFloat32Frac( a ); 1553 aExp = extractFloat32Exp( a ); 1554 aSign = extractFloat32Sign( a ); 1555 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1556 if ( aExp ) aSig |= 0x00800000; 1557 shiftCount = 0xAF - aExp; 1558 aSig64 = aSig; 1559 aSig64 <<= 32; 1560 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1561 return roundAndPackInt32(aSign, aSig64, status); 1562 1563 } 1564 1565 /*---------------------------------------------------------------------------- 1566 | Returns the result of converting the single-precision floating-point value 1567 | `a' to the 32-bit two's complement integer format. The conversion is 1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1569 | Arithmetic, except that the conversion is always rounded toward zero. 1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1571 | the conversion overflows, the largest integer with the same sign as `a' is 1572 | returned. 1573 *----------------------------------------------------------------------------*/ 1574 1575 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 1576 { 1577 flag aSign; 1578 int aExp; 1579 int shiftCount; 1580 uint32_t aSig; 1581 int32_t z; 1582 a = float32_squash_input_denormal(a, status); 1583 1584 aSig = extractFloat32Frac( a ); 1585 aExp = extractFloat32Exp( a ); 1586 aSign = extractFloat32Sign( a ); 1587 shiftCount = aExp - 0x9E; 1588 if ( 0 <= shiftCount ) { 1589 if ( float32_val(a) != 0xCF000000 ) { 1590 float_raise(float_flag_invalid, status); 1591 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1592 } 1593 return (int32_t) 0x80000000; 1594 } 1595 else if ( aExp <= 0x7E ) { 1596 if (aExp | aSig) { 1597 status->float_exception_flags |= float_flag_inexact; 1598 } 1599 return 0; 1600 } 1601 aSig = ( aSig | 0x00800000 )<<8; 1602 z = aSig>>( - shiftCount ); 1603 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1604 status->float_exception_flags |= float_flag_inexact; 1605 } 1606 if ( aSign ) z = - z; 1607 return z; 1608 1609 } 1610 1611 /*---------------------------------------------------------------------------- 1612 | Returns the result of converting the single-precision floating-point value 1613 | `a' to the 16-bit two's complement integer format. The conversion is 1614 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1615 | Arithmetic, except that the conversion is always rounded toward zero. 1616 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1617 | the conversion overflows, the largest integer with the same sign as `a' is 1618 | returned. 1619 *----------------------------------------------------------------------------*/ 1620 1621 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1622 { 1623 flag aSign; 1624 int aExp; 1625 int shiftCount; 1626 uint32_t aSig; 1627 int32_t z; 1628 1629 aSig = extractFloat32Frac( a ); 1630 aExp = extractFloat32Exp( a ); 1631 aSign = extractFloat32Sign( a ); 1632 shiftCount = aExp - 0x8E; 1633 if ( 0 <= shiftCount ) { 1634 if ( float32_val(a) != 0xC7000000 ) { 1635 float_raise(float_flag_invalid, status); 1636 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1637 return 0x7FFF; 1638 } 1639 } 1640 return (int32_t) 0xffff8000; 1641 } 1642 else if ( aExp <= 0x7E ) { 1643 if ( aExp | aSig ) { 1644 status->float_exception_flags |= float_flag_inexact; 1645 } 1646 return 0; 1647 } 1648 shiftCount -= 0x10; 1649 aSig = ( aSig | 0x00800000 )<<8; 1650 z = aSig>>( - shiftCount ); 1651 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1652 status->float_exception_flags |= float_flag_inexact; 1653 } 1654 if ( aSign ) { 1655 z = - z; 1656 } 1657 return z; 1658 1659 } 1660 1661 /*---------------------------------------------------------------------------- 1662 | Returns the result of converting the single-precision floating-point value 1663 | `a' to the 64-bit two's complement integer format. The conversion is 1664 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1665 | Arithmetic---which means in particular that the conversion is rounded 1666 | according to the current rounding mode. If `a' is a NaN, the largest 1667 | positive integer is returned. Otherwise, if the conversion overflows, the 1668 | largest integer with the same sign as `a' is returned. 1669 *----------------------------------------------------------------------------*/ 1670 1671 int64_t float32_to_int64(float32 a, float_status *status) 1672 { 1673 flag aSign; 1674 int aExp; 1675 int shiftCount; 1676 uint32_t aSig; 1677 uint64_t aSig64, aSigExtra; 1678 a = float32_squash_input_denormal(a, status); 1679 1680 aSig = extractFloat32Frac( a ); 1681 aExp = extractFloat32Exp( a ); 1682 aSign = extractFloat32Sign( a ); 1683 shiftCount = 0xBE - aExp; 1684 if ( shiftCount < 0 ) { 1685 float_raise(float_flag_invalid, status); 1686 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1687 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1688 } 1689 return (int64_t) LIT64( 0x8000000000000000 ); 1690 } 1691 if ( aExp ) aSig |= 0x00800000; 1692 aSig64 = aSig; 1693 aSig64 <<= 40; 1694 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1695 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1696 1697 } 1698 1699 /*---------------------------------------------------------------------------- 1700 | Returns the result of converting the single-precision floating-point value 1701 | `a' to the 64-bit unsigned integer format. The conversion is 1702 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1703 | Arithmetic---which means in particular that the conversion is rounded 1704 | according to the current rounding mode. If `a' is a NaN, the largest 1705 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1706 | largest unsigned integer is returned. If the 'a' is negative, the result 1707 | is rounded and zero is returned; values that do not round to zero will 1708 | raise the inexact exception flag. 
1709 *----------------------------------------------------------------------------*/ 1710 1711 uint64_t float32_to_uint64(float32 a, float_status *status) 1712 { 1713 flag aSign; 1714 int aExp; 1715 int shiftCount; 1716 uint32_t aSig; 1717 uint64_t aSig64, aSigExtra; 1718 a = float32_squash_input_denormal(a, status); 1719 1720 aSig = extractFloat32Frac(a); 1721 aExp = extractFloat32Exp(a); 1722 aSign = extractFloat32Sign(a); 1723 if ((aSign) && (aExp > 126)) { 1724 float_raise(float_flag_invalid, status); 1725 if (float32_is_any_nan(a)) { 1726 return LIT64(0xFFFFFFFFFFFFFFFF); 1727 } else { 1728 return 0; 1729 } 1730 } 1731 shiftCount = 0xBE - aExp; 1732 if (aExp) { 1733 aSig |= 0x00800000; 1734 } 1735 if (shiftCount < 0) { 1736 float_raise(float_flag_invalid, status); 1737 return LIT64(0xFFFFFFFFFFFFFFFF); 1738 } 1739 1740 aSig64 = aSig; 1741 aSig64 <<= 40; 1742 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1743 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1744 } 1745 1746 /*---------------------------------------------------------------------------- 1747 | Returns the result of converting the single-precision floating-point value 1748 | `a' to the 64-bit unsigned integer format. The conversion is 1749 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1750 | Arithmetic, except that the conversion is always rounded toward zero. If 1751 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1752 | conversion overflows, the largest unsigned integer is returned. If the 1753 | 'a' is negative, the result is rounded and zero is returned; values that do 1754 | not round to zero will raise the inexact flag. 
1755 *----------------------------------------------------------------------------*/ 1756 1757 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 1758 { 1759 signed char current_rounding_mode = status->float_rounding_mode; 1760 set_float_rounding_mode(float_round_to_zero, status); 1761 int64_t v = float32_to_uint64(a, status); 1762 set_float_rounding_mode(current_rounding_mode, status); 1763 return v; 1764 } 1765 1766 /*---------------------------------------------------------------------------- 1767 | Returns the result of converting the single-precision floating-point value 1768 | `a' to the 64-bit two's complement integer format. The conversion is 1769 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1770 | Arithmetic, except that the conversion is always rounded toward zero. If 1771 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1772 | conversion overflows, the largest integer with the same sign as `a' is 1773 | returned. 1774 *----------------------------------------------------------------------------*/ 1775 1776 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 1777 { 1778 flag aSign; 1779 int aExp; 1780 int shiftCount; 1781 uint32_t aSig; 1782 uint64_t aSig64; 1783 int64_t z; 1784 a = float32_squash_input_denormal(a, status); 1785 1786 aSig = extractFloat32Frac( a ); 1787 aExp = extractFloat32Exp( a ); 1788 aSign = extractFloat32Sign( a ); 1789 shiftCount = aExp - 0xBE; 1790 if ( 0 <= shiftCount ) { 1791 if ( float32_val(a) != 0xDF000000 ) { 1792 float_raise(float_flag_invalid, status); 1793 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1794 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1795 } 1796 } 1797 return (int64_t) LIT64( 0x8000000000000000 ); 1798 } 1799 else if ( aExp <= 0x7E ) { 1800 if (aExp | aSig) { 1801 status->float_exception_flags |= float_flag_inexact; 1802 } 1803 return 0; 1804 } 1805 aSig64 = aSig | 0x00800000; 1806 aSig64 <<= 40; 1807 z = aSig64>>( - shiftCount ); 1808 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1809 status->float_exception_flags |= float_flag_inexact; 1810 } 1811 if ( aSign ) z = - z; 1812 return z; 1813 1814 } 1815 1816 /*---------------------------------------------------------------------------- 1817 | Returns the result of converting the single-precision floating-point value 1818 | `a' to the double-precision floating-point format. The conversion is 1819 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1820 | Arithmetic. 1821 *----------------------------------------------------------------------------*/ 1822 1823 float64 float32_to_float64(float32 a, float_status *status) 1824 { 1825 flag aSign; 1826 int aExp; 1827 uint32_t aSig; 1828 a = float32_squash_input_denormal(a, status); 1829 1830 aSig = extractFloat32Frac( a ); 1831 aExp = extractFloat32Exp( a ); 1832 aSign = extractFloat32Sign( a ); 1833 if ( aExp == 0xFF ) { 1834 if (aSig) { 1835 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1836 } 1837 return packFloat64( aSign, 0x7FF, 0 ); 1838 } 1839 if ( aExp == 0 ) { 1840 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1841 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1842 --aExp; 1843 } 1844 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1845 1846 } 1847 1848 /*---------------------------------------------------------------------------- 1849 | Returns the result of converting the single-precision floating-point value 1850 | `a' to the extended double-precision floating-point format. 
The conversion 1851 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1852 | Arithmetic. 1853 *----------------------------------------------------------------------------*/ 1854 1855 floatx80 float32_to_floatx80(float32 a, float_status *status) 1856 { 1857 flag aSign; 1858 int aExp; 1859 uint32_t aSig; 1860 1861 a = float32_squash_input_denormal(a, status); 1862 aSig = extractFloat32Frac( a ); 1863 aExp = extractFloat32Exp( a ); 1864 aSign = extractFloat32Sign( a ); 1865 if ( aExp == 0xFF ) { 1866 if (aSig) { 1867 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1868 } 1869 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1870 } 1871 if ( aExp == 0 ) { 1872 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1873 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1874 } 1875 aSig |= 0x00800000; 1876 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1877 1878 } 1879 1880 /*---------------------------------------------------------------------------- 1881 | Returns the result of converting the single-precision floating-point value 1882 | `a' to the double-precision floating-point format. The conversion is 1883 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1884 | Arithmetic. 
1885 *----------------------------------------------------------------------------*/ 1886 1887 float128 float32_to_float128(float32 a, float_status *status) 1888 { 1889 flag aSign; 1890 int aExp; 1891 uint32_t aSig; 1892 1893 a = float32_squash_input_denormal(a, status); 1894 aSig = extractFloat32Frac( a ); 1895 aExp = extractFloat32Exp( a ); 1896 aSign = extractFloat32Sign( a ); 1897 if ( aExp == 0xFF ) { 1898 if (aSig) { 1899 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1900 } 1901 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1902 } 1903 if ( aExp == 0 ) { 1904 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1905 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1906 --aExp; 1907 } 1908 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1909 1910 } 1911 1912 /*---------------------------------------------------------------------------- 1913 | Rounds the single-precision floating-point value `a' to an integer, and 1914 | returns the result as a single-precision floating-point value. The 1915 | operation is performed according to the IEC/IEEE Standard for Binary 1916 | Floating-Point Arithmetic. 
1917 *----------------------------------------------------------------------------*/ 1918 1919 float32 float32_round_to_int(float32 a, float_status *status) 1920 { 1921 flag aSign; 1922 int aExp; 1923 uint32_t lastBitMask, roundBitsMask; 1924 uint32_t z; 1925 a = float32_squash_input_denormal(a, status); 1926 1927 aExp = extractFloat32Exp( a ); 1928 if ( 0x96 <= aExp ) { 1929 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1930 return propagateFloat32NaN(a, a, status); 1931 } 1932 return a; 1933 } 1934 if ( aExp <= 0x7E ) { 1935 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1936 status->float_exception_flags |= float_flag_inexact; 1937 aSign = extractFloat32Sign( a ); 1938 switch (status->float_rounding_mode) { 1939 case float_round_nearest_even: 1940 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1941 return packFloat32( aSign, 0x7F, 0 ); 1942 } 1943 break; 1944 case float_round_ties_away: 1945 if (aExp == 0x7E) { 1946 return packFloat32(aSign, 0x7F, 0); 1947 } 1948 break; 1949 case float_round_down: 1950 return make_float32(aSign ? 0xBF800000 : 0); 1951 case float_round_up: 1952 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 1953 } 1954 return packFloat32( aSign, 0, 0 ); 1955 } 1956 lastBitMask = 1; 1957 lastBitMask <<= 0x96 - aExp; 1958 roundBitsMask = lastBitMask - 1; 1959 z = float32_val(a); 1960 switch (status->float_rounding_mode) { 1961 case float_round_nearest_even: 1962 z += lastBitMask>>1; 1963 if ((z & roundBitsMask) == 0) { 1964 z &= ~lastBitMask; 1965 } 1966 break; 1967 case float_round_ties_away: 1968 z += lastBitMask >> 1; 1969 break; 1970 case float_round_to_zero: 1971 break; 1972 case float_round_up: 1973 if (!extractFloat32Sign(make_float32(z))) { 1974 z += roundBitsMask; 1975 } 1976 break; 1977 case float_round_down: 1978 if (extractFloat32Sign(make_float32(z))) { 1979 z += roundBitsMask; 1980 } 1981 break; 1982 default: 1983 abort(); 1984 } 1985 z &= ~ roundBitsMask; 1986 if (z != float32_val(a)) { 1987 status->float_exception_flags |= float_flag_inexact; 1988 } 1989 return make_float32(z); 1990 1991 } 1992 1993 /*---------------------------------------------------------------------------- 1994 | Returns the result of adding the absolute values of the single-precision 1995 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1996 | before being returned. `zSign' is ignored if the result is a NaN. 1997 | The addition is performed according to the IEC/IEEE Standard for Binary 1998 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    /* Shift both fractions up by 6 bits; the hidden bit (0x00800000) then
     * corresponds to 0x20000000 below.
     */
    aSig <<= 6;
    bSig <<= 6;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b to it. */
        if ( aExp == 0xFF ) {
            if (aSig) {
                return propagateFloat32NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b is subnormal (no hidden bit): shift one place less. */
            --expDiff;
        }
        else {
            bSig |= 0x20000000;
        }
        shift32RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a to it. */
        if ( bExp == 0xFF ) {
            if (bSig) {
                return propagateFloat32NaN(a, b, status);
            }
            return packFloat32( zSign, 0xFF, 0 );
        }
        if ( aExp == 0 ) {
            /* a is subnormal (no hidden bit): shift one place less. */
            ++expDiff;
        }
        else {
            aSig |= 0x20000000;
        }
        shift32RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0xFF ) {
            if (aSig | bSig) {
                return propagateFloat32NaN(a, b, status);
            }
            return a;
        }
        if ( aExp == 0 ) {
            /* Both operands are zero or subnormal. */
            if (status->flush_to_zero) {
                if (aSig | bSig) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat32(zSign, 0, 0);
            }
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
        }
        /* 0x40000000 is the sum of the two hidden bits (2 * 0x20000000). */
        zSig = 0x40000000 + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    /* Unequal exponents: OR in the one hidden bit not yet applied.  Only the
     * sum aSig + bSig is used from here on.
     */
    aSig |= 0x20000000;
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (int32_t) zSig < 0 ) {
        /* The sum carried into the top bit: undo the pre-shift. */
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat32(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the
absolute values of the single-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    /* Shift both fractions up by 7 bits; the hidden bit (0x00800000) then
     * corresponds to 0x40000000 below.
     */
    aSig <<= 7;
    bSig <<= 7;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0xFF ) {
        if (aSig | bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinity minus infinity of the same magnitude is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only when rounding down. */
    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return packFloat32( zSign ^ 1, 0xFF, 0 );
    }
    if ( aExp == 0 ) {
        /* a is subnormal (no hidden bit): shift one place less. */
        ++expDiff;
    }
    else {
        aSig |= 0x40000000;
    }
    shift32RightJamming( aSig, - expDiff, &aSig );
    bSig |= 0x40000000;
 bBigger:
    /* |b| > |a|: the difference takes the opposite sign. */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* b is subnormal (no hidden bit): shift one place less. */
        --expDiff;
    }
    else {
        bSig |= 0x40000000;
    }
    shift32RightJamming( bSig, expDiff, &bSig );
    aSig |= 0x40000000;
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the single-precision floating-point values `a'
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_add(float32 a, float32 b, float_status *status)
{
    flag aSign, bSign;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSign = extractFloat32Sign( a );
    bSign = extractFloat32Sign( b );
    /* Same sign: add the magnitudes.  Opposite signs: subtract them. */
    if ( aSign == bSign ) {
        return addFloat32Sigs(a, b, aSign, status);
    }
    else {
        return subFloat32Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the single-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
2188 *----------------------------------------------------------------------------*/ 2189 2190 float32 float32_sub(float32 a, float32 b, float_status *status) 2191 { 2192 flag aSign, bSign; 2193 a = float32_squash_input_denormal(a, status); 2194 b = float32_squash_input_denormal(b, status); 2195 2196 aSign = extractFloat32Sign( a ); 2197 bSign = extractFloat32Sign( b ); 2198 if ( aSign == bSign ) { 2199 return subFloat32Sigs(a, b, aSign, status); 2200 } 2201 else { 2202 return addFloat32Sigs(a, b, aSign, status); 2203 } 2204 2205 } 2206 2207 /*---------------------------------------------------------------------------- 2208 | Returns the result of multiplying the single-precision floating-point values 2209 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2210 | for Binary Floating-Point Arithmetic. 2211 *----------------------------------------------------------------------------*/ 2212 2213 float32 float32_mul(float32 a, float32 b, float_status *status) 2214 { 2215 flag aSign, bSign, zSign; 2216 int aExp, bExp, zExp; 2217 uint32_t aSig, bSig; 2218 uint64_t zSig64; 2219 uint32_t zSig; 2220 2221 a = float32_squash_input_denormal(a, status); 2222 b = float32_squash_input_denormal(b, status); 2223 2224 aSig = extractFloat32Frac( a ); 2225 aExp = extractFloat32Exp( a ); 2226 aSign = extractFloat32Sign( a ); 2227 bSig = extractFloat32Frac( b ); 2228 bExp = extractFloat32Exp( b ); 2229 bSign = extractFloat32Sign( b ); 2230 zSign = aSign ^ bSign; 2231 if ( aExp == 0xFF ) { 2232 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2233 return propagateFloat32NaN(a, b, status); 2234 } 2235 if ( ( bExp | bSig ) == 0 ) { 2236 float_raise(float_flag_invalid, status); 2237 return float32_default_nan(status); 2238 } 2239 return packFloat32( zSign, 0xFF, 0 ); 2240 } 2241 if ( bExp == 0xFF ) { 2242 if (bSig) { 2243 return propagateFloat32NaN(a, b, status); 2244 } 2245 if ( ( aExp | aSig ) == 0 ) { 2246 float_raise(float_flag_invalid, status); 2247 return 
float32_default_nan(status); 2248 } 2249 return packFloat32( zSign, 0xFF, 0 ); 2250 } 2251 if ( aExp == 0 ) { 2252 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2253 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2254 } 2255 if ( bExp == 0 ) { 2256 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2257 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2258 } 2259 zExp = aExp + bExp - 0x7F; 2260 aSig = ( aSig | 0x00800000 )<<7; 2261 bSig = ( bSig | 0x00800000 )<<8; 2262 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2263 zSig = zSig64; 2264 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2265 zSig <<= 1; 2266 --zExp; 2267 } 2268 return roundAndPackFloat32(zSign, zExp, zSig, status); 2269 2270 } 2271 2272 /*---------------------------------------------------------------------------- 2273 | Returns the result of dividing the single-precision floating-point value `a' 2274 | by the corresponding value `b'. The operation is performed according to the 2275 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2276 *----------------------------------------------------------------------------*/ 2277 2278 float32 float32_div(float32 a, float32 b, float_status *status) 2279 { 2280 flag aSign, bSign, zSign; 2281 int aExp, bExp, zExp; 2282 uint32_t aSig, bSig, zSig; 2283 a = float32_squash_input_denormal(a, status); 2284 b = float32_squash_input_denormal(b, status); 2285 2286 aSig = extractFloat32Frac( a ); 2287 aExp = extractFloat32Exp( a ); 2288 aSign = extractFloat32Sign( a ); 2289 bSig = extractFloat32Frac( b ); 2290 bExp = extractFloat32Exp( b ); 2291 bSign = extractFloat32Sign( b ); 2292 zSign = aSign ^ bSign; 2293 if ( aExp == 0xFF ) { 2294 if (aSig) { 2295 return propagateFloat32NaN(a, b, status); 2296 } 2297 if ( bExp == 0xFF ) { 2298 if (bSig) { 2299 return propagateFloat32NaN(a, b, status); 2300 } 2301 float_raise(float_flag_invalid, status); 2302 return float32_default_nan(status); 2303 } 2304 return packFloat32( zSign, 0xFF, 0 ); 2305 } 2306 if ( bExp == 0xFF ) { 2307 if (bSig) { 2308 return propagateFloat32NaN(a, b, status); 2309 } 2310 return packFloat32( zSign, 0, 0 ); 2311 } 2312 if ( bExp == 0 ) { 2313 if ( bSig == 0 ) { 2314 if ( ( aExp | aSig ) == 0 ) { 2315 float_raise(float_flag_invalid, status); 2316 return float32_default_nan(status); 2317 } 2318 float_raise(float_flag_divbyzero, status); 2319 return packFloat32( zSign, 0xFF, 0 ); 2320 } 2321 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2322 } 2323 if ( aExp == 0 ) { 2324 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2325 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2326 } 2327 zExp = aExp - bExp + 0x7D; 2328 aSig = ( aSig | 0x00800000 )<<7; 2329 bSig = ( bSig | 0x00800000 )<<8; 2330 if ( bSig <= ( aSig + aSig ) ) { 2331 aSig >>= 1; 2332 ++zExp; 2333 } 2334 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2335 if ( ( zSig & 0x3F ) == 0 ) { 2336 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2337 } 2338 return roundAndPackFloat32(zSign, zExp, zSig, status); 2339 2340 } 2341 2342 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Remainder of an infinite dividend is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Finite a with infinite b: the result is a itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: a single 64-by-32-bit division is enough. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* Exponent gap below -1: a is returned unchanged. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce expDiff by 62 per loop iteration. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Back the estimate off by 2, clamping at zero. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig until the remainder goes negative, remembering
     * the last value seen before each subtraction.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Choose between the two candidates: the sign of their sum decides, and
     * when the sum is zero an odd q selects the earlier one.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);
    c = float32_squash_input_denormal(c, status);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* True when the product is (zero * infinity) in either operand order. */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
    }

    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat32
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x00800000) << 7;
            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    /* Neither factor is zero here, so a zero exponent means subnormal. */
    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig, status);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Put c's significand in the same layout as pSig64: fraction shifted up
     * by 62 - 23 bits with the explicit bit set at position 62.
     */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    if (flags & float_muladd_halve_result) {
        zExp--;
    }

    /* Reduce to a 32-bit significand, jamming the discarded low bits. */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64, status);
}


/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_sqrt(float32 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint32_t aSig, zSig;
    uint64_t rem, term;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* +infinity is returned as is; -infinity is invalid. */
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned as is; any other negative input is invalid. */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float32_zero;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* Only when the low seven bits of the estimate are 5 or less is the
     * estimate corrected against the exact remainder below.
     */
    if ( ( zSig & 0x7F ) <= 5 ) {
        if ( zSig < 2 ) {
            /* The "+ 2" above wrapped around 32 bits. */
            zSig = 0x7FFFFFFF;
            goto roundAndPack;
        }
        aSig >>= aExp & 1;
        term = ( (uint64_t) zSig ) * zSig;
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
        /* Step the root down until the remainder is non-negative. */
        while ( (int64_t) rem < 0 ) {
            --zSig;
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
        }
        /* A non-zero remainder becomes a sticky bit. */
        zSig |= ( rem != 0 );
    }
    shift32RightJamming( zSig, 1, &zSig );
 roundAndPack:
    return roundAndPackFloat32(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients for identity 2 above: entry n (1-based, as
 * numbered in the trailing comments) is the float64 bit pattern of 1/n!.
 * float32_exp2() consumes them in order, pairing entry n with x^n.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

/* Computes 2**a for a single-precision input.
 *
 * Special cases handled before the series:
 *   - NaN input: result of propagateFloat32NaN(a, float32_zero, status);
 *   - +infinity: returned unchanged; -infinity: float32_zero;
 *   - zero (exponent and fraction both 0): float32_one, no flag raised.
 * Every other input raises float_flag_inexact unconditionally and is
 * evaluated in float64 as 1 + sum(x^n / n!) for n = 1..15 with
 * x = a * float64_ln2, then converted back to float32.
 */
float32 float32_exp2(float32 a, float_status *status)
{
    flag aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        /* Exponent all ones: NaN if the fraction is non-zero, else infinity. */
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* Only an exact zero short-circuits; a non-zero subnormal falls
         * through to the series evaluation below.
         */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* xn holds x^(i+1) at the top of iteration i; r accumulates the sum,
     * starting from the constant term 1.
     */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases, in the order the code tests them:
|   - zero of either sign: returns packFloat32( 1, 0xFF, 0 ) (sign set,
|     exponent all ones, zero fraction); no flag is raised here;
|   - any other input with the sign bit set, including a NaN whose sign bit
|     is set: raises invalid and returns the default NaN, because the sign
|     test precedes the NaN test;
|   - NaN with clear sign: propagateFloat32NaN(a, float32_zero, status);
|   - +infinity: returned unchanged.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    flag aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    /* Remove the bias and make the implicit leading 1 explicit (bit 23).
     * The unbiased exponent seeds the result above bit 23; the loop below
     * fills in bits 22..0.
     */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    /* One result bit per iteration, most significant first: square the
     * significand (keeping 23 fraction bits); if the square reached bit 24
     * (i.e. is >= 2 in this fixed-point scale), halve it and set bit i.
     */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    /* zSig was built as a two's-complement value; pass its magnitude and
     * carry the sign separately in zSign.
     */
    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float32_eq(float32 a, float32 b, float_status *status)
{
    uint32_t av, bv;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    /* NaN test: exponent all ones with a non-zero fraction. */
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    av = float32_val(a);
    bv = float32_val(b);
    /* Equal if the bit patterns match, or if both are zeros of either sign
     * (nothing left once the sign bit is shifted out of av | bv).
     */
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
}

/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise. The invalid
| exception is raised if either operand is a NaN. The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2888 *----------------------------------------------------------------------------*/ 2889 2890 int float32_le(float32 a, float32 b, float_status *status) 2891 { 2892 flag aSign, bSign; 2893 uint32_t av, bv; 2894 a = float32_squash_input_denormal(a, status); 2895 b = float32_squash_input_denormal(b, status); 2896 2897 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2898 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2899 ) { 2900 float_raise(float_flag_invalid, status); 2901 return 0; 2902 } 2903 aSign = extractFloat32Sign( a ); 2904 bSign = extractFloat32Sign( b ); 2905 av = float32_val(a); 2906 bv = float32_val(b); 2907 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2908 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2909 2910 } 2911 2912 /*---------------------------------------------------------------------------- 2913 | Returns 1 if the single-precision floating-point value `a' is less than 2914 | the corresponding value `b', and 0 otherwise. The invalid exception is 2915 | raised if either operand is a NaN. The comparison is performed according 2916 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2917 *----------------------------------------------------------------------------*/ 2918 2919 int float32_lt(float32 a, float32 b, float_status *status) 2920 { 2921 flag aSign, bSign; 2922 uint32_t av, bv; 2923 a = float32_squash_input_denormal(a, status); 2924 b = float32_squash_input_denormal(b, status); 2925 2926 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2927 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2928 ) { 2929 float_raise(float_flag_invalid, status); 2930 return 0; 2931 } 2932 aSign = extractFloat32Sign( a ); 2933 bSign = extractFloat32Sign( b ); 2934 av = float32_val(a); 2935 bv = float32_val(b); 2936 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2937 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2938 2939 } 2940 2941 /*---------------------------------------------------------------------------- 2942 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2943 | be compared, and 0 otherwise. The invalid exception is raised if either 2944 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2945 | Standard for Binary Floating-Point Arithmetic. 2946 *----------------------------------------------------------------------------*/ 2947 2948 int float32_unordered(float32 a, float32 b, float_status *status) 2949 { 2950 a = float32_squash_input_denormal(a, status); 2951 b = float32_squash_input_denormal(b, status); 2952 2953 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2954 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2955 ) { 2956 float_raise(float_flag_invalid, status); 2957 return 1; 2958 } 2959 return 0; 2960 } 2961 2962 /*---------------------------------------------------------------------------- 2963 | Returns 1 if the single-precision floating-point value `a' is equal to 2964 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2965 | exception. 
The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float32_eq_quiet(float32 a, float32 b, float_status *status)
{
    uint32_t bitsA, bitsB;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    /* NaN operand: never equal; only a signaling NaN raises invalid. */
    if ((extractFloat32Exp(a) == 0xFF && extractFloat32Frac(a))
        || (extractFloat32Exp(b) == 0xFF && extractFloat32Frac(b))) {
        if (float32_is_signaling_nan(a, status)
            || float32_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    bitsA = float32_val(a);
    bitsB = float32_val(b);
    if (bitsA == bitsB) {
        return 1;
    }
    /* Differing encodings are still equal when both are zeros. */
    return (uint32_t)((bitsA | bitsB) << 1) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
| cause an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2992 *----------------------------------------------------------------------------*/ 2993 2994 int float32_le_quiet(float32 a, float32 b, float_status *status) 2995 { 2996 flag aSign, bSign; 2997 uint32_t av, bv; 2998 a = float32_squash_input_denormal(a, status); 2999 b = float32_squash_input_denormal(b, status); 3000 3001 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3002 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3003 ) { 3004 if (float32_is_signaling_nan(a, status) 3005 || float32_is_signaling_nan(b, status)) { 3006 float_raise(float_flag_invalid, status); 3007 } 3008 return 0; 3009 } 3010 aSign = extractFloat32Sign( a ); 3011 bSign = extractFloat32Sign( b ); 3012 av = float32_val(a); 3013 bv = float32_val(b); 3014 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3015 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3016 3017 } 3018 3019 /*---------------------------------------------------------------------------- 3020 | Returns 1 if the single-precision floating-point value `a' is less than 3021 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3022 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3023 | Standard for Binary Floating-Point Arithmetic. 
3024 *----------------------------------------------------------------------------*/ 3025 3026 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3027 { 3028 flag aSign, bSign; 3029 uint32_t av, bv; 3030 a = float32_squash_input_denormal(a, status); 3031 b = float32_squash_input_denormal(b, status); 3032 3033 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3034 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3035 ) { 3036 if (float32_is_signaling_nan(a, status) 3037 || float32_is_signaling_nan(b, status)) { 3038 float_raise(float_flag_invalid, status); 3039 } 3040 return 0; 3041 } 3042 aSign = extractFloat32Sign( a ); 3043 bSign = extractFloat32Sign( b ); 3044 av = float32_val(a); 3045 bv = float32_val(b); 3046 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3047 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3048 3049 } 3050 3051 /*---------------------------------------------------------------------------- 3052 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3053 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3054 | comparison is performed according to the IEC/IEEE Standard for Binary 3055 | Floating-Point Arithmetic. 
3056 *----------------------------------------------------------------------------*/ 3057 3058 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3059 { 3060 a = float32_squash_input_denormal(a, status); 3061 b = float32_squash_input_denormal(b, status); 3062 3063 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3064 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3065 ) { 3066 if (float32_is_signaling_nan(a, status) 3067 || float32_is_signaling_nan(b, status)) { 3068 float_raise(float_flag_invalid, status); 3069 } 3070 return 1; 3071 } 3072 return 0; 3073 } 3074 3075 /*---------------------------------------------------------------------------- 3076 | Returns the result of converting the double-precision floating-point value 3077 | `a' to the 32-bit two's complement integer format. The conversion is 3078 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3079 | Arithmetic---which means in particular that the conversion is rounded 3080 | according to the current rounding mode. If `a' is a NaN, the largest 3081 | positive integer is returned. Otherwise, if the conversion overflows, the 3082 | largest integer with the same sign as `a' is returned. 
3083 *----------------------------------------------------------------------------*/ 3084 3085 int32_t float64_to_int32(float64 a, float_status *status) 3086 { 3087 flag aSign; 3088 int aExp; 3089 int shiftCount; 3090 uint64_t aSig; 3091 a = float64_squash_input_denormal(a, status); 3092 3093 aSig = extractFloat64Frac( a ); 3094 aExp = extractFloat64Exp( a ); 3095 aSign = extractFloat64Sign( a ); 3096 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3097 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3098 shiftCount = 0x42C - aExp; 3099 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3100 return roundAndPackInt32(aSign, aSig, status); 3101 3102 } 3103 3104 /*---------------------------------------------------------------------------- 3105 | Returns the result of converting the double-precision floating-point value 3106 | `a' to the 32-bit two's complement integer format. The conversion is 3107 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3108 | Arithmetic, except that the conversion is always rounded toward zero. 3109 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3110 | the conversion overflows, the largest integer with the same sign as `a' is 3111 | returned. 
3112 *----------------------------------------------------------------------------*/ 3113 3114 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3115 { 3116 flag aSign; 3117 int aExp; 3118 int shiftCount; 3119 uint64_t aSig, savedASig; 3120 int32_t z; 3121 a = float64_squash_input_denormal(a, status); 3122 3123 aSig = extractFloat64Frac( a ); 3124 aExp = extractFloat64Exp( a ); 3125 aSign = extractFloat64Sign( a ); 3126 if ( 0x41E < aExp ) { 3127 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3128 goto invalid; 3129 } 3130 else if ( aExp < 0x3FF ) { 3131 if (aExp || aSig) { 3132 status->float_exception_flags |= float_flag_inexact; 3133 } 3134 return 0; 3135 } 3136 aSig |= LIT64( 0x0010000000000000 ); 3137 shiftCount = 0x433 - aExp; 3138 savedASig = aSig; 3139 aSig >>= shiftCount; 3140 z = aSig; 3141 if ( aSign ) z = - z; 3142 if ( ( z < 0 ) ^ aSign ) { 3143 invalid: 3144 float_raise(float_flag_invalid, status); 3145 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3146 } 3147 if ( ( aSig<<shiftCount ) != savedASig ) { 3148 status->float_exception_flags |= float_flag_inexact; 3149 } 3150 return z; 3151 3152 } 3153 3154 /*---------------------------------------------------------------------------- 3155 | Returns the result of converting the double-precision floating-point value 3156 | `a' to the 16-bit two's complement integer format. The conversion is 3157 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3158 | Arithmetic, except that the conversion is always rounded toward zero. 3159 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3160 | the conversion overflows, the largest integer with the same sign as `a' is 3161 | returned. 
3162 *----------------------------------------------------------------------------*/ 3163 3164 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3165 { 3166 flag aSign; 3167 int aExp; 3168 int shiftCount; 3169 uint64_t aSig, savedASig; 3170 int32_t z; 3171 3172 aSig = extractFloat64Frac( a ); 3173 aExp = extractFloat64Exp( a ); 3174 aSign = extractFloat64Sign( a ); 3175 if ( 0x40E < aExp ) { 3176 if ( ( aExp == 0x7FF ) && aSig ) { 3177 aSign = 0; 3178 } 3179 goto invalid; 3180 } 3181 else if ( aExp < 0x3FF ) { 3182 if ( aExp || aSig ) { 3183 status->float_exception_flags |= float_flag_inexact; 3184 } 3185 return 0; 3186 } 3187 aSig |= LIT64( 0x0010000000000000 ); 3188 shiftCount = 0x433 - aExp; 3189 savedASig = aSig; 3190 aSig >>= shiftCount; 3191 z = aSig; 3192 if ( aSign ) { 3193 z = - z; 3194 } 3195 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3196 invalid: 3197 float_raise(float_flag_invalid, status); 3198 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3199 } 3200 if ( ( aSig<<shiftCount ) != savedASig ) { 3201 status->float_exception_flags |= float_flag_inexact; 3202 } 3203 return z; 3204 } 3205 3206 /*---------------------------------------------------------------------------- 3207 | Returns the result of converting the double-precision floating-point value 3208 | `a' to the 64-bit two's complement integer format. The conversion is 3209 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3210 | Arithmetic---which means in particular that the conversion is rounded 3211 | according to the current rounding mode. If `a' is a NaN, the largest 3212 | positive integer is returned. Otherwise, if the conversion overflows, the 3213 | largest integer with the same sign as `a' is returned. 
3214 *----------------------------------------------------------------------------*/ 3215 3216 int64_t float64_to_int64(float64 a, float_status *status) 3217 { 3218 flag aSign; 3219 int aExp; 3220 int shiftCount; 3221 uint64_t aSig, aSigExtra; 3222 a = float64_squash_input_denormal(a, status); 3223 3224 aSig = extractFloat64Frac( a ); 3225 aExp = extractFloat64Exp( a ); 3226 aSign = extractFloat64Sign( a ); 3227 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3228 shiftCount = 0x433 - aExp; 3229 if ( shiftCount <= 0 ) { 3230 if ( 0x43E < aExp ) { 3231 float_raise(float_flag_invalid, status); 3232 if ( ! aSign 3233 || ( ( aExp == 0x7FF ) 3234 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3235 ) { 3236 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3237 } 3238 return (int64_t) LIT64( 0x8000000000000000 ); 3239 } 3240 aSigExtra = 0; 3241 aSig <<= - shiftCount; 3242 } 3243 else { 3244 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3245 } 3246 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3247 3248 } 3249 3250 /*---------------------------------------------------------------------------- 3251 | Returns the result of converting the double-precision floating-point value 3252 | `a' to the 64-bit two's complement integer format. The conversion is 3253 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3254 | Arithmetic, except that the conversion is always rounded toward zero. 3255 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3256 | the conversion overflows, the largest integer with the same sign as `a' is 3257 | returned. 
3258 *----------------------------------------------------------------------------*/ 3259 3260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3261 { 3262 flag aSign; 3263 int aExp; 3264 int shiftCount; 3265 uint64_t aSig; 3266 int64_t z; 3267 a = float64_squash_input_denormal(a, status); 3268 3269 aSig = extractFloat64Frac( a ); 3270 aExp = extractFloat64Exp( a ); 3271 aSign = extractFloat64Sign( a ); 3272 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3273 shiftCount = aExp - 0x433; 3274 if ( 0 <= shiftCount ) { 3275 if ( 0x43E <= aExp ) { 3276 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3277 float_raise(float_flag_invalid, status); 3278 if ( ! aSign 3279 || ( ( aExp == 0x7FF ) 3280 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3281 ) { 3282 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3283 } 3284 } 3285 return (int64_t) LIT64( 0x8000000000000000 ); 3286 } 3287 z = aSig<<shiftCount; 3288 } 3289 else { 3290 if ( aExp < 0x3FE ) { 3291 if (aExp | aSig) { 3292 status->float_exception_flags |= float_flag_inexact; 3293 } 3294 return 0; 3295 } 3296 z = aSig>>( - shiftCount ); 3297 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3298 status->float_exception_flags |= float_flag_inexact; 3299 } 3300 } 3301 if ( aSign ) z = - z; 3302 return z; 3303 3304 } 3305 3306 /*---------------------------------------------------------------------------- 3307 | Returns the result of converting the double-precision floating-point value 3308 | `a' to the single-precision floating-point format. The conversion is 3309 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3310 | Arithmetic. 
3311 *----------------------------------------------------------------------------*/ 3312 3313 float32 float64_to_float32(float64 a, float_status *status) 3314 { 3315 flag aSign; 3316 int aExp; 3317 uint64_t aSig; 3318 uint32_t zSig; 3319 a = float64_squash_input_denormal(a, status); 3320 3321 aSig = extractFloat64Frac( a ); 3322 aExp = extractFloat64Exp( a ); 3323 aSign = extractFloat64Sign( a ); 3324 if ( aExp == 0x7FF ) { 3325 if (aSig) { 3326 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3327 } 3328 return packFloat32( aSign, 0xFF, 0 ); 3329 } 3330 shift64RightJamming( aSig, 22, &aSig ); 3331 zSig = aSig; 3332 if ( aExp || zSig ) { 3333 zSig |= 0x40000000; 3334 aExp -= 0x381; 3335 } 3336 return roundAndPackFloat32(aSign, aExp, zSig, status); 3337 3338 } 3339 3340 3341 /*---------------------------------------------------------------------------- 3342 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3343 | half-precision floating-point value, returning the result. After being 3344 | shifted into the proper positions, the three fields are simply added 3345 | together to form the result. This means that any integer portion of `zSig' 3346 | will be added into the exponent. Since a properly normalized significand 3347 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3348 | than the desired result exponent whenever `zSig' is a complete, normalized 3349 | significand. 
3350 *----------------------------------------------------------------------------*/ 3351 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3352 { 3353 return make_float16( 3354 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3355 } 3356 3357 /*---------------------------------------------------------------------------- 3358 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3359 | and significand `zSig', and returns the proper half-precision floating- 3360 | point value corresponding to the abstract input. Ordinarily, the abstract 3361 | value is simply rounded and packed into the half-precision format, with 3362 | the inexact exception raised if the abstract input cannot be represented 3363 | exactly. However, if the abstract value is too large, the overflow and 3364 | inexact exceptions are raised and an infinity or maximal finite value is 3365 | returned. If the abstract value is too small, the input value is rounded to 3366 | a subnormal number, and the underflow and inexact exceptions are raised if 3367 | the abstract input cannot be represented exactly as a subnormal half- 3368 | precision floating-point number. 3369 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3370 | ARM-style "alternative representation", which omits the NaN and Inf 3371 | encodings in order to raise the maximum representable exponent by one. 3372 | The input significand `zSig' has its binary point between bits 22 3373 | and 23, which is 13 bits to the left of the usual location. This shifted 3374 | significand must be normalized or smaller. If `zSig' is not normalized, 3375 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3376 | and it must not require rounding. In the usual case that `zSig' is 3377 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
| Note the slightly odd position of the binary point in zSig compared with the
| other roundAndPackFloat functions. This should probably be fixed if we
| need to implement more float16 routines than just conversion.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float16 roundAndPackFloat16(flag zSign, int zExp,
                                   uint32_t zSig, flag ieee,
                                   float_status *status)
{
    /* Largest zExp that still packs to a finite value; one higher in the
     * non-IEEE format.
     */
    int maxexp = ieee ? 29 : 30;
    uint32_t mask;
    uint32_t increment;
    bool rounding_bumps_exp;
    bool is_tiny = false;

    /* Calculate the mask of bits of the mantissa which are not
     * representable in half-precision and will be lost.
     */
    if (zExp < 1) {
        /* Will be denormal in halfprec */
        mask = 0x00ffffff;
        if (zExp >= -11) {
            mask >>= 11 + zExp;
        }
    } else {
        /* Normal number in halfprec */
        mask = 0x00001fff;
    }

    /* Choose the value to add before truncation so that dropping the
     * masked bits implements the selected rounding mode.
     */
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        increment = (mask + 1) >> 1;
        if ((zSig & mask) == increment) {
            /* Exact tie: add the half-unit only if the lowest kept bit is
             * set, i.e. round to even.
             */
            increment = zSig & (increment << 1);
        }
        break;
    case float_round_ties_away:
        increment = (mask + 1) >> 1;
        break;
    case float_round_up:
        increment = zSign ? 0 : mask;
        break;
    case float_round_down:
        increment = zSign ? mask : 0;
        break;
    default: /* round_to_zero */
        increment = 0;
        break;
    }

    /* True if adding the increment carries out of the 24-bit significand. */
    rounding_bumps_exp = (zSig + increment >= 0x01000000);

    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
        if (ieee) {
            /* IEEE format: overflow packs exponent 0x1f with a zero
             * fraction (infinity) in every rounding mode.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat16(zSign, 0x1f, 0);
        } else {
            /* Non-IEEE format: raise invalid and return the all-ones
             * exponent and fraction.
             */
            float_raise(float_flag_invalid, status);
            return packFloat16(zSign, 0x1f, 0x3ff);
        }
    }

    if (zExp < 0) {
        /* Note that flush-to-zero does not affect half-precision results */
        is_tiny =
            (status->float_detect_tininess == float_tininess_before_rounding)
            || (zExp < -1)
            || (!rounding_bumps_exp);
    }
    /* Flags are derived from the pre-rounding significand: inexact if any
     * masked bit is set, plus underflow when the value is also tiny.
     */
    if (zSig & mask) {
        float_raise(float_flag_inexact, status);
        if (is_tiny) {
            float_raise(float_flag_underflow, status);
        }
    }

    zSig += increment;
    if (rounding_bumps_exp) {
        zSig >>= 1;
        zExp++;
    }

    /* Too small even for a subnormal: return a signed zero. */
    if (zExp < -10) {
        return packFloat16(zSign, 0, 0);
    }
    /* Subnormal result: denormalize the significand and pack with a zero
     * exponent field.
     */
    if (zExp < 0) {
        zSig >>= -zExp;
        zExp = 0;
    }
    /* Drop the 13 extra low-order bits of the shifted significand. */
    return packFloat16(zSign, zExp, zSig >> 13);
}

/* Normalizes a non-zero half-precision subnormal fraction `aSig': shifts it
 * left until its leading 1 reaches bit 10 (countLeadingZeros32 - 21) and
 * stores the shifted value in *zSigPtr and the matching exponent
 * (1 - shift) in *zExpPtr.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shiftCount = countLeadingZeros32(aSig) - 21;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;
}

/* Half precision floats come in two formats: standard IEEE and "ARM" format.
   The latter gains extra exponent range by omitting the NaN/Inf encodings.
*/

/* Converts the half-precision value `a' to single precision.
 * When `ieee' is set, exponent 0x1f denotes NaN (non-zero fraction) or
 * infinity; otherwise that exponent is converted like any other.
 * Zeros keep their sign and subnormals are normalized first.
 */
float32 float16_to_float32(float16 a, flag ieee, float_status *status)
{
    flag aSign;
    int aExp;
    uint32_t aSig;

    aSign = extractFloat16Sign(a);
    aExp = extractFloat16Exp(a);
    aSig = extractFloat16Frac(a);

    if (aExp == 0x1f && ieee) {
        if (aSig) {
            return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
        }
        return packFloat32(aSign, 0xff, 0);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return packFloat32(aSign, 0, 0);
        }

        /* The normalized significand carries its leading 1 into the
         * exponent field when packed, hence the extra decrement.
         */
        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
        aExp--;
    }
    /* Rebias the exponent (+0x70) and widen the fraction by 13 bits. */
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
}

/* Converts the single-precision value `a' to half precision.
 * With `ieee' set, NaNs go through the common-NaN conversion and infinities
 * map to infinity. With `ieee' clear, both raise invalid: a NaN returns a
 * signed zero and an infinity returns exponent 0x1f with fraction 0x3ff.
 * All other non-zero values are rounded by roundAndPackFloat16().
 */
float16 float32_to_float16(float32 a, flag ieee, float_status *status)
{
    flag aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Input is a NaN */
            if (!ieee) {
                float_raise(float_flag_invalid, status);
                return packFloat16(aSign, 0, 0);
            }
            return commonNaNToFloat16(
                float32ToCommonNaN(a, status), status);
        }
        /* Infinity */
        if (!ieee) {
            float_raise(float_flag_invalid, status);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
        return packFloat16(aSign, 0x1f, 0);
    }
    if (aExp == 0 && aSig == 0) {
        return packFloat16(aSign, 0, 0);
    }
    /* Binary point between bits 22 and 23. Note that we add the 1 bit
     * even if the input is denormal; however this is harmless because
     * the largest possible single-precision denormal is still smaller
     * than the smallest representable half-precision denormal, and so we
     * will end up ignoring aSig and returning via the "always return zero"
     * codepath.
     */
    aSig |= 0x00800000;
    aExp -= 0x71;

    return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
}

/* Converts the half-precision value `a' to double precision.
 * When `ieee' is set, exponent 0x1f denotes NaN (non-zero fraction) or
 * infinity; otherwise that exponent is converted like any other.
 * Zeros keep their sign and subnormals are normalized first.
 */
float64 float16_to_float64(float16 a, flag ieee, float_status *status)
{
    flag aSign;
    int aExp;
    uint32_t aSig;

    aSign = extractFloat16Sign(a);
    aExp = extractFloat16Exp(a);
    aSig = extractFloat16Frac(a);

    if (aExp == 0x1f && ieee) {
        if (aSig) {
            return commonNaNToFloat64(
                float16ToCommonNaN(a, status), status);
        }
        return packFloat64(aSign, 0x7ff, 0);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return packFloat64(aSign, 0, 0);
        }

        /* The normalized significand carries its leading 1 into the
         * exponent field when packed, hence the extra decrement.
         */
        normalizeFloat16Subnormal(aSig, &aExp, &aSig);
        aExp--;
    }
    /* Rebias the exponent (+0x3f0) and widen the fraction by 42 bits. */
    return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
}

/* Converts the double-precision value `a' to half precision.
 * With `ieee' set, NaNs go through the common-NaN conversion and infinities
 * map to infinity. With `ieee' clear, both raise invalid: a NaN returns a
 * signed zero and an infinity returns exponent 0x1f with fraction 0x3ff.
 * All other non-zero values are rounded by roundAndPackFloat16().
 */
float16 float64_to_float16(float64 a, flag ieee, float_status *status)
{
    flag aSign;
    int aExp;
    uint64_t aSig;
    uint32_t zSig;

    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    if (aExp == 0x7FF) {
        if (aSig) {
            /* Input is a NaN */
            if (!ieee) {
                float_raise(float_flag_invalid, status);
                return packFloat16(aSign, 0, 0);
            }
            return commonNaNToFloat16(
                float64ToCommonNaN(a, status), status);
        }
        /* Infinity */
        if (!ieee) {
            float_raise(float_flag_invalid, status);
            return packFloat16(aSign, 0x1f, 0x3ff);
        }
        return packFloat16(aSign, 0x1f, 0);
    }
    /* Reduce the 52-bit fraction to 23 bits, keeping a sticky bit for the
     * bits shifted out.
     */
    shift64RightJamming(aSig, 29, &aSig);
    zSig = aSig;
    if (aExp == 0 && zSig == 0) {
        return packFloat16(aSign, 0, 0);
    }
    /* Binary point between bits 22 and 23. Note that we add the 1 bit
     * even if the input is denormal; however this is harmless because
     * the largest possible double-precision denormal is still smaller
     * than the smallest representable half-precision denormal, and so we
     * will end up ignoring zSig and returning via the "always return zero"
     * codepath.
     */
    zSig |= 0x00800000;
    aExp -= 0x3F1;

    return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    flag aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
        }
        /* Infinity: exponent all ones, only the top significand bit set. */
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias the exponent and move the significand, with its leading 1 made
     * explicit, up to the top of the 64-bit field.
     */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.
The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float64_to_float128(float64 a, float_status *status)
{
    flag sign;
    int biasedExp;
    uint64_t sig, sigHigh, sigLow;

    a = float64_squash_input_denormal(a, status);
    sig = extractFloat64Frac(a);
    biasedExp = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    switch (biasedExp) {
    case 0x7FF:
        /* NaN goes through the common-NaN conversion; infinity keeps its
         * sign.
         */
        if (sig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128(sign, 0x7FFF, 0, 0);
    case 0:
        if (sig == 0) {
            return packFloat128(sign, 0, 0, 0);
        }
        /* Subnormal: normalize; the leading 1 left in sig is added into
         * the exponent field when packed, hence the decrement.
         */
        normalizeFloat64Subnormal(sig, &biasedExp, &sig);
        --biasedExp;
        break;
    default:
        break;
    }

    /* Spread the fraction across the two 64-bit halves of the result. */
    shift128Right(sig, 0, 4, &sigHigh, &sigLow);
    return packFloat128(sign, biasedExp + 0x3C00, sigHigh, sigLow);
}

/*----------------------------------------------------------------------------
| Rounds the double-precision floating-point value `a' to an integer, and
| returns the result as a double-precision floating-point value. The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/ 3701 3702 float64 float64_round_to_int(float64 a, float_status *status) 3703 { 3704 flag aSign; 3705 int aExp; 3706 uint64_t lastBitMask, roundBitsMask; 3707 uint64_t z; 3708 a = float64_squash_input_denormal(a, status); 3709 3710 aExp = extractFloat64Exp( a ); 3711 if ( 0x433 <= aExp ) { 3712 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3713 return propagateFloat64NaN(a, a, status); 3714 } 3715 return a; 3716 } 3717 if ( aExp < 0x3FF ) { 3718 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3719 status->float_exception_flags |= float_flag_inexact; 3720 aSign = extractFloat64Sign( a ); 3721 switch (status->float_rounding_mode) { 3722 case float_round_nearest_even: 3723 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3724 return packFloat64( aSign, 0x3FF, 0 ); 3725 } 3726 break; 3727 case float_round_ties_away: 3728 if (aExp == 0x3FE) { 3729 return packFloat64(aSign, 0x3ff, 0); 3730 } 3731 break; 3732 case float_round_down: 3733 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3734 case float_round_up: 3735 return make_float64( 3736 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3737 } 3738 return packFloat64( aSign, 0, 0 ); 3739 } 3740 lastBitMask = 1; 3741 lastBitMask <<= 0x433 - aExp; 3742 roundBitsMask = lastBitMask - 1; 3743 z = float64_val(a); 3744 switch (status->float_rounding_mode) { 3745 case float_round_nearest_even: 3746 z += lastBitMask >> 1; 3747 if ((z & roundBitsMask) == 0) { 3748 z &= ~lastBitMask; 3749 } 3750 break; 3751 case float_round_ties_away: 3752 z += lastBitMask >> 1; 3753 break; 3754 case float_round_to_zero: 3755 break; 3756 case float_round_up: 3757 if (!extractFloat64Sign(make_float64(z))) { 3758 z += roundBitsMask; 3759 } 3760 break; 3761 case float_round_down: 3762 if (extractFloat64Sign(make_float64(z))) { 3763 z += roundBitsMask; 3764 } 3765 break; 3766 default: 3767 abort(); 3768 } 3769 z &= ~ roundBitsMask; 3770 if (z != float64_val(a)) { 3771 status->float_exception_flags |= float_flag_inexact; 3772 } 3773 return make_float64(z); 3774 3775 } 3776 3777 float64 float64_trunc_to_int(float64 a, float_status *status) 3778 { 3779 int oldmode; 3780 float64 res; 3781 oldmode = status->float_rounding_mode; 3782 status->float_rounding_mode = float_round_to_zero; 3783 res = float64_round_to_int(a, status); 3784 status->float_rounding_mode = oldmode; 3785 return res; 3786 } 3787 3788 /*---------------------------------------------------------------------------- 3789 | Returns the result of adding the absolute values of the double-precision 3790 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3791 | before being returned. `zSign' is ignored if the result is a NaN. 3792 | The addition is performed according to the IEC/IEEE Standard for Binary 3793 | Floating-Point Arithmetic. 
3794 *----------------------------------------------------------------------------*/ 3795 3796 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3797 float_status *status) 3798 { 3799 int aExp, bExp, zExp; 3800 uint64_t aSig, bSig, zSig; 3801 int expDiff; 3802 3803 aSig = extractFloat64Frac( a ); 3804 aExp = extractFloat64Exp( a ); 3805 bSig = extractFloat64Frac( b ); 3806 bExp = extractFloat64Exp( b ); 3807 expDiff = aExp - bExp; 3808 aSig <<= 9; 3809 bSig <<= 9; 3810 if ( 0 < expDiff ) { 3811 if ( aExp == 0x7FF ) { 3812 if (aSig) { 3813 return propagateFloat64NaN(a, b, status); 3814 } 3815 return a; 3816 } 3817 if ( bExp == 0 ) { 3818 --expDiff; 3819 } 3820 else { 3821 bSig |= LIT64( 0x2000000000000000 ); 3822 } 3823 shift64RightJamming( bSig, expDiff, &bSig ); 3824 zExp = aExp; 3825 } 3826 else if ( expDiff < 0 ) { 3827 if ( bExp == 0x7FF ) { 3828 if (bSig) { 3829 return propagateFloat64NaN(a, b, status); 3830 } 3831 return packFloat64( zSign, 0x7FF, 0 ); 3832 } 3833 if ( aExp == 0 ) { 3834 ++expDiff; 3835 } 3836 else { 3837 aSig |= LIT64( 0x2000000000000000 ); 3838 } 3839 shift64RightJamming( aSig, - expDiff, &aSig ); 3840 zExp = bExp; 3841 } 3842 else { 3843 if ( aExp == 0x7FF ) { 3844 if (aSig | bSig) { 3845 return propagateFloat64NaN(a, b, status); 3846 } 3847 return a; 3848 } 3849 if ( aExp == 0 ) { 3850 if (status->flush_to_zero) { 3851 if (aSig | bSig) { 3852 float_raise(float_flag_output_denormal, status); 3853 } 3854 return packFloat64(zSign, 0, 0); 3855 } 3856 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3857 } 3858 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3859 zExp = aExp; 3860 goto roundAndPack; 3861 } 3862 aSig |= LIT64( 0x2000000000000000 ); 3863 zSig = ( aSig + bSig )<<1; 3864 --zExp; 3865 if ( (int64_t) zSig < 0 ) { 3866 zSig = aSig + bSig; 3867 ++zExp; 3868 } 3869 roundAndPack: 3870 return roundAndPackFloat64(zSign, zExp, zSig, status); 3871 3872 } 3873 3874 
/*---------------------------------------------------------------------------- 3875 | Returns the result of subtracting the absolute values of the double- 3876 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3877 | difference is negated before being returned. `zSign' is ignored if the 3878 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3879 | Standard for Binary Floating-Point Arithmetic. 3880 *----------------------------------------------------------------------------*/ 3881 3882 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, 3883 float_status *status) 3884 { 3885 int aExp, bExp, zExp; 3886 uint64_t aSig, bSig, zSig; 3887 int expDiff; 3888 3889 aSig = extractFloat64Frac( a ); 3890 aExp = extractFloat64Exp( a ); 3891 bSig = extractFloat64Frac( b ); 3892 bExp = extractFloat64Exp( b ); 3893 expDiff = aExp - bExp; 3894 aSig <<= 10; 3895 bSig <<= 10; 3896 if ( 0 < expDiff ) goto aExpBigger; 3897 if ( expDiff < 0 ) goto bExpBigger; 3898 if ( aExp == 0x7FF ) { 3899 if (aSig | bSig) { 3900 return propagateFloat64NaN(a, b, status); 3901 } 3902 float_raise(float_flag_invalid, status); 3903 return float64_default_nan(status); 3904 } 3905 if ( aExp == 0 ) { 3906 aExp = 1; 3907 bExp = 1; 3908 } 3909 if ( bSig < aSig ) goto aBigger; 3910 if ( aSig < bSig ) goto bBigger; 3911 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); 3912 bExpBigger: 3913 if ( bExp == 0x7FF ) { 3914 if (bSig) { 3915 return propagateFloat64NaN(a, b, status); 3916 } 3917 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3918 } 3919 if ( aExp == 0 ) { 3920 ++expDiff; 3921 } 3922 else { 3923 aSig |= LIT64( 0x4000000000000000 ); 3924 } 3925 shift64RightJamming( aSig, - expDiff, &aSig ); 3926 bSig |= LIT64( 0x4000000000000000 ); 3927 bBigger: 3928 zSig = bSig - aSig; 3929 zExp = bExp; 3930 zSign ^= 1; 3931 goto normalizeRoundAndPack; 3932 aExpBigger: 3933 if ( aExp == 0x7FF ) { 3934 if (aSig) { 3935 return propagateFloat64NaN(a, 
b, status); 3936 } 3937 return a; 3938 } 3939 if ( bExp == 0 ) { 3940 --expDiff; 3941 } 3942 else { 3943 bSig |= LIT64( 0x4000000000000000 ); 3944 } 3945 shift64RightJamming( bSig, expDiff, &bSig ); 3946 aSig |= LIT64( 0x4000000000000000 ); 3947 aBigger: 3948 zSig = aSig - bSig; 3949 zExp = aExp; 3950 normalizeRoundAndPack: 3951 --zExp; 3952 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); 3953 3954 } 3955 3956 /*---------------------------------------------------------------------------- 3957 | Returns the result of adding the double-precision floating-point values `a' 3958 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3959 | Binary Floating-Point Arithmetic. 3960 *----------------------------------------------------------------------------*/ 3961 3962 float64 float64_add(float64 a, float64 b, float_status *status) 3963 { 3964 flag aSign, bSign; 3965 a = float64_squash_input_denormal(a, status); 3966 b = float64_squash_input_denormal(b, status); 3967 3968 aSign = extractFloat64Sign( a ); 3969 bSign = extractFloat64Sign( b ); 3970 if ( aSign == bSign ) { 3971 return addFloat64Sigs(a, b, aSign, status); 3972 } 3973 else { 3974 return subFloat64Sigs(a, b, aSign, status); 3975 } 3976 3977 } 3978 3979 /*---------------------------------------------------------------------------- 3980 | Returns the result of subtracting the double-precision floating-point values 3981 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3982 | for Binary Floating-Point Arithmetic. 
3983 *----------------------------------------------------------------------------*/ 3984 3985 float64 float64_sub(float64 a, float64 b, float_status *status) 3986 { 3987 flag aSign, bSign; 3988 a = float64_squash_input_denormal(a, status); 3989 b = float64_squash_input_denormal(b, status); 3990 3991 aSign = extractFloat64Sign( a ); 3992 bSign = extractFloat64Sign( b ); 3993 if ( aSign == bSign ) { 3994 return subFloat64Sigs(a, b, aSign, status); 3995 } 3996 else { 3997 return addFloat64Sigs(a, b, aSign, status); 3998 } 3999 4000 } 4001 4002 /*---------------------------------------------------------------------------- 4003 | Returns the result of multiplying the double-precision floating-point values 4004 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4005 | for Binary Floating-Point Arithmetic. 4006 *----------------------------------------------------------------------------*/ 4007 4008 float64 float64_mul(float64 a, float64 b, float_status *status) 4009 { 4010 flag aSign, bSign, zSign; 4011 int aExp, bExp, zExp; 4012 uint64_t aSig, bSig, zSig0, zSig1; 4013 4014 a = float64_squash_input_denormal(a, status); 4015 b = float64_squash_input_denormal(b, status); 4016 4017 aSig = extractFloat64Frac( a ); 4018 aExp = extractFloat64Exp( a ); 4019 aSign = extractFloat64Sign( a ); 4020 bSig = extractFloat64Frac( b ); 4021 bExp = extractFloat64Exp( b ); 4022 bSign = extractFloat64Sign( b ); 4023 zSign = aSign ^ bSign; 4024 if ( aExp == 0x7FF ) { 4025 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4026 return propagateFloat64NaN(a, b, status); 4027 } 4028 if ( ( bExp | bSig ) == 0 ) { 4029 float_raise(float_flag_invalid, status); 4030 return float64_default_nan(status); 4031 } 4032 return packFloat64( zSign, 0x7FF, 0 ); 4033 } 4034 if ( bExp == 0x7FF ) { 4035 if (bSig) { 4036 return propagateFloat64NaN(a, b, status); 4037 } 4038 if ( ( aExp | aSig ) == 0 ) { 4039 float_raise(float_flag_invalid, status); 4040 return float64_default_nan(status); 
4041 } 4042 return packFloat64( zSign, 0x7FF, 0 ); 4043 } 4044 if ( aExp == 0 ) { 4045 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4046 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4047 } 4048 if ( bExp == 0 ) { 4049 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4050 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4051 } 4052 zExp = aExp + bExp - 0x3FF; 4053 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4054 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4055 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4056 zSig0 |= ( zSig1 != 0 ); 4057 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4058 zSig0 <<= 1; 4059 --zExp; 4060 } 4061 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4062 4063 } 4064 4065 /*---------------------------------------------------------------------------- 4066 | Returns the result of dividing the double-precision floating-point value `a' 4067 | by the corresponding value `b'. The operation is performed according to 4068 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4069 *----------------------------------------------------------------------------*/ 4070 4071 float64 float64_div(float64 a, float64 b, float_status *status) 4072 { 4073 flag aSign, bSign, zSign; 4074 int aExp, bExp, zExp; 4075 uint64_t aSig, bSig, zSig; 4076 uint64_t rem0, rem1; 4077 uint64_t term0, term1; 4078 a = float64_squash_input_denormal(a, status); 4079 b = float64_squash_input_denormal(b, status); 4080 4081 aSig = extractFloat64Frac( a ); 4082 aExp = extractFloat64Exp( a ); 4083 aSign = extractFloat64Sign( a ); 4084 bSig = extractFloat64Frac( b ); 4085 bExp = extractFloat64Exp( b ); 4086 bSign = extractFloat64Sign( b ); 4087 zSign = aSign ^ bSign; 4088 if ( aExp == 0x7FF ) { 4089 if (aSig) { 4090 return propagateFloat64NaN(a, b, status); 4091 } 4092 if ( bExp == 0x7FF ) { 4093 if (bSig) { 4094 return propagateFloat64NaN(a, b, status); 4095 } 4096 float_raise(float_flag_invalid, status); 4097 return float64_default_nan(status); 4098 } 4099 return packFloat64( zSign, 0x7FF, 0 ); 4100 } 4101 if ( bExp == 0x7FF ) { 4102 if (bSig) { 4103 return propagateFloat64NaN(a, b, status); 4104 } 4105 return packFloat64( zSign, 0, 0 ); 4106 } 4107 if ( bExp == 0 ) { 4108 if ( bSig == 0 ) { 4109 if ( ( aExp | aSig ) == 0 ) { 4110 float_raise(float_flag_invalid, status); 4111 return float64_default_nan(status); 4112 } 4113 float_raise(float_flag_divbyzero, status); 4114 return packFloat64( zSign, 0x7FF, 0 ); 4115 } 4116 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4117 } 4118 if ( aExp == 0 ) { 4119 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4120 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4121 } 4122 zExp = aExp - bExp + 0x3FD; 4123 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4124 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4125 if ( bSig <= ( aSig + aSig ) ) { 4126 aSig >>= 1; 4127 ++zExp; 4128 } 4129 zSig = estimateDiv128To64( aSig, 0, bSig ); 4130 if ( ( zSig & 0x1FF ) <= 2 ) { 4131 mul64To128( bSig, zSig, &term0, &term1 ); 4132 sub128( 
aSig, 0, term0, term1, &rem0, &rem1 ); 4133 while ( (int64_t) rem0 < 0 ) { 4134 --zSig; 4135 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4136 } 4137 zSig |= ( rem1 != 0 ); 4138 } 4139 return roundAndPackFloat64(zSign, zExp, zSig, status); 4140 4141 } 4142 4143 /*---------------------------------------------------------------------------- 4144 | Returns the remainder of the double-precision floating-point value `a' 4145 | with respect to the corresponding value `b'. The operation is performed 4146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4147 *----------------------------------------------------------------------------*/ 4148 4149 float64 float64_rem(float64 a, float64 b, float_status *status) 4150 { 4151 flag aSign, zSign; 4152 int aExp, bExp, expDiff; 4153 uint64_t aSig, bSig; 4154 uint64_t q, alternateASig; 4155 int64_t sigMean; 4156 4157 a = float64_squash_input_denormal(a, status); 4158 b = float64_squash_input_denormal(b, status); 4159 aSig = extractFloat64Frac( a ); 4160 aExp = extractFloat64Exp( a ); 4161 aSign = extractFloat64Sign( a ); 4162 bSig = extractFloat64Frac( b ); 4163 bExp = extractFloat64Exp( b ); 4164 if ( aExp == 0x7FF ) { 4165 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4166 return propagateFloat64NaN(a, b, status); 4167 } 4168 float_raise(float_flag_invalid, status); 4169 return float64_default_nan(status); 4170 } 4171 if ( bExp == 0x7FF ) { 4172 if (bSig) { 4173 return propagateFloat64NaN(a, b, status); 4174 } 4175 return a; 4176 } 4177 if ( bExp == 0 ) { 4178 if ( bSig == 0 ) { 4179 float_raise(float_flag_invalid, status); 4180 return float64_default_nan(status); 4181 } 4182 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4183 } 4184 if ( aExp == 0 ) { 4185 if ( aSig == 0 ) return a; 4186 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4187 } 4188 expDiff = aExp - bExp; 4189 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4190 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4191 if ( expDiff < 0 
) { 4192 if ( expDiff < -1 ) return a; 4193 aSig >>= 1; 4194 } 4195 q = ( bSig <= aSig ); 4196 if ( q ) aSig -= bSig; 4197 expDiff -= 64; 4198 while ( 0 < expDiff ) { 4199 q = estimateDiv128To64( aSig, 0, bSig ); 4200 q = ( 2 < q ) ? q - 2 : 0; 4201 aSig = - ( ( bSig>>2 ) * q ); 4202 expDiff -= 62; 4203 } 4204 expDiff += 64; 4205 if ( 0 < expDiff ) { 4206 q = estimateDiv128To64( aSig, 0, bSig ); 4207 q = ( 2 < q ) ? q - 2 : 0; 4208 q >>= 64 - expDiff; 4209 bSig >>= 2; 4210 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4211 } 4212 else { 4213 aSig >>= 2; 4214 bSig >>= 2; 4215 } 4216 do { 4217 alternateASig = aSig; 4218 ++q; 4219 aSig -= bSig; 4220 } while ( 0 <= (int64_t) aSig ); 4221 sigMean = aSig + alternateASig; 4222 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4223 aSig = alternateASig; 4224 } 4225 zSign = ( (int64_t) aSig < 0 ); 4226 if ( zSign ) aSig = - aSig; 4227 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4228 4229 } 4230 4231 /*---------------------------------------------------------------------------- 4232 | Returns the result of multiplying the double-precision floating-point values 4233 | `a' and `b' then adding 'c', with no intermediate rounding step after the 4234 | multiplication. The operation is performed according to the IEC/IEEE 4235 | Standard for Binary Floating-Point Arithmetic 754-2008. 4236 | The flags argument allows the caller to select negation of the 4237 | addend, the intermediate product, or the final result. (The difference 4238 | between this and having the caller do a separate negation is that negating 4239 | externally will flip the sign bit on NaNs.) 
4240 *----------------------------------------------------------------------------*/ 4241 4242 float64 float64_muladd(float64 a, float64 b, float64 c, int flags, 4243 float_status *status) 4244 { 4245 flag aSign, bSign, cSign, zSign; 4246 int aExp, bExp, cExp, pExp, zExp, expDiff; 4247 uint64_t aSig, bSig, cSig; 4248 flag pInf, pZero, pSign; 4249 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; 4250 int shiftcount; 4251 flag signflip, infzero; 4252 4253 a = float64_squash_input_denormal(a, status); 4254 b = float64_squash_input_denormal(b, status); 4255 c = float64_squash_input_denormal(c, status); 4256 aSig = extractFloat64Frac(a); 4257 aExp = extractFloat64Exp(a); 4258 aSign = extractFloat64Sign(a); 4259 bSig = extractFloat64Frac(b); 4260 bExp = extractFloat64Exp(b); 4261 bSign = extractFloat64Sign(b); 4262 cSig = extractFloat64Frac(c); 4263 cExp = extractFloat64Exp(c); 4264 cSign = extractFloat64Sign(c); 4265 4266 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || 4267 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); 4268 4269 /* It is implementation-defined whether the cases of (0,inf,qnan) 4270 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 4271 * they return if they do), so we have to hand this information 4272 * off to the target-specific pick-a-NaN routine. 4273 */ 4274 if (((aExp == 0x7ff) && aSig) || 4275 ((bExp == 0x7ff) && bSig) || 4276 ((cExp == 0x7ff) && cSig)) { 4277 return propagateFloat64MulAddNaN(a, b, c, infzero, status); 4278 } 4279 4280 if (infzero) { 4281 float_raise(float_flag_invalid, status); 4282 return float64_default_nan(status); 4283 } 4284 4285 if (flags & float_muladd_negate_c) { 4286 cSign ^= 1; 4287 } 4288 4289 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 4290 4291 /* Work out the sign and type of the product */ 4292 pSign = aSign ^ bSign; 4293 if (flags & float_muladd_negate_product) { 4294 pSign ^= 1; 4295 } 4296 pInf = (aExp == 0x7ff) || (bExp == 0x7ff); 4297 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 4298 4299 if (cExp == 0x7ff) { 4300 if (pInf && (pSign ^ cSign)) { 4301 /* addition of opposite-signed infinities => InvalidOperation */ 4302 float_raise(float_flag_invalid, status); 4303 return float64_default_nan(status); 4304 } 4305 /* Otherwise generate an infinity of the same sign */ 4306 return packFloat64(cSign ^ signflip, 0x7ff, 0); 4307 } 4308 4309 if (pInf) { 4310 return packFloat64(pSign ^ signflip, 0x7ff, 0); 4311 } 4312 4313 if (pZero) { 4314 if (cExp == 0) { 4315 if (cSig == 0) { 4316 /* Adding two exact zeroes */ 4317 if (pSign == cSign) { 4318 zSign = pSign; 4319 } else if (status->float_rounding_mode == float_round_down) { 4320 zSign = 1; 4321 } else { 4322 zSign = 0; 4323 } 4324 return packFloat64(zSign ^ signflip, 0, 0); 4325 } 4326 /* Exact zero plus a denorm */ 4327 if (status->flush_to_zero) { 4328 float_raise(float_flag_output_denormal, status); 4329 return packFloat64(cSign ^ signflip, 0, 0); 4330 } 4331 } 4332 /* Zero plus something non-zero : just return the something */ 4333 if (flags & float_muladd_halve_result) { 4334 if (cExp == 0) { 4335 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4336 } 4337 /* Subtract one to halve, and one again because roundAndPackFloat64 4338 * wants one less than the true exponent. 
4339 */ 4340 cExp -= 2; 4341 cSig = (cSig | 0x0010000000000000ULL) << 10; 4342 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status); 4343 } 4344 return packFloat64(cSign ^ signflip, cExp, cSig); 4345 } 4346 4347 if (aExp == 0) { 4348 normalizeFloat64Subnormal(aSig, &aExp, &aSig); 4349 } 4350 if (bExp == 0) { 4351 normalizeFloat64Subnormal(bSig, &bExp, &bSig); 4352 } 4353 4354 /* Calculate the actual result a * b + c */ 4355 4356 /* Multiply first; this is easy. */ 4357 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff 4358 * because we want the true exponent, not the "one-less-than" 4359 * flavour that roundAndPackFloat64() takes. 4360 */ 4361 pExp = aExp + bExp - 0x3fe; 4362 aSig = (aSig | LIT64(0x0010000000000000))<<10; 4363 bSig = (bSig | LIT64(0x0010000000000000))<<11; 4364 mul64To128(aSig, bSig, &pSig0, &pSig1); 4365 if ((int64_t)(pSig0 << 1) >= 0) { 4366 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); 4367 pExp--; 4368 } 4369 4370 zSign = pSign ^ signflip; 4371 4372 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit 4373 * bit in position 126. 4374 */ 4375 if (cExp == 0) { 4376 if (!cSig) { 4377 /* Throw out the special case of c being an exact zero now */ 4378 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); 4379 if (flags & float_muladd_halve_result) { 4380 pExp--; 4381 } 4382 return roundAndPackFloat64(zSign, pExp - 1, 4383 pSig1, status); 4384 } 4385 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4386 } 4387 4388 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the 4389 * significand of the addend, with the explicit bit in position 126. 
4390 */ 4391 cSig0 = cSig << (126 - 64 - 52); 4392 cSig1 = 0; 4393 cSig0 |= LIT64(0x4000000000000000); 4394 expDiff = pExp - cExp; 4395 4396 if (pSign == cSign) { 4397 /* Addition */ 4398 if (expDiff > 0) { 4399 /* scale c to match p */ 4400 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4401 zExp = pExp; 4402 } else if (expDiff < 0) { 4403 /* scale p to match c */ 4404 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4405 zExp = cExp; 4406 } else { 4407 /* no scaling needed */ 4408 zExp = cExp; 4409 } 4410 /* Add significands and make sure explicit bit ends up in posn 126 */ 4411 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4412 if ((int64_t)zSig0 < 0) { 4413 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); 4414 } else { 4415 zExp--; 4416 } 4417 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); 4418 if (flags & float_muladd_halve_result) { 4419 zExp--; 4420 } 4421 return roundAndPackFloat64(zSign, zExp, zSig1, status); 4422 } else { 4423 /* Subtraction */ 4424 if (expDiff > 0) { 4425 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4426 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4427 zExp = pExp; 4428 } else if (expDiff < 0) { 4429 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4430 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4431 zExp = cExp; 4432 zSign ^= 1; 4433 } else { 4434 zExp = pExp; 4435 if (lt128(cSig0, cSig1, pSig0, pSig1)) { 4436 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4437 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { 4438 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4439 zSign ^= 1; 4440 } else { 4441 /* Exact zero */ 4442 zSign = signflip; 4443 if (status->float_rounding_mode == float_round_down) { 4444 zSign ^= 1; 4445 } 4446 return packFloat64(zSign, 0, 0); 4447 } 4448 } 4449 --zExp; 4450 /* Do the equivalent of normalizeRoundAndPackFloat64() but 4451 * starting with the significand in a pair of uint64_t. 
4452 */ 4453 if (zSig0) { 4454 shiftcount = countLeadingZeros64(zSig0) - 1; 4455 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); 4456 if (zSig1) { 4457 zSig0 |= 1; 4458 } 4459 zExp -= shiftcount; 4460 } else { 4461 shiftcount = countLeadingZeros64(zSig1); 4462 if (shiftcount == 0) { 4463 zSig0 = (zSig1 >> 1) | (zSig1 & 1); 4464 zExp -= 63; 4465 } else { 4466 shiftcount--; 4467 zSig0 = zSig1 << shiftcount; 4468 zExp -= (shiftcount + 64); 4469 } 4470 } 4471 if (flags & float_muladd_halve_result) { 4472 zExp--; 4473 } 4474 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4475 } 4476 } 4477 4478 /*---------------------------------------------------------------------------- 4479 | Returns the square root of the double-precision floating-point value `a'. 4480 | The operation is performed according to the IEC/IEEE Standard for Binary 4481 | Floating-Point Arithmetic. 4482 *----------------------------------------------------------------------------*/ 4483 4484 float64 float64_sqrt(float64 a, float_status *status) 4485 { 4486 flag aSign; 4487 int aExp, zExp; 4488 uint64_t aSig, zSig, doubleZSig; 4489 uint64_t rem0, rem1, term0, term1; 4490 a = float64_squash_input_denormal(a, status); 4491 4492 aSig = extractFloat64Frac( a ); 4493 aExp = extractFloat64Exp( a ); 4494 aSign = extractFloat64Sign( a ); 4495 if ( aExp == 0x7FF ) { 4496 if (aSig) { 4497 return propagateFloat64NaN(a, a, status); 4498 } 4499 if ( ! 
aSign ) return a; 4500 float_raise(float_flag_invalid, status); 4501 return float64_default_nan(status); 4502 } 4503 if ( aSign ) { 4504 if ( ( aExp | aSig ) == 0 ) return a; 4505 float_raise(float_flag_invalid, status); 4506 return float64_default_nan(status); 4507 } 4508 if ( aExp == 0 ) { 4509 if ( aSig == 0 ) return float64_zero; 4510 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4511 } 4512 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4513 aSig |= LIT64( 0x0010000000000000 ); 4514 zSig = estimateSqrt32( aExp, aSig>>21 ); 4515 aSig <<= 9 - ( aExp & 1 ); 4516 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4517 if ( ( zSig & 0x1FF ) <= 5 ) { 4518 doubleZSig = zSig<<1; 4519 mul64To128( zSig, zSig, &term0, &term1 ); 4520 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4521 while ( (int64_t) rem0 < 0 ) { 4522 --zSig; 4523 doubleZSig -= 2; 4524 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4525 } 4526 zSig |= ( ( rem0 | rem1 ) != 0 ); 4527 } 4528 return roundAndPackFloat64(0, zExp, zSig, status); 4529 4530 } 4531 4532 /*---------------------------------------------------------------------------- 4533 | Returns the binary log of the double-precision floating-point value `a'. 4534 | The operation is performed according to the IEC/IEEE Standard for Binary 4535 | Floating-Point Arithmetic. 
4536 *----------------------------------------------------------------------------*/ 4537 float64 float64_log2(float64 a, float_status *status) 4538 { 4539 flag aSign, zSign; 4540 int aExp; 4541 uint64_t aSig, aSig0, aSig1, zSig, i; 4542 a = float64_squash_input_denormal(a, status); 4543 4544 aSig = extractFloat64Frac( a ); 4545 aExp = extractFloat64Exp( a ); 4546 aSign = extractFloat64Sign( a ); 4547 4548 if ( aExp == 0 ) { 4549 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4550 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4551 } 4552 if ( aSign ) { 4553 float_raise(float_flag_invalid, status); 4554 return float64_default_nan(status); 4555 } 4556 if ( aExp == 0x7FF ) { 4557 if (aSig) { 4558 return propagateFloat64NaN(a, float64_zero, status); 4559 } 4560 return a; 4561 } 4562 4563 aExp -= 0x3FF; 4564 aSig |= LIT64( 0x0010000000000000 ); 4565 zSign = aExp < 0; 4566 zSig = (uint64_t)aExp << 52; 4567 for (i = 1LL << 51; i > 0; i >>= 1) { 4568 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4569 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4570 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4571 aSig >>= 1; 4572 zSig |= i; 4573 } 4574 } 4575 4576 if ( zSign ) 4577 zSig = -zSig; 4578 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4579 } 4580 4581 /*---------------------------------------------------------------------------- 4582 | Returns 1 if the double-precision floating-point value `a' is equal to the 4583 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4584 | if either operand is a NaN. Otherwise, the comparison is performed 4585 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4586 *----------------------------------------------------------------------------*/ 4587 4588 int float64_eq(float64 a, float64 b, float_status *status) 4589 { 4590 uint64_t av, bv; 4591 a = float64_squash_input_denormal(a, status); 4592 b = float64_squash_input_denormal(b, status); 4593 4594 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4595 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4596 ) { 4597 float_raise(float_flag_invalid, status); 4598 return 0; 4599 } 4600 av = float64_val(a); 4601 bv = float64_val(b); 4602 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4603 4604 } 4605 4606 /*---------------------------------------------------------------------------- 4607 | Returns 1 if the double-precision floating-point value `a' is less than or 4608 | equal to the corresponding value `b', and 0 otherwise. The invalid 4609 | exception is raised if either operand is a NaN. The comparison is performed 4610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4611 *----------------------------------------------------------------------------*/ 4612 4613 int float64_le(float64 a, float64 b, float_status *status) 4614 { 4615 flag aSign, bSign; 4616 uint64_t av, bv; 4617 a = float64_squash_input_denormal(a, status); 4618 b = float64_squash_input_denormal(b, status); 4619 4620 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4621 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4622 ) { 4623 float_raise(float_flag_invalid, status); 4624 return 0; 4625 } 4626 aSign = extractFloat64Sign( a ); 4627 bSign = extractFloat64Sign( b ); 4628 av = float64_val(a); 4629 bv = float64_val(b); 4630 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4631 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4632 4633 } 4634 4635 /*---------------------------------------------------------------------------- 4636 | Returns 1 if the double-precision floating-point value `a' is less than 4637 | the corresponding value `b', and 0 otherwise. The invalid exception is 4638 | raised if either operand is a NaN. The comparison is performed according 4639 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4640 *----------------------------------------------------------------------------*/ 4641 4642 int float64_lt(float64 a, float64 b, float_status *status) 4643 { 4644 flag aSign, bSign; 4645 uint64_t av, bv; 4646 4647 a = float64_squash_input_denormal(a, status); 4648 b = float64_squash_input_denormal(b, status); 4649 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4650 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4651 ) { 4652 float_raise(float_flag_invalid, status); 4653 return 0; 4654 } 4655 aSign = extractFloat64Sign( a ); 4656 bSign = extractFloat64Sign( b ); 4657 av = float64_val(a); 4658 bv = float64_val(b); 4659 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4660 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4661 4662 } 4663 4664 /*---------------------------------------------------------------------------- 4665 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4666 | be compared, and 0 otherwise. The invalid exception is raised if either 4667 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4668 | Standard for Binary Floating-Point Arithmetic. 4669 *----------------------------------------------------------------------------*/ 4670 4671 int float64_unordered(float64 a, float64 b, float_status *status) 4672 { 4673 a = float64_squash_input_denormal(a, status); 4674 b = float64_squash_input_denormal(b, status); 4675 4676 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4677 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4678 ) { 4679 float_raise(float_flag_invalid, status); 4680 return 1; 4681 } 4682 return 0; 4683 } 4684 4685 /*---------------------------------------------------------------------------- 4686 | Returns 1 if the double-precision floating-point value `a' is equal to the 4687 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4688 | exception.The comparison is performed according to the IEC/IEEE Standard 4689 | for Binary Floating-Point Arithmetic. 4690 *----------------------------------------------------------------------------*/ 4691 4692 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4693 { 4694 uint64_t av, bv; 4695 a = float64_squash_input_denormal(a, status); 4696 b = float64_squash_input_denormal(b, status); 4697 4698 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4699 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4700 ) { 4701 if (float64_is_signaling_nan(a, status) 4702 || float64_is_signaling_nan(b, status)) { 4703 float_raise(float_flag_invalid, status); 4704 } 4705 return 0; 4706 } 4707 av = float64_val(a); 4708 bv = float64_val(b); 4709 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4710 4711 } 4712 4713 /*---------------------------------------------------------------------------- 4714 | Returns 1 if the double-precision floating-point value `a' is less than or 4715 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4716 | cause an exception. Otherwise, the comparison is performed according to the 4717 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4718 *----------------------------------------------------------------------------*/ 4719 4720 int float64_le_quiet(float64 a, float64 b, float_status *status) 4721 { 4722 flag aSign, bSign; 4723 uint64_t av, bv; 4724 a = float64_squash_input_denormal(a, status); 4725 b = float64_squash_input_denormal(b, status); 4726 4727 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4728 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4729 ) { 4730 if (float64_is_signaling_nan(a, status) 4731 || float64_is_signaling_nan(b, status)) { 4732 float_raise(float_flag_invalid, status); 4733 } 4734 return 0; 4735 } 4736 aSign = extractFloat64Sign( a ); 4737 bSign = extractFloat64Sign( b ); 4738 av = float64_val(a); 4739 bv = float64_val(b); 4740 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4741 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4742 4743 } 4744 4745 /*---------------------------------------------------------------------------- 4746 | Returns 1 if the double-precision floating-point value `a' is less than 4747 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4748 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4749 | Standard for Binary Floating-Point Arithmetic. 
4750 *----------------------------------------------------------------------------*/ 4751 4752 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4753 { 4754 flag aSign, bSign; 4755 uint64_t av, bv; 4756 a = float64_squash_input_denormal(a, status); 4757 b = float64_squash_input_denormal(b, status); 4758 4759 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4760 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4761 ) { 4762 if (float64_is_signaling_nan(a, status) 4763 || float64_is_signaling_nan(b, status)) { 4764 float_raise(float_flag_invalid, status); 4765 } 4766 return 0; 4767 } 4768 aSign = extractFloat64Sign( a ); 4769 bSign = extractFloat64Sign( b ); 4770 av = float64_val(a); 4771 bv = float64_val(b); 4772 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4773 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4774 4775 } 4776 4777 /*---------------------------------------------------------------------------- 4778 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4779 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4780 | comparison is performed according to the IEC/IEEE Standard for Binary 4781 | Floating-Point Arithmetic. 
4782 *----------------------------------------------------------------------------*/ 4783 4784 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4785 { 4786 a = float64_squash_input_denormal(a, status); 4787 b = float64_squash_input_denormal(b, status); 4788 4789 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4790 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4791 ) { 4792 if (float64_is_signaling_nan(a, status) 4793 || float64_is_signaling_nan(b, status)) { 4794 float_raise(float_flag_invalid, status); 4795 } 4796 return 1; 4797 } 4798 return 0; 4799 } 4800 4801 /*---------------------------------------------------------------------------- 4802 | Returns the result of converting the extended double-precision floating- 4803 | point value `a' to the 32-bit two's complement integer format. The 4804 | conversion is performed according to the IEC/IEEE Standard for Binary 4805 | Floating-Point Arithmetic---which means in particular that the conversion 4806 | is rounded according to the current rounding mode. If `a' is a NaN, the 4807 | largest positive integer is returned. Otherwise, if the conversion 4808 | overflows, the largest integer with the same sign as `a' is returned. 
4809 *----------------------------------------------------------------------------*/ 4810 4811 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4812 { 4813 flag aSign; 4814 int32_t aExp, shiftCount; 4815 uint64_t aSig; 4816 4817 aSig = extractFloatx80Frac( a ); 4818 aExp = extractFloatx80Exp( a ); 4819 aSign = extractFloatx80Sign( a ); 4820 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4821 shiftCount = 0x4037 - aExp; 4822 if ( shiftCount <= 0 ) shiftCount = 1; 4823 shift64RightJamming( aSig, shiftCount, &aSig ); 4824 return roundAndPackInt32(aSign, aSig, status); 4825 4826 } 4827 4828 /*---------------------------------------------------------------------------- 4829 | Returns the result of converting the extended double-precision floating- 4830 | point value `a' to the 32-bit two's complement integer format. The 4831 | conversion is performed according to the IEC/IEEE Standard for Binary 4832 | Floating-Point Arithmetic, except that the conversion is always rounded 4833 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4834 | Otherwise, if the conversion overflows, the largest integer with the same 4835 | sign as `a' is returned. 
4836 *----------------------------------------------------------------------------*/ 4837 4838 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4839 { 4840 flag aSign; 4841 int32_t aExp, shiftCount; 4842 uint64_t aSig, savedASig; 4843 int32_t z; 4844 4845 aSig = extractFloatx80Frac( a ); 4846 aExp = extractFloatx80Exp( a ); 4847 aSign = extractFloatx80Sign( a ); 4848 if ( 0x401E < aExp ) { 4849 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4850 goto invalid; 4851 } 4852 else if ( aExp < 0x3FFF ) { 4853 if (aExp || aSig) { 4854 status->float_exception_flags |= float_flag_inexact; 4855 } 4856 return 0; 4857 } 4858 shiftCount = 0x403E - aExp; 4859 savedASig = aSig; 4860 aSig >>= shiftCount; 4861 z = aSig; 4862 if ( aSign ) z = - z; 4863 if ( ( z < 0 ) ^ aSign ) { 4864 invalid: 4865 float_raise(float_flag_invalid, status); 4866 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4867 } 4868 if ( ( aSig<<shiftCount ) != savedASig ) { 4869 status->float_exception_flags |= float_flag_inexact; 4870 } 4871 return z; 4872 4873 } 4874 4875 /*---------------------------------------------------------------------------- 4876 | Returns the result of converting the extended double-precision floating- 4877 | point value `a' to the 64-bit two's complement integer format. The 4878 | conversion is performed according to the IEC/IEEE Standard for Binary 4879 | Floating-Point Arithmetic---which means in particular that the conversion 4880 | is rounded according to the current rounding mode. If `a' is a NaN, 4881 | the largest positive integer is returned. Otherwise, if the conversion 4882 | overflows, the largest integer with the same sign as `a' is returned. 
4883 *----------------------------------------------------------------------------*/ 4884 4885 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4886 { 4887 flag aSign; 4888 int32_t aExp, shiftCount; 4889 uint64_t aSig, aSigExtra; 4890 4891 aSig = extractFloatx80Frac( a ); 4892 aExp = extractFloatx80Exp( a ); 4893 aSign = extractFloatx80Sign( a ); 4894 shiftCount = 0x403E - aExp; 4895 if ( shiftCount <= 0 ) { 4896 if ( shiftCount ) { 4897 float_raise(float_flag_invalid, status); 4898 if ( ! aSign 4899 || ( ( aExp == 0x7FFF ) 4900 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4901 ) { 4902 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4903 } 4904 return (int64_t) LIT64( 0x8000000000000000 ); 4905 } 4906 aSigExtra = 0; 4907 } 4908 else { 4909 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4910 } 4911 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4912 4913 } 4914 4915 /*---------------------------------------------------------------------------- 4916 | Returns the result of converting the extended double-precision floating- 4917 | point value `a' to the 64-bit two's complement integer format. The 4918 | conversion is performed according to the IEC/IEEE Standard for Binary 4919 | Floating-Point Arithmetic, except that the conversion is always rounded 4920 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4921 | Otherwise, if the conversion overflows, the largest integer with the same 4922 | sign as `a' is returned. 
4923 *----------------------------------------------------------------------------*/ 4924 4925 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4926 { 4927 flag aSign; 4928 int32_t aExp, shiftCount; 4929 uint64_t aSig; 4930 int64_t z; 4931 4932 aSig = extractFloatx80Frac( a ); 4933 aExp = extractFloatx80Exp( a ); 4934 aSign = extractFloatx80Sign( a ); 4935 shiftCount = aExp - 0x403E; 4936 if ( 0 <= shiftCount ) { 4937 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4938 if ( ( a.high != 0xC03E ) || aSig ) { 4939 float_raise(float_flag_invalid, status); 4940 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4941 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4942 } 4943 } 4944 return (int64_t) LIT64( 0x8000000000000000 ); 4945 } 4946 else if ( aExp < 0x3FFF ) { 4947 if (aExp | aSig) { 4948 status->float_exception_flags |= float_flag_inexact; 4949 } 4950 return 0; 4951 } 4952 z = aSig>>( - shiftCount ); 4953 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4954 status->float_exception_flags |= float_flag_inexact; 4955 } 4956 if ( aSign ) z = - z; 4957 return z; 4958 4959 } 4960 4961 /*---------------------------------------------------------------------------- 4962 | Returns the result of converting the extended double-precision floating- 4963 | point value `a' to the single-precision floating-point format. The 4964 | conversion is performed according to the IEC/IEEE Standard for Binary 4965 | Floating-Point Arithmetic. 
4966 *----------------------------------------------------------------------------*/ 4967 4968 float32 floatx80_to_float32(floatx80 a, float_status *status) 4969 { 4970 flag aSign; 4971 int32_t aExp; 4972 uint64_t aSig; 4973 4974 aSig = extractFloatx80Frac( a ); 4975 aExp = extractFloatx80Exp( a ); 4976 aSign = extractFloatx80Sign( a ); 4977 if ( aExp == 0x7FFF ) { 4978 if ( (uint64_t) ( aSig<<1 ) ) { 4979 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4980 } 4981 return packFloat32( aSign, 0xFF, 0 ); 4982 } 4983 shift64RightJamming( aSig, 33, &aSig ); 4984 if ( aExp || aSig ) aExp -= 0x3F81; 4985 return roundAndPackFloat32(aSign, aExp, aSig, status); 4986 4987 } 4988 4989 /*---------------------------------------------------------------------------- 4990 | Returns the result of converting the extended double-precision floating- 4991 | point value `a' to the double-precision floating-point format. The 4992 | conversion is performed according to the IEC/IEEE Standard for Binary 4993 | Floating-Point Arithmetic. 4994 *----------------------------------------------------------------------------*/ 4995 4996 float64 floatx80_to_float64(floatx80 a, float_status *status) 4997 { 4998 flag aSign; 4999 int32_t aExp; 5000 uint64_t aSig, zSig; 5001 5002 aSig = extractFloatx80Frac( a ); 5003 aExp = extractFloatx80Exp( a ); 5004 aSign = extractFloatx80Sign( a ); 5005 if ( aExp == 0x7FFF ) { 5006 if ( (uint64_t) ( aSig<<1 ) ) { 5007 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5008 } 5009 return packFloat64( aSign, 0x7FF, 0 ); 5010 } 5011 shift64RightJamming( aSig, 1, &zSig ); 5012 if ( aExp || aSig ) aExp -= 0x3C01; 5013 return roundAndPackFloat64(aSign, aExp, zSig, status); 5014 5015 } 5016 5017 /*---------------------------------------------------------------------------- 5018 | Returns the result of converting the extended double-precision floating- 5019 | point value `a' to the quadruple-precision floating-point format. 
The 5020 | conversion is performed according to the IEC/IEEE Standard for Binary 5021 | Floating-Point Arithmetic. 5022 *----------------------------------------------------------------------------*/ 5023 5024 float128 floatx80_to_float128(floatx80 a, float_status *status) 5025 { 5026 flag aSign; 5027 int aExp; 5028 uint64_t aSig, zSig0, zSig1; 5029 5030 aSig = extractFloatx80Frac( a ); 5031 aExp = extractFloatx80Exp( a ); 5032 aSign = extractFloatx80Sign( a ); 5033 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5034 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5035 } 5036 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5037 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5038 5039 } 5040 5041 /*---------------------------------------------------------------------------- 5042 | Rounds the extended double-precision floating-point value `a' to an integer, 5043 | and returns the result as an extended quadruple-precision floating-point 5044 | value. The operation is performed according to the IEC/IEEE Standard for 5045 | Binary Floating-Point Arithmetic. 
5046 *----------------------------------------------------------------------------*/ 5047 5048 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5049 { 5050 flag aSign; 5051 int32_t aExp; 5052 uint64_t lastBitMask, roundBitsMask; 5053 floatx80 z; 5054 5055 aExp = extractFloatx80Exp( a ); 5056 if ( 0x403E <= aExp ) { 5057 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5058 return propagateFloatx80NaN(a, a, status); 5059 } 5060 return a; 5061 } 5062 if ( aExp < 0x3FFF ) { 5063 if ( ( aExp == 0 ) 5064 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5065 return a; 5066 } 5067 status->float_exception_flags |= float_flag_inexact; 5068 aSign = extractFloatx80Sign( a ); 5069 switch (status->float_rounding_mode) { 5070 case float_round_nearest_even: 5071 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5072 ) { 5073 return 5074 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5075 } 5076 break; 5077 case float_round_ties_away: 5078 if (aExp == 0x3FFE) { 5079 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5080 } 5081 break; 5082 case float_round_down: 5083 return 5084 aSign ? 5085 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5086 : packFloatx80( 0, 0, 0 ); 5087 case float_round_up: 5088 return 5089 aSign ? 
packFloatx80( 1, 0, 0 ) 5090 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5091 } 5092 return packFloatx80( aSign, 0, 0 ); 5093 } 5094 lastBitMask = 1; 5095 lastBitMask <<= 0x403E - aExp; 5096 roundBitsMask = lastBitMask - 1; 5097 z = a; 5098 switch (status->float_rounding_mode) { 5099 case float_round_nearest_even: 5100 z.low += lastBitMask>>1; 5101 if ((z.low & roundBitsMask) == 0) { 5102 z.low &= ~lastBitMask; 5103 } 5104 break; 5105 case float_round_ties_away: 5106 z.low += lastBitMask >> 1; 5107 break; 5108 case float_round_to_zero: 5109 break; 5110 case float_round_up: 5111 if (!extractFloatx80Sign(z)) { 5112 z.low += roundBitsMask; 5113 } 5114 break; 5115 case float_round_down: 5116 if (extractFloatx80Sign(z)) { 5117 z.low += roundBitsMask; 5118 } 5119 break; 5120 default: 5121 abort(); 5122 } 5123 z.low &= ~ roundBitsMask; 5124 if ( z.low == 0 ) { 5125 ++z.high; 5126 z.low = LIT64( 0x8000000000000000 ); 5127 } 5128 if (z.low != a.low) { 5129 status->float_exception_flags |= float_flag_inexact; 5130 } 5131 return z; 5132 5133 } 5134 5135 /*---------------------------------------------------------------------------- 5136 | Returns the result of adding the absolute values of the extended double- 5137 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5138 | negated before being returned. `zSign' is ignored if the result is a NaN. 5139 | The addition is performed according to the IEC/IEEE Standard for Binary 5140 | Floating-Point Arithmetic. 
5141 *----------------------------------------------------------------------------*/ 5142 5143 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5144 float_status *status) 5145 { 5146 int32_t aExp, bExp, zExp; 5147 uint64_t aSig, bSig, zSig0, zSig1; 5148 int32_t expDiff; 5149 5150 aSig = extractFloatx80Frac( a ); 5151 aExp = extractFloatx80Exp( a ); 5152 bSig = extractFloatx80Frac( b ); 5153 bExp = extractFloatx80Exp( b ); 5154 expDiff = aExp - bExp; 5155 if ( 0 < expDiff ) { 5156 if ( aExp == 0x7FFF ) { 5157 if ((uint64_t)(aSig << 1)) { 5158 return propagateFloatx80NaN(a, b, status); 5159 } 5160 return a; 5161 } 5162 if ( bExp == 0 ) --expDiff; 5163 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5164 zExp = aExp; 5165 } 5166 else if ( expDiff < 0 ) { 5167 if ( bExp == 0x7FFF ) { 5168 if ((uint64_t)(bSig << 1)) { 5169 return propagateFloatx80NaN(a, b, status); 5170 } 5171 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5172 } 5173 if ( aExp == 0 ) ++expDiff; 5174 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5175 zExp = bExp; 5176 } 5177 else { 5178 if ( aExp == 0x7FFF ) { 5179 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5180 return propagateFloatx80NaN(a, b, status); 5181 } 5182 return a; 5183 } 5184 zSig1 = 0; 5185 zSig0 = aSig + bSig; 5186 if ( aExp == 0 ) { 5187 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5188 goto roundAndPack; 5189 } 5190 zExp = aExp; 5191 goto shiftRight1; 5192 } 5193 zSig0 = aSig + bSig; 5194 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5195 shiftRight1: 5196 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5197 zSig0 |= LIT64( 0x8000000000000000 ); 5198 ++zExp; 5199 roundAndPack: 5200 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5201 zSign, zExp, zSig0, zSig1, status); 5202 } 5203 5204 /*---------------------------------------------------------------------------- 5205 | Returns the result of subtracting the absolute values of the 
extended 5206 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5207 | difference is negated before being returned. `zSign' is ignored if the 5208 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5209 | Standard for Binary Floating-Point Arithmetic. 5210 *----------------------------------------------------------------------------*/ 5211 5212 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5213 float_status *status) 5214 { 5215 int32_t aExp, bExp, zExp; 5216 uint64_t aSig, bSig, zSig0, zSig1; 5217 int32_t expDiff; 5218 5219 aSig = extractFloatx80Frac( a ); 5220 aExp = extractFloatx80Exp( a ); 5221 bSig = extractFloatx80Frac( b ); 5222 bExp = extractFloatx80Exp( b ); 5223 expDiff = aExp - bExp; 5224 if ( 0 < expDiff ) goto aExpBigger; 5225 if ( expDiff < 0 ) goto bExpBigger; 5226 if ( aExp == 0x7FFF ) { 5227 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5228 return propagateFloatx80NaN(a, b, status); 5229 } 5230 float_raise(float_flag_invalid, status); 5231 return floatx80_default_nan(status); 5232 } 5233 if ( aExp == 0 ) { 5234 aExp = 1; 5235 bExp = 1; 5236 } 5237 zSig1 = 0; 5238 if ( bSig < aSig ) goto aBigger; 5239 if ( aSig < bSig ) goto bBigger; 5240 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5241 bExpBigger: 5242 if ( bExp == 0x7FFF ) { 5243 if ((uint64_t)(bSig << 1)) { 5244 return propagateFloatx80NaN(a, b, status); 5245 } 5246 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5247 } 5248 if ( aExp == 0 ) ++expDiff; 5249 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5250 bBigger: 5251 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5252 zExp = bExp; 5253 zSign ^= 1; 5254 goto normalizeRoundAndPack; 5255 aExpBigger: 5256 if ( aExp == 0x7FFF ) { 5257 if ((uint64_t)(aSig << 1)) { 5258 return propagateFloatx80NaN(a, b, status); 5259 } 5260 return a; 5261 } 5262 if ( bExp == 0 ) --expDiff; 5263 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5264 aBigger: 5265 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5266 zExp = aExp; 5267 normalizeRoundAndPack: 5268 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5269 zSign, zExp, zSig0, zSig1, status); 5270 } 5271 5272 /*---------------------------------------------------------------------------- 5273 | Returns the result of adding the extended double-precision floating-point 5274 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5275 | Standard for Binary Floating-Point Arithmetic. 5276 *----------------------------------------------------------------------------*/ 5277 5278 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5279 { 5280 flag aSign, bSign; 5281 5282 aSign = extractFloatx80Sign( a ); 5283 bSign = extractFloatx80Sign( b ); 5284 if ( aSign == bSign ) { 5285 return addFloatx80Sigs(a, b, aSign, status); 5286 } 5287 else { 5288 return subFloatx80Sigs(a, b, aSign, status); 5289 } 5290 5291 } 5292 5293 /*---------------------------------------------------------------------------- 5294 | Returns the result of subtracting the extended double-precision floating- 5295 | point values `a' and `b'. The operation is performed according to the 5296 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5297 *----------------------------------------------------------------------------*/ 5298 5299 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5300 { 5301 flag aSign, bSign; 5302 5303 aSign = extractFloatx80Sign( a ); 5304 bSign = extractFloatx80Sign( b ); 5305 if ( aSign == bSign ) { 5306 return subFloatx80Sigs(a, b, aSign, status); 5307 } 5308 else { 5309 return addFloatx80Sigs(a, b, aSign, status); 5310 } 5311 5312 } 5313 5314 /*---------------------------------------------------------------------------- 5315 | Returns the result of multiplying the extended double-precision floating- 5316 | point values `a' and `b'. 
The operation is performed according to the 5317 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5318 *----------------------------------------------------------------------------*/ 5319 5320 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5321 { 5322 flag aSign, bSign, zSign; 5323 int32_t aExp, bExp, zExp; 5324 uint64_t aSig, bSig, zSig0, zSig1; 5325 5326 aSig = extractFloatx80Frac( a ); 5327 aExp = extractFloatx80Exp( a ); 5328 aSign = extractFloatx80Sign( a ); 5329 bSig = extractFloatx80Frac( b ); 5330 bExp = extractFloatx80Exp( b ); 5331 bSign = extractFloatx80Sign( b ); 5332 zSign = aSign ^ bSign; 5333 if ( aExp == 0x7FFF ) { 5334 if ( (uint64_t) ( aSig<<1 ) 5335 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5336 return propagateFloatx80NaN(a, b, status); 5337 } 5338 if ( ( bExp | bSig ) == 0 ) goto invalid; 5339 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5340 } 5341 if ( bExp == 0x7FFF ) { 5342 if ((uint64_t)(bSig << 1)) { 5343 return propagateFloatx80NaN(a, b, status); 5344 } 5345 if ( ( aExp | aSig ) == 0 ) { 5346 invalid: 5347 float_raise(float_flag_invalid, status); 5348 return floatx80_default_nan(status); 5349 } 5350 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5351 } 5352 if ( aExp == 0 ) { 5353 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5354 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5355 } 5356 if ( bExp == 0 ) { 5357 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5358 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5359 } 5360 zExp = aExp + bExp - 0x3FFE; 5361 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5362 if ( 0 < (int64_t) zSig0 ) { 5363 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5364 --zExp; 5365 } 5366 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5367 zSign, zExp, zSig0, zSig1, status); 5368 } 5369 5370 /*---------------------------------------------------------------------------- 5371 | Returns the result 
of dividing the extended double-precision floating-point 5372 | value `a' by the corresponding value `b'. The operation is performed 5373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5374 *----------------------------------------------------------------------------*/ 5375 5376 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5377 { 5378 flag aSign, bSign, zSign; 5379 int32_t aExp, bExp, zExp; 5380 uint64_t aSig, bSig, zSig0, zSig1; 5381 uint64_t rem0, rem1, rem2, term0, term1, term2; 5382 5383 aSig = extractFloatx80Frac( a ); 5384 aExp = extractFloatx80Exp( a ); 5385 aSign = extractFloatx80Sign( a ); 5386 bSig = extractFloatx80Frac( b ); 5387 bExp = extractFloatx80Exp( b ); 5388 bSign = extractFloatx80Sign( b ); 5389 zSign = aSign ^ bSign; 5390 if ( aExp == 0x7FFF ) { 5391 if ((uint64_t)(aSig << 1)) { 5392 return propagateFloatx80NaN(a, b, status); 5393 } 5394 if ( bExp == 0x7FFF ) { 5395 if ((uint64_t)(bSig << 1)) { 5396 return propagateFloatx80NaN(a, b, status); 5397 } 5398 goto invalid; 5399 } 5400 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5401 } 5402 if ( bExp == 0x7FFF ) { 5403 if ((uint64_t)(bSig << 1)) { 5404 return propagateFloatx80NaN(a, b, status); 5405 } 5406 return packFloatx80( zSign, 0, 0 ); 5407 } 5408 if ( bExp == 0 ) { 5409 if ( bSig == 0 ) { 5410 if ( ( aExp | aSig ) == 0 ) { 5411 invalid: 5412 float_raise(float_flag_invalid, status); 5413 return floatx80_default_nan(status); 5414 } 5415 float_raise(float_flag_divbyzero, status); 5416 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5417 } 5418 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5419 } 5420 if ( aExp == 0 ) { 5421 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5422 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5423 } 5424 zExp = aExp - bExp + 0x3FFE; 5425 rem1 = 0; 5426 if ( bSig <= aSig ) { 5427 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5428 ++zExp; 5429 } 5430 zSig0 = 
estimateDiv128To64( aSig, rem1, bSig ); 5431 mul64To128( bSig, zSig0, &term0, &term1 ); 5432 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5433 while ( (int64_t) rem0 < 0 ) { 5434 --zSig0; 5435 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5436 } 5437 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5438 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5439 mul64To128( bSig, zSig1, &term1, &term2 ); 5440 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5441 while ( (int64_t) rem1 < 0 ) { 5442 --zSig1; 5443 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5444 } 5445 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5446 } 5447 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5448 zSign, zExp, zSig0, zSig1, status); 5449 } 5450 5451 /*---------------------------------------------------------------------------- 5452 | Returns the remainder of the extended double-precision floating-point value 5453 | `a' with respect to the corresponding value `b'. The operation is performed 5454 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5455 *----------------------------------------------------------------------------*/ 5456 5457 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5458 { 5459 flag aSign, zSign; 5460 int32_t aExp, bExp, expDiff; 5461 uint64_t aSig0, aSig1, bSig; 5462 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5463 5464 aSig0 = extractFloatx80Frac( a ); 5465 aExp = extractFloatx80Exp( a ); 5466 aSign = extractFloatx80Sign( a ); 5467 bSig = extractFloatx80Frac( b ); 5468 bExp = extractFloatx80Exp( b ); 5469 if ( aExp == 0x7FFF ) { 5470 if ( (uint64_t) ( aSig0<<1 ) 5471 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5472 return propagateFloatx80NaN(a, b, status); 5473 } 5474 goto invalid; 5475 } 5476 if ( bExp == 0x7FFF ) { 5477 if ((uint64_t)(bSig << 1)) { 5478 return propagateFloatx80NaN(a, b, status); 5479 } 5480 return a; 5481 } 5482 if ( bExp == 0 ) { 5483 if ( bSig == 0 ) { 5484 invalid: 5485 float_raise(float_flag_invalid, status); 5486 return floatx80_default_nan(status); 5487 } 5488 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5489 } 5490 if ( aExp == 0 ) { 5491 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5492 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5493 } 5494 bSig |= LIT64( 0x8000000000000000 ); 5495 zSign = aSign; 5496 expDiff = aExp - bExp; 5497 aSig1 = 0; 5498 if ( expDiff < 0 ) { 5499 if ( expDiff < -1 ) return a; 5500 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5501 expDiff = 0; 5502 } 5503 q = ( bSig <= aSig0 ); 5504 if ( q ) aSig0 -= bSig; 5505 expDiff -= 64; 5506 while ( 0 < expDiff ) { 5507 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5508 q = ( 2 < q ) ? q - 2 : 0; 5509 mul64To128( bSig, q, &term0, &term1 ); 5510 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5511 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5512 expDiff -= 62; 5513 } 5514 expDiff += 64; 5515 if ( 0 < expDiff ) { 5516 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5517 q = ( 2 < q ) ? 
q - 2 : 0; 5518 q >>= 64 - expDiff; 5519 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5520 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5521 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5522 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5523 ++q; 5524 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5525 } 5526 } 5527 else { 5528 term1 = 0; 5529 term0 = bSig; 5530 } 5531 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5532 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5533 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5534 && ( q & 1 ) ) 5535 ) { 5536 aSig0 = alternateASig0; 5537 aSig1 = alternateASig1; 5538 zSign = ! zSign; 5539 } 5540 return 5541 normalizeRoundAndPackFloatx80( 5542 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5543 5544 } 5545 5546 /*---------------------------------------------------------------------------- 5547 | Returns the square root of the extended double-precision floating-point 5548 | value `a'. The operation is performed according to the IEC/IEEE Standard 5549 | for Binary Floating-Point Arithmetic. 5550 *----------------------------------------------------------------------------*/ 5551 5552 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5553 { 5554 flag aSign; 5555 int32_t aExp, zExp; 5556 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5557 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5558 5559 aSig0 = extractFloatx80Frac( a ); 5560 aExp = extractFloatx80Exp( a ); 5561 aSign = extractFloatx80Sign( a ); 5562 if ( aExp == 0x7FFF ) { 5563 if ((uint64_t)(aSig0 << 1)) { 5564 return propagateFloatx80NaN(a, a, status); 5565 } 5566 if ( ! 
aSign ) return a; 5567 goto invalid; 5568 } 5569 if ( aSign ) { 5570 if ( ( aExp | aSig0 ) == 0 ) return a; 5571 invalid: 5572 float_raise(float_flag_invalid, status); 5573 return floatx80_default_nan(status); 5574 } 5575 if ( aExp == 0 ) { 5576 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5577 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5578 } 5579 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5580 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5581 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5582 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5583 doubleZSig0 = zSig0<<1; 5584 mul64To128( zSig0, zSig0, &term0, &term1 ); 5585 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5586 while ( (int64_t) rem0 < 0 ) { 5587 --zSig0; 5588 doubleZSig0 -= 2; 5589 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5590 } 5591 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5592 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5593 if ( zSig1 == 0 ) zSig1 = 1; 5594 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5595 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5596 mul64To128( zSig1, zSig1, &term2, &term3 ); 5597 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5598 while ( (int64_t) rem1 < 0 ) { 5599 --zSig1; 5600 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5601 term3 |= 1; 5602 term2 |= doubleZSig0; 5603 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5604 } 5605 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5606 } 5607 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5608 zSig0 |= doubleZSig0; 5609 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5610 0, zExp, zSig0, zSig1, status); 5611 } 5612 5613 /*---------------------------------------------------------------------------- 5614 | Returns 1 if the extended double-precision floating-point value `a' is equal 5615 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is 5616 | raised if either operand is a NaN. Otherwise, the comparison is performed 5617 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5618 *----------------------------------------------------------------------------*/ 5619 5620 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5621 { 5622 5623 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5624 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5625 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5626 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5627 ) { 5628 float_raise(float_flag_invalid, status); 5629 return 0; 5630 } 5631 return 5632 ( a.low == b.low ) 5633 && ( ( a.high == b.high ) 5634 || ( ( a.low == 0 ) 5635 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5636 ); 5637 5638 } 5639 5640 /*---------------------------------------------------------------------------- 5641 | Returns 1 if the extended double-precision floating-point value `a' is 5642 | less than or equal to the corresponding value `b', and 0 otherwise. The 5643 | invalid exception is raised if either operand is a NaN. The comparison is 5644 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5645 | Arithmetic. 5646 *----------------------------------------------------------------------------*/ 5647 5648 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5649 { 5650 flag aSign, bSign; 5651 5652 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5653 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5654 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5655 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5656 ) { 5657 float_raise(float_flag_invalid, status); 5658 return 0; 5659 } 5660 aSign = extractFloatx80Sign( a ); 5661 bSign = extractFloatx80Sign( b ); 5662 if ( aSign != bSign ) { 5663 return 5664 aSign 5665 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5666 == 0 ); 5667 } 5668 return 5669 aSign ? 
le128( b.high, b.low, a.high, a.low ) 5670 : le128( a.high, a.low, b.high, b.low ); 5671 5672 } 5673 5674 /*---------------------------------------------------------------------------- 5675 | Returns 1 if the extended double-precision floating-point value `a' is 5676 | less than the corresponding value `b', and 0 otherwise. The invalid 5677 | exception is raised if either operand is a NaN. The comparison is performed 5678 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5679 *----------------------------------------------------------------------------*/ 5680 5681 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5682 { 5683 flag aSign, bSign; 5684 5685 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5686 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5687 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5688 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5689 ) { 5690 float_raise(float_flag_invalid, status); 5691 return 0; 5692 } 5693 aSign = extractFloatx80Sign( a ); 5694 bSign = extractFloatx80Sign( b ); 5695 if ( aSign != bSign ) { 5696 return 5697 aSign 5698 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5699 != 0 ); 5700 } 5701 return 5702 aSign ? lt128( b.high, b.low, a.high, a.low ) 5703 : lt128( a.high, a.low, b.high, b.low ); 5704 5705 } 5706 5707 /*---------------------------------------------------------------------------- 5708 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5709 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5710 | either operand is a NaN. The comparison is performed according to the 5711 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5712 *----------------------------------------------------------------------------*/ 5713 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5714 { 5715 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5716 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5717 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5718 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5719 ) { 5720 float_raise(float_flag_invalid, status); 5721 return 1; 5722 } 5723 return 0; 5724 } 5725 5726 /*---------------------------------------------------------------------------- 5727 | Returns 1 if the extended double-precision floating-point value `a' is 5728 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5729 | cause an exception. The comparison is performed according to the IEC/IEEE 5730 | Standard for Binary Floating-Point Arithmetic. 5731 *----------------------------------------------------------------------------*/ 5732 5733 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5734 { 5735 5736 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5737 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5738 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5739 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5740 ) { 5741 if (floatx80_is_signaling_nan(a, status) 5742 || floatx80_is_signaling_nan(b, status)) { 5743 float_raise(float_flag_invalid, status); 5744 } 5745 return 0; 5746 } 5747 return 5748 ( a.low == b.low ) 5749 && ( ( a.high == b.high ) 5750 || ( ( a.low == 0 ) 5751 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5752 ); 5753 5754 } 5755 5756 /*---------------------------------------------------------------------------- 5757 | Returns 1 if the extended double-precision floating-point value `a' is less 5758 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5759 | do not cause an exception. Otherwise, the comparison is performed according 5760 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5761 *----------------------------------------------------------------------------*/ 5762 5763 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5764 { 5765 flag aSign, bSign; 5766 5767 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5768 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5769 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5770 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5771 ) { 5772 if (floatx80_is_signaling_nan(a, status) 5773 || floatx80_is_signaling_nan(b, status)) { 5774 float_raise(float_flag_invalid, status); 5775 } 5776 return 0; 5777 } 5778 aSign = extractFloatx80Sign( a ); 5779 bSign = extractFloatx80Sign( b ); 5780 if ( aSign != bSign ) { 5781 return 5782 aSign 5783 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5784 == 0 ); 5785 } 5786 return 5787 aSign ? le128( b.high, b.low, a.high, a.low ) 5788 : le128( a.high, a.low, b.high, b.low ); 5789 5790 } 5791 5792 /*---------------------------------------------------------------------------- 5793 | Returns 1 if the extended double-precision floating-point value `a' is less 5794 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5795 | an exception. Otherwise, the comparison is performed according to the 5796 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5797 *----------------------------------------------------------------------------*/ 5798 5799 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5800 { 5801 flag aSign, bSign; 5802 5803 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5804 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5805 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5806 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5807 ) { 5808 if (floatx80_is_signaling_nan(a, status) 5809 || floatx80_is_signaling_nan(b, status)) { 5810 float_raise(float_flag_invalid, status); 5811 } 5812 return 0; 5813 } 5814 aSign = extractFloatx80Sign( a ); 5815 bSign = extractFloatx80Sign( b ); 5816 if ( aSign != bSign ) { 5817 return 5818 aSign 5819 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5820 != 0 ); 5821 } 5822 return 5823 aSign ? lt128( b.high, b.low, a.high, a.low ) 5824 : lt128( a.high, a.low, b.high, b.low ); 5825 5826 } 5827 5828 /*---------------------------------------------------------------------------- 5829 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5830 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5831 | The comparison is performed according to the IEC/IEEE Standard for Binary 5832 | Floating-Point Arithmetic. 
5833 *----------------------------------------------------------------------------*/ 5834 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5835 { 5836 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5837 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5838 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5839 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5840 ) { 5841 if (floatx80_is_signaling_nan(a, status) 5842 || floatx80_is_signaling_nan(b, status)) { 5843 float_raise(float_flag_invalid, status); 5844 } 5845 return 1; 5846 } 5847 return 0; 5848 } 5849 5850 /*---------------------------------------------------------------------------- 5851 | Returns the result of converting the quadruple-precision floating-point 5852 | value `a' to the 32-bit two's complement integer format. The conversion 5853 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5854 | Arithmetic---which means in particular that the conversion is rounded 5855 | according to the current rounding mode. If `a' is a NaN, the largest 5856 | positive integer is returned. Otherwise, if the conversion overflows, the 5857 | largest integer with the same sign as `a' is returned. 
5858 *----------------------------------------------------------------------------*/ 5859 5860 int32_t float128_to_int32(float128 a, float_status *status) 5861 { 5862 flag aSign; 5863 int32_t aExp, shiftCount; 5864 uint64_t aSig0, aSig1; 5865 5866 aSig1 = extractFloat128Frac1( a ); 5867 aSig0 = extractFloat128Frac0( a ); 5868 aExp = extractFloat128Exp( a ); 5869 aSign = extractFloat128Sign( a ); 5870 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5871 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5872 aSig0 |= ( aSig1 != 0 ); 5873 shiftCount = 0x4028 - aExp; 5874 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5875 return roundAndPackInt32(aSign, aSig0, status); 5876 5877 } 5878 5879 /*---------------------------------------------------------------------------- 5880 | Returns the result of converting the quadruple-precision floating-point 5881 | value `a' to the 32-bit two's complement integer format. The conversion 5882 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5883 | Arithmetic, except that the conversion is always rounded toward zero. If 5884 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5885 | conversion overflows, the largest integer with the same sign as `a' is 5886 | returned. 
5887 *----------------------------------------------------------------------------*/ 5888 5889 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5890 { 5891 flag aSign; 5892 int32_t aExp, shiftCount; 5893 uint64_t aSig0, aSig1, savedASig; 5894 int32_t z; 5895 5896 aSig1 = extractFloat128Frac1( a ); 5897 aSig0 = extractFloat128Frac0( a ); 5898 aExp = extractFloat128Exp( a ); 5899 aSign = extractFloat128Sign( a ); 5900 aSig0 |= ( aSig1 != 0 ); 5901 if ( 0x401E < aExp ) { 5902 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5903 goto invalid; 5904 } 5905 else if ( aExp < 0x3FFF ) { 5906 if (aExp || aSig0) { 5907 status->float_exception_flags |= float_flag_inexact; 5908 } 5909 return 0; 5910 } 5911 aSig0 |= LIT64( 0x0001000000000000 ); 5912 shiftCount = 0x402F - aExp; 5913 savedASig = aSig0; 5914 aSig0 >>= shiftCount; 5915 z = aSig0; 5916 if ( aSign ) z = - z; 5917 if ( ( z < 0 ) ^ aSign ) { 5918 invalid: 5919 float_raise(float_flag_invalid, status); 5920 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5921 } 5922 if ( ( aSig0<<shiftCount ) != savedASig ) { 5923 status->float_exception_flags |= float_flag_inexact; 5924 } 5925 return z; 5926 5927 } 5928 5929 /*---------------------------------------------------------------------------- 5930 | Returns the result of converting the quadruple-precision floating-point 5931 | value `a' to the 64-bit two's complement integer format. The conversion 5932 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5933 | Arithmetic---which means in particular that the conversion is rounded 5934 | according to the current rounding mode. If `a' is a NaN, the largest 5935 | positive integer is returned. Otherwise, if the conversion overflows, the 5936 | largest integer with the same sign as `a' is returned. 
5937 *----------------------------------------------------------------------------*/ 5938 5939 int64_t float128_to_int64(float128 a, float_status *status) 5940 { 5941 flag aSign; 5942 int32_t aExp, shiftCount; 5943 uint64_t aSig0, aSig1; 5944 5945 aSig1 = extractFloat128Frac1( a ); 5946 aSig0 = extractFloat128Frac0( a ); 5947 aExp = extractFloat128Exp( a ); 5948 aSign = extractFloat128Sign( a ); 5949 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5950 shiftCount = 0x402F - aExp; 5951 if ( shiftCount <= 0 ) { 5952 if ( 0x403E < aExp ) { 5953 float_raise(float_flag_invalid, status); 5954 if ( ! aSign 5955 || ( ( aExp == 0x7FFF ) 5956 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5957 ) 5958 ) { 5959 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5960 } 5961 return (int64_t) LIT64( 0x8000000000000000 ); 5962 } 5963 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5964 } 5965 else { 5966 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5967 } 5968 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5969 5970 } 5971 5972 /*---------------------------------------------------------------------------- 5973 | Returns the result of converting the quadruple-precision floating-point 5974 | value `a' to the 64-bit two's complement integer format. The conversion 5975 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5976 | Arithmetic, except that the conversion is always rounded toward zero. 5977 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5978 | the conversion overflows, the largest integer with the same sign as `a' is 5979 | returned. 
5980 *----------------------------------------------------------------------------*/ 5981 5982 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5983 { 5984 flag aSign; 5985 int32_t aExp, shiftCount; 5986 uint64_t aSig0, aSig1; 5987 int64_t z; 5988 5989 aSig1 = extractFloat128Frac1( a ); 5990 aSig0 = extractFloat128Frac0( a ); 5991 aExp = extractFloat128Exp( a ); 5992 aSign = extractFloat128Sign( a ); 5993 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5994 shiftCount = aExp - 0x402F; 5995 if ( 0 < shiftCount ) { 5996 if ( 0x403E <= aExp ) { 5997 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5998 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5999 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6000 if (aSig1) { 6001 status->float_exception_flags |= float_flag_inexact; 6002 } 6003 } 6004 else { 6005 float_raise(float_flag_invalid, status); 6006 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6007 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6008 } 6009 } 6010 return (int64_t) LIT64( 0x8000000000000000 ); 6011 } 6012 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6013 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6014 status->float_exception_flags |= float_flag_inexact; 6015 } 6016 } 6017 else { 6018 if ( aExp < 0x3FFF ) { 6019 if ( aExp | aSig0 | aSig1 ) { 6020 status->float_exception_flags |= float_flag_inexact; 6021 } 6022 return 0; 6023 } 6024 z = aSig0>>( - shiftCount ); 6025 if ( aSig1 6026 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6027 status->float_exception_flags |= float_flag_inexact; 6028 } 6029 } 6030 if ( aSign ) z = - z; 6031 return z; 6032 6033 } 6034 6035 /*---------------------------------------------------------------------------- 6036 | Returns the result of converting the quadruple-precision floating-point 6037 | value `a' to the single-precision floating-point format. The conversion 6038 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6039 | Arithmetic. 
6040 *----------------------------------------------------------------------------*/ 6041 6042 float32 float128_to_float32(float128 a, float_status *status) 6043 { 6044 flag aSign; 6045 int32_t aExp; 6046 uint64_t aSig0, aSig1; 6047 uint32_t zSig; 6048 6049 aSig1 = extractFloat128Frac1( a ); 6050 aSig0 = extractFloat128Frac0( a ); 6051 aExp = extractFloat128Exp( a ); 6052 aSign = extractFloat128Sign( a ); 6053 if ( aExp == 0x7FFF ) { 6054 if ( aSig0 | aSig1 ) { 6055 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6056 } 6057 return packFloat32( aSign, 0xFF, 0 ); 6058 } 6059 aSig0 |= ( aSig1 != 0 ); 6060 shift64RightJamming( aSig0, 18, &aSig0 ); 6061 zSig = aSig0; 6062 if ( aExp || zSig ) { 6063 zSig |= 0x40000000; 6064 aExp -= 0x3F81; 6065 } 6066 return roundAndPackFloat32(aSign, aExp, zSig, status); 6067 6068 } 6069 6070 /*---------------------------------------------------------------------------- 6071 | Returns the result of converting the quadruple-precision floating-point 6072 | value `a' to the double-precision floating-point format. The conversion 6073 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6074 | Arithmetic. 
6075 *----------------------------------------------------------------------------*/ 6076 6077 float64 float128_to_float64(float128 a, float_status *status) 6078 { 6079 flag aSign; 6080 int32_t aExp; 6081 uint64_t aSig0, aSig1; 6082 6083 aSig1 = extractFloat128Frac1( a ); 6084 aSig0 = extractFloat128Frac0( a ); 6085 aExp = extractFloat128Exp( a ); 6086 aSign = extractFloat128Sign( a ); 6087 if ( aExp == 0x7FFF ) { 6088 if ( aSig0 | aSig1 ) { 6089 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6090 } 6091 return packFloat64( aSign, 0x7FF, 0 ); 6092 } 6093 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6094 aSig0 |= ( aSig1 != 0 ); 6095 if ( aExp || aSig0 ) { 6096 aSig0 |= LIT64( 0x4000000000000000 ); 6097 aExp -= 0x3C01; 6098 } 6099 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6100 6101 } 6102 6103 /*---------------------------------------------------------------------------- 6104 | Returns the result of converting the quadruple-precision floating-point 6105 | value `a' to the extended double-precision floating-point format. The 6106 | conversion is performed according to the IEC/IEEE Standard for Binary 6107 | Floating-Point Arithmetic. 
6108 *----------------------------------------------------------------------------*/ 6109 6110 floatx80 float128_to_floatx80(float128 a, float_status *status) 6111 { 6112 flag aSign; 6113 int32_t aExp; 6114 uint64_t aSig0, aSig1; 6115 6116 aSig1 = extractFloat128Frac1( a ); 6117 aSig0 = extractFloat128Frac0( a ); 6118 aExp = extractFloat128Exp( a ); 6119 aSign = extractFloat128Sign( a ); 6120 if ( aExp == 0x7FFF ) { 6121 if ( aSig0 | aSig1 ) { 6122 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6123 } 6124 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6125 } 6126 if ( aExp == 0 ) { 6127 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6128 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6129 } 6130 else { 6131 aSig0 |= LIT64( 0x0001000000000000 ); 6132 } 6133 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6134 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6135 6136 } 6137 6138 /*---------------------------------------------------------------------------- 6139 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6140 | returns the result as a quadruple-precision floating-point value. The 6141 | operation is performed according to the IEC/IEEE Standard for Binary 6142 | Floating-Point Arithmetic. 
6143 *----------------------------------------------------------------------------*/ 6144 6145 float128 float128_round_to_int(float128 a, float_status *status) 6146 { 6147 flag aSign; 6148 int32_t aExp; 6149 uint64_t lastBitMask, roundBitsMask; 6150 float128 z; 6151 6152 aExp = extractFloat128Exp( a ); 6153 if ( 0x402F <= aExp ) { 6154 if ( 0x406F <= aExp ) { 6155 if ( ( aExp == 0x7FFF ) 6156 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6157 ) { 6158 return propagateFloat128NaN(a, a, status); 6159 } 6160 return a; 6161 } 6162 lastBitMask = 1; 6163 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6164 roundBitsMask = lastBitMask - 1; 6165 z = a; 6166 switch (status->float_rounding_mode) { 6167 case float_round_nearest_even: 6168 if ( lastBitMask ) { 6169 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6170 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6171 } 6172 else { 6173 if ( (int64_t) z.low < 0 ) { 6174 ++z.high; 6175 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6176 } 6177 } 6178 break; 6179 case float_round_ties_away: 6180 if (lastBitMask) { 6181 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6182 } else { 6183 if ((int64_t) z.low < 0) { 6184 ++z.high; 6185 } 6186 } 6187 break; 6188 case float_round_to_zero: 6189 break; 6190 case float_round_up: 6191 if (!extractFloat128Sign(z)) { 6192 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6193 } 6194 break; 6195 case float_round_down: 6196 if (extractFloat128Sign(z)) { 6197 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6198 } 6199 break; 6200 default: 6201 abort(); 6202 } 6203 z.low &= ~ roundBitsMask; 6204 } 6205 else { 6206 if ( aExp < 0x3FFF ) { 6207 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6208 status->float_exception_flags |= float_flag_inexact; 6209 aSign = extractFloat128Sign( a ); 6210 switch (status->float_rounding_mode) { 6211 case float_round_nearest_even: 6212 if ( ( aExp == 0x3FFE ) 6213 && ( 
extractFloat128Frac0( a ) 6214 | extractFloat128Frac1( a ) ) 6215 ) { 6216 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6217 } 6218 break; 6219 case float_round_ties_away: 6220 if (aExp == 0x3FFE) { 6221 return packFloat128(aSign, 0x3FFF, 0, 0); 6222 } 6223 break; 6224 case float_round_down: 6225 return 6226 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6227 : packFloat128( 0, 0, 0, 0 ); 6228 case float_round_up: 6229 return 6230 aSign ? packFloat128( 1, 0, 0, 0 ) 6231 : packFloat128( 0, 0x3FFF, 0, 0 ); 6232 } 6233 return packFloat128( aSign, 0, 0, 0 ); 6234 } 6235 lastBitMask = 1; 6236 lastBitMask <<= 0x402F - aExp; 6237 roundBitsMask = lastBitMask - 1; 6238 z.low = 0; 6239 z.high = a.high; 6240 switch (status->float_rounding_mode) { 6241 case float_round_nearest_even: 6242 z.high += lastBitMask>>1; 6243 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6244 z.high &= ~ lastBitMask; 6245 } 6246 break; 6247 case float_round_ties_away: 6248 z.high += lastBitMask>>1; 6249 break; 6250 case float_round_to_zero: 6251 break; 6252 case float_round_up: 6253 if (!extractFloat128Sign(z)) { 6254 z.high |= ( a.low != 0 ); 6255 z.high += roundBitsMask; 6256 } 6257 break; 6258 case float_round_down: 6259 if (extractFloat128Sign(z)) { 6260 z.high |= (a.low != 0); 6261 z.high += roundBitsMask; 6262 } 6263 break; 6264 default: 6265 abort(); 6266 } 6267 z.high &= ~ roundBitsMask; 6268 } 6269 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6270 status->float_exception_flags |= float_flag_inexact; 6271 } 6272 return z; 6273 6274 } 6275 6276 /*---------------------------------------------------------------------------- 6277 | Returns the result of adding the absolute values of the quadruple-precision 6278 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6279 | before being returned. `zSign' is ignored if the result is a NaN. 6280 | The addition is performed according to the IEC/IEEE Standard for Binary 6281 | Floating-Point Arithmetic. 
6282 *----------------------------------------------------------------------------*/ 6283 6284 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6285 float_status *status) 6286 { 6287 int32_t aExp, bExp, zExp; 6288 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6289 int32_t expDiff; 6290 6291 aSig1 = extractFloat128Frac1( a ); 6292 aSig0 = extractFloat128Frac0( a ); 6293 aExp = extractFloat128Exp( a ); 6294 bSig1 = extractFloat128Frac1( b ); 6295 bSig0 = extractFloat128Frac0( b ); 6296 bExp = extractFloat128Exp( b ); 6297 expDiff = aExp - bExp; 6298 if ( 0 < expDiff ) { 6299 if ( aExp == 0x7FFF ) { 6300 if (aSig0 | aSig1) { 6301 return propagateFloat128NaN(a, b, status); 6302 } 6303 return a; 6304 } 6305 if ( bExp == 0 ) { 6306 --expDiff; 6307 } 6308 else { 6309 bSig0 |= LIT64( 0x0001000000000000 ); 6310 } 6311 shift128ExtraRightJamming( 6312 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6313 zExp = aExp; 6314 } 6315 else if ( expDiff < 0 ) { 6316 if ( bExp == 0x7FFF ) { 6317 if (bSig0 | bSig1) { 6318 return propagateFloat128NaN(a, b, status); 6319 } 6320 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6321 } 6322 if ( aExp == 0 ) { 6323 ++expDiff; 6324 } 6325 else { 6326 aSig0 |= LIT64( 0x0001000000000000 ); 6327 } 6328 shift128ExtraRightJamming( 6329 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6330 zExp = bExp; 6331 } 6332 else { 6333 if ( aExp == 0x7FFF ) { 6334 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6335 return propagateFloat128NaN(a, b, status); 6336 } 6337 return a; 6338 } 6339 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6340 if ( aExp == 0 ) { 6341 if (status->flush_to_zero) { 6342 if (zSig0 | zSig1) { 6343 float_raise(float_flag_output_denormal, status); 6344 } 6345 return packFloat128(zSign, 0, 0, 0); 6346 } 6347 return packFloat128( zSign, 0, zSig0, zSig1 ); 6348 } 6349 zSig2 = 0; 6350 zSig0 |= LIT64( 0x0002000000000000 ); 6351 zExp = aExp; 6352 goto shiftRight1; 6353 } 6354 aSig0 |= LIT64( 
0x0001000000000000 ); 6355 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6356 --zExp; 6357 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6358 ++zExp; 6359 shiftRight1: 6360 shift128ExtraRightJamming( 6361 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6362 roundAndPack: 6363 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6364 6365 } 6366 6367 /*---------------------------------------------------------------------------- 6368 | Returns the result of subtracting the absolute values of the quadruple- 6369 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6370 | difference is negated before being returned. `zSign' is ignored if the 6371 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6372 | Standard for Binary Floating-Point Arithmetic. 6373 *----------------------------------------------------------------------------*/ 6374 6375 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6376 float_status *status) 6377 { 6378 int32_t aExp, bExp, zExp; 6379 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6380 int32_t expDiff; 6381 6382 aSig1 = extractFloat128Frac1( a ); 6383 aSig0 = extractFloat128Frac0( a ); 6384 aExp = extractFloat128Exp( a ); 6385 bSig1 = extractFloat128Frac1( b ); 6386 bSig0 = extractFloat128Frac0( b ); 6387 bExp = extractFloat128Exp( b ); 6388 expDiff = aExp - bExp; 6389 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6390 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6391 if ( 0 < expDiff ) goto aExpBigger; 6392 if ( expDiff < 0 ) goto bExpBigger; 6393 if ( aExp == 0x7FFF ) { 6394 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6395 return propagateFloat128NaN(a, b, status); 6396 } 6397 float_raise(float_flag_invalid, status); 6398 return float128_default_nan(status); 6399 } 6400 if ( aExp == 0 ) { 6401 aExp = 1; 6402 bExp = 1; 6403 } 6404 if ( bSig0 < aSig0 ) goto aBigger; 6405 if ( aSig0 < bSig0 ) goto bBigger; 6406 if ( bSig1 < 
aSig1 ) goto aBigger; 6407 if ( aSig1 < bSig1 ) goto bBigger; 6408 return packFloat128(status->float_rounding_mode == float_round_down, 6409 0, 0, 0); 6410 bExpBigger: 6411 if ( bExp == 0x7FFF ) { 6412 if (bSig0 | bSig1) { 6413 return propagateFloat128NaN(a, b, status); 6414 } 6415 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6416 } 6417 if ( aExp == 0 ) { 6418 ++expDiff; 6419 } 6420 else { 6421 aSig0 |= LIT64( 0x4000000000000000 ); 6422 } 6423 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6424 bSig0 |= LIT64( 0x4000000000000000 ); 6425 bBigger: 6426 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6427 zExp = bExp; 6428 zSign ^= 1; 6429 goto normalizeRoundAndPack; 6430 aExpBigger: 6431 if ( aExp == 0x7FFF ) { 6432 if (aSig0 | aSig1) { 6433 return propagateFloat128NaN(a, b, status); 6434 } 6435 return a; 6436 } 6437 if ( bExp == 0 ) { 6438 --expDiff; 6439 } 6440 else { 6441 bSig0 |= LIT64( 0x4000000000000000 ); 6442 } 6443 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6444 aSig0 |= LIT64( 0x4000000000000000 ); 6445 aBigger: 6446 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6447 zExp = aExp; 6448 normalizeRoundAndPack: 6449 --zExp; 6450 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6451 status); 6452 6453 } 6454 6455 /*---------------------------------------------------------------------------- 6456 | Returns the result of adding the quadruple-precision floating-point values 6457 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6458 | for Binary Floating-Point Arithmetic. 
6459 *----------------------------------------------------------------------------*/ 6460 6461 float128 float128_add(float128 a, float128 b, float_status *status) 6462 { 6463 flag aSign, bSign; 6464 6465 aSign = extractFloat128Sign( a ); 6466 bSign = extractFloat128Sign( b ); 6467 if ( aSign == bSign ) { 6468 return addFloat128Sigs(a, b, aSign, status); 6469 } 6470 else { 6471 return subFloat128Sigs(a, b, aSign, status); 6472 } 6473 6474 } 6475 6476 /*---------------------------------------------------------------------------- 6477 | Returns the result of subtracting the quadruple-precision floating-point 6478 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6479 | Standard for Binary Floating-Point Arithmetic. 6480 *----------------------------------------------------------------------------*/ 6481 6482 float128 float128_sub(float128 a, float128 b, float_status *status) 6483 { 6484 flag aSign, bSign; 6485 6486 aSign = extractFloat128Sign( a ); 6487 bSign = extractFloat128Sign( b ); 6488 if ( aSign == bSign ) { 6489 return subFloat128Sigs(a, b, aSign, status); 6490 } 6491 else { 6492 return addFloat128Sigs(a, b, aSign, status); 6493 } 6494 6495 } 6496 6497 /*---------------------------------------------------------------------------- 6498 | Returns the result of multiplying the quadruple-precision floating-point 6499 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6500 | Standard for Binary Floating-Point Arithmetic. 
6501 *----------------------------------------------------------------------------*/ 6502 6503 float128 float128_mul(float128 a, float128 b, float_status *status) 6504 { 6505 flag aSign, bSign, zSign; 6506 int32_t aExp, bExp, zExp; 6507 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6508 6509 aSig1 = extractFloat128Frac1( a ); 6510 aSig0 = extractFloat128Frac0( a ); 6511 aExp = extractFloat128Exp( a ); 6512 aSign = extractFloat128Sign( a ); 6513 bSig1 = extractFloat128Frac1( b ); 6514 bSig0 = extractFloat128Frac0( b ); 6515 bExp = extractFloat128Exp( b ); 6516 bSign = extractFloat128Sign( b ); 6517 zSign = aSign ^ bSign; 6518 if ( aExp == 0x7FFF ) { 6519 if ( ( aSig0 | aSig1 ) 6520 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6521 return propagateFloat128NaN(a, b, status); 6522 } 6523 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6524 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6525 } 6526 if ( bExp == 0x7FFF ) { 6527 if (bSig0 | bSig1) { 6528 return propagateFloat128NaN(a, b, status); 6529 } 6530 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6531 invalid: 6532 float_raise(float_flag_invalid, status); 6533 return float128_default_nan(status); 6534 } 6535 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6536 } 6537 if ( aExp == 0 ) { 6538 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6539 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6540 } 6541 if ( bExp == 0 ) { 6542 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6543 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6544 } 6545 zExp = aExp + bExp - 0x4000; 6546 aSig0 |= LIT64( 0x0001000000000000 ); 6547 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6548 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6549 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6550 zSig2 |= ( zSig3 != 0 ); 6551 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6552 shift128ExtraRightJamming( 6553 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6554 ++zExp; 6555 } 6556 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6557 6558 } 6559 6560 /*---------------------------------------------------------------------------- 6561 | Returns the result of dividing the quadruple-precision floating-point value 6562 | `a' by the corresponding value `b'. The operation is performed according to 6563 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6564 *----------------------------------------------------------------------------*/ 6565 6566 float128 float128_div(float128 a, float128 b, float_status *status) 6567 { 6568 flag aSign, bSign, zSign; 6569 int32_t aExp, bExp, zExp; 6570 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6571 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6572 6573 aSig1 = extractFloat128Frac1( a ); 6574 aSig0 = extractFloat128Frac0( a ); 6575 aExp = extractFloat128Exp( a ); 6576 aSign = extractFloat128Sign( a ); 6577 bSig1 = extractFloat128Frac1( b ); 6578 bSig0 = extractFloat128Frac0( b ); 6579 bExp = extractFloat128Exp( b ); 6580 bSign = extractFloat128Sign( b ); 6581 zSign = aSign ^ bSign; 6582 if ( aExp == 0x7FFF ) { 6583 if (aSig0 | aSig1) { 6584 return propagateFloat128NaN(a, b, status); 6585 } 6586 if ( bExp == 0x7FFF ) { 6587 if (bSig0 | bSig1) { 6588 return propagateFloat128NaN(a, b, status); 6589 } 6590 goto invalid; 6591 } 6592 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6593 } 6594 if ( bExp == 0x7FFF ) { 6595 if (bSig0 | bSig1) { 6596 return propagateFloat128NaN(a, b, status); 6597 } 6598 return packFloat128( zSign, 0, 0, 0 ); 6599 } 6600 if ( bExp == 0 ) { 6601 if ( ( bSig0 | bSig1 ) == 0 ) { 6602 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6603 invalid: 6604 float_raise(float_flag_invalid, status); 6605 return float128_default_nan(status); 6606 } 6607 float_raise(float_flag_divbyzero, status); 6608 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6609 } 6610 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6611 } 6612 if ( aExp == 0 ) { 6613 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6614 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6615 } 6616 zExp = aExp - bExp + 0x3FFD; 6617 shortShift128Left( 6618 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6619 shortShift128Left( 6620 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6621 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6622 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6623 ++zExp; 6624 } 6625 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6626 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6627 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6628 while ( (int64_t) rem0 < 0 ) { 6629 --zSig0; 6630 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6631 } 6632 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6633 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6634 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6635 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6636 while ( (int64_t) rem1 < 0 ) { 6637 --zSig1; 6638 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6639 } 6640 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6641 } 6642 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6643 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6644 6645 } 6646 6647 /*---------------------------------------------------------------------------- 6648 | Returns the remainder of the quadruple-precision floating-point value `a' 6649 | with respect to the corresponding value `b'. The operation is performed 6650 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6651 *----------------------------------------------------------------------------*/ 6652 6653 float128 float128_rem(float128 a, float128 b, float_status *status) 6654 { 6655 flag aSign, zSign; 6656 int32_t aExp, bExp, expDiff; 6657 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6658 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6659 int64_t sigMean0; 6660 6661 aSig1 = extractFloat128Frac1( a ); 6662 aSig0 = extractFloat128Frac0( a ); 6663 aExp = extractFloat128Exp( a ); 6664 aSign = extractFloat128Sign( a ); 6665 bSig1 = extractFloat128Frac1( b ); 6666 bSig0 = extractFloat128Frac0( b ); 6667 bExp = extractFloat128Exp( b ); 6668 if ( aExp == 0x7FFF ) { 6669 if ( ( aSig0 | aSig1 ) 6670 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6671 return propagateFloat128NaN(a, b, status); 6672 } 6673 goto invalid; 6674 } 6675 if ( bExp == 0x7FFF ) { 6676 if (bSig0 | bSig1) { 6677 return propagateFloat128NaN(a, b, status); 6678 } 6679 return a; 6680 } 6681 if ( bExp == 0 ) { 6682 if ( ( bSig0 | bSig1 ) == 0 ) { 6683 invalid: 6684 float_raise(float_flag_invalid, status); 6685 return float128_default_nan(status); 6686 } 6687 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6688 } 6689 if ( aExp == 0 ) { 6690 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6691 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6692 } 6693 expDiff = aExp - bExp; 6694 if ( expDiff < -1 ) return a; 6695 shortShift128Left( 6696 aSig0 | LIT64( 0x0001000000000000 ), 6697 aSig1, 6698 15 - ( expDiff < 0 ), 6699 &aSig0, 6700 &aSig1 6701 ); 6702 shortShift128Left( 6703 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6704 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6705 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6706 expDiff -= 64; 6707 while ( 0 < expDiff ) { 6708 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6709 q = ( 4 < q ) ? 
q - 4 : 0; 6710 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6711 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6712 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6713 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6714 expDiff -= 61; 6715 } 6716 if ( -64 < expDiff ) { 6717 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6718 q = ( 4 < q ) ? q - 4 : 0; 6719 q >>= - expDiff; 6720 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6721 expDiff += 52; 6722 if ( expDiff < 0 ) { 6723 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6724 } 6725 else { 6726 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6727 } 6728 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6729 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6730 } 6731 else { 6732 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6733 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6734 } 6735 do { 6736 alternateASig0 = aSig0; 6737 alternateASig1 = aSig1; 6738 ++q; 6739 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6740 } while ( 0 <= (int64_t) aSig0 ); 6741 add128( 6742 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6743 if ( ( sigMean0 < 0 ) 6744 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6745 aSig0 = alternateASig0; 6746 aSig1 = alternateASig1; 6747 } 6748 zSign = ( (int64_t) aSig0 < 0 ); 6749 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6750 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6751 status); 6752 } 6753 6754 /*---------------------------------------------------------------------------- 6755 | Returns the square root of the quadruple-precision floating-point value `a'. 6756 | The operation is performed according to the IEC/IEEE Standard for Binary 6757 | Floating-Point Arithmetic. 
6758 *----------------------------------------------------------------------------*/ 6759 6760 float128 float128_sqrt(float128 a, float_status *status) 6761 { 6762 flag aSign; 6763 int32_t aExp, zExp; 6764 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6765 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6766 6767 aSig1 = extractFloat128Frac1( a ); 6768 aSig0 = extractFloat128Frac0( a ); 6769 aExp = extractFloat128Exp( a ); 6770 aSign = extractFloat128Sign( a ); 6771 if ( aExp == 0x7FFF ) { 6772 if (aSig0 | aSig1) { 6773 return propagateFloat128NaN(a, a, status); 6774 } 6775 if ( ! aSign ) return a; 6776 goto invalid; 6777 } 6778 if ( aSign ) { 6779 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6780 invalid: 6781 float_raise(float_flag_invalid, status); 6782 return float128_default_nan(status); 6783 } 6784 if ( aExp == 0 ) { 6785 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6786 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6787 } 6788 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6789 aSig0 |= LIT64( 0x0001000000000000 ); 6790 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6791 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6792 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6793 doubleZSig0 = zSig0<<1; 6794 mul64To128( zSig0, zSig0, &term0, &term1 ); 6795 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6796 while ( (int64_t) rem0 < 0 ) { 6797 --zSig0; 6798 doubleZSig0 -= 2; 6799 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6800 } 6801 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6802 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6803 if ( zSig1 == 0 ) zSig1 = 1; 6804 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6805 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6806 mul64To128( zSig1, zSig1, &term2, &term3 ); 6807 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6808 while ( (int64_t) rem1 < 0 ) { 6809 --zSig1; 6810 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6811 term3 |= 1; 6812 term2 |= doubleZSig0; 6813 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6814 } 6815 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6816 } 6817 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6818 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6819 6820 } 6821 6822 /*---------------------------------------------------------------------------- 6823 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6824 | the corresponding value `b', and 0 otherwise. The invalid exception is 6825 | raised if either operand is a NaN. Otherwise, the comparison is performed 6826 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6827 *----------------------------------------------------------------------------*/ 6828 6829 int float128_eq(float128 a, float128 b, float_status *status) 6830 { 6831 6832 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6833 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6834 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6835 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6836 ) { 6837 float_raise(float_flag_invalid, status); 6838 return 0; 6839 } 6840 return 6841 ( a.low == b.low ) 6842 && ( ( a.high == b.high ) 6843 || ( ( a.low == 0 ) 6844 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6845 ); 6846 6847 } 6848 6849 /*---------------------------------------------------------------------------- 6850 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6851 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6852 | exception is raised if either operand is a NaN. The comparison is performed 6853 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6854 *----------------------------------------------------------------------------*/ 6855 6856 int float128_le(float128 a, float128 b, float_status *status) 6857 { 6858 flag aSign, bSign; 6859 6860 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6861 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6862 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6863 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6864 ) { 6865 float_raise(float_flag_invalid, status); 6866 return 0; 6867 } 6868 aSign = extractFloat128Sign( a ); 6869 bSign = extractFloat128Sign( b ); 6870 if ( aSign != bSign ) { 6871 return 6872 aSign 6873 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6874 == 0 ); 6875 } 6876 return 6877 aSign ? le128( b.high, b.low, a.high, a.low ) 6878 : le128( a.high, a.low, b.high, b.low ); 6879 6880 } 6881 6882 /*---------------------------------------------------------------------------- 6883 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6884 | the corresponding value `b', and 0 otherwise. The invalid exception is 6885 | raised if either operand is a NaN. The comparison is performed according 6886 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6887 *----------------------------------------------------------------------------*/ 6888 6889 int float128_lt(float128 a, float128 b, float_status *status) 6890 { 6891 flag aSign, bSign; 6892 6893 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6894 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6895 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6896 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6897 ) { 6898 float_raise(float_flag_invalid, status); 6899 return 0; 6900 } 6901 aSign = extractFloat128Sign( a ); 6902 bSign = extractFloat128Sign( b ); 6903 if ( aSign != bSign ) { 6904 return 6905 aSign 6906 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6907 != 0 ); 6908 } 6909 return 6910 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6911 : lt128( a.high, a.low, b.high, b.low ); 6912 6913 } 6914 6915 /*---------------------------------------------------------------------------- 6916 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6917 | be compared, and 0 otherwise. The invalid exception is raised if either 6918 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6919 | Standard for Binary Floating-Point Arithmetic. 6920 *----------------------------------------------------------------------------*/ 6921 6922 int float128_unordered(float128 a, float128 b, float_status *status) 6923 { 6924 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6925 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6926 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6927 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6928 ) { 6929 float_raise(float_flag_invalid, status); 6930 return 1; 6931 } 6932 return 0; 6933 } 6934 6935 /*---------------------------------------------------------------------------- 6936 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6937 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6938 | exception. The comparison is performed according to the IEC/IEEE Standard 6939 | for Binary Floating-Point Arithmetic. 
6940 *----------------------------------------------------------------------------*/ 6941 6942 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6943 { 6944 6945 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6946 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6947 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6948 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6949 ) { 6950 if (float128_is_signaling_nan(a, status) 6951 || float128_is_signaling_nan(b, status)) { 6952 float_raise(float_flag_invalid, status); 6953 } 6954 return 0; 6955 } 6956 return 6957 ( a.low == b.low ) 6958 && ( ( a.high == b.high ) 6959 || ( ( a.low == 0 ) 6960 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6961 ); 6962 6963 } 6964 6965 /*---------------------------------------------------------------------------- 6966 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6967 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6968 | cause an exception. Otherwise, the comparison is performed according to the 6969 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6970 *----------------------------------------------------------------------------*/ 6971 6972 int float128_le_quiet(float128 a, float128 b, float_status *status) 6973 { 6974 flag aSign, bSign; 6975 6976 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6977 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6978 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6979 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6980 ) { 6981 if (float128_is_signaling_nan(a, status) 6982 || float128_is_signaling_nan(b, status)) { 6983 float_raise(float_flag_invalid, status); 6984 } 6985 return 0; 6986 } 6987 aSign = extractFloat128Sign( a ); 6988 bSign = extractFloat128Sign( b ); 6989 if ( aSign != bSign ) { 6990 return 6991 aSign 6992 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6993 == 0 ); 6994 } 6995 return 6996 aSign ? le128( b.high, b.low, a.high, a.low ) 6997 : le128( a.high, a.low, b.high, b.low ); 6998 6999 } 7000 7001 /*---------------------------------------------------------------------------- 7002 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7003 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7004 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7005 | Standard for Binary Floating-Point Arithmetic. 
7006 *----------------------------------------------------------------------------*/ 7007 7008 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7009 { 7010 flag aSign, bSign; 7011 7012 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7013 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7014 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7015 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7016 ) { 7017 if (float128_is_signaling_nan(a, status) 7018 || float128_is_signaling_nan(b, status)) { 7019 float_raise(float_flag_invalid, status); 7020 } 7021 return 0; 7022 } 7023 aSign = extractFloat128Sign( a ); 7024 bSign = extractFloat128Sign( b ); 7025 if ( aSign != bSign ) { 7026 return 7027 aSign 7028 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7029 != 0 ); 7030 } 7031 return 7032 aSign ? lt128( b.high, b.low, a.high, a.low ) 7033 : lt128( a.high, a.low, b.high, b.low ); 7034 7035 } 7036 7037 /*---------------------------------------------------------------------------- 7038 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7039 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7040 | comparison is performed according to the IEC/IEEE Standard for Binary 7041 | Floating-Point Arithmetic. 
7042 *----------------------------------------------------------------------------*/ 7043 7044 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7045 { 7046 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7047 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7048 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7049 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7050 ) { 7051 if (float128_is_signaling_nan(a, status) 7052 || float128_is_signaling_nan(b, status)) { 7053 float_raise(float_flag_invalid, status); 7054 } 7055 return 1; 7056 } 7057 return 0; 7058 } 7059 7060 /* misc functions */ 7061 float32 uint32_to_float32(uint32_t a, float_status *status) 7062 { 7063 return int64_to_float32(a, status); 7064 } 7065 7066 float64 uint32_to_float64(uint32_t a, float_status *status) 7067 { 7068 return int64_to_float64(a, status); 7069 } 7070 7071 uint32_t float32_to_uint32(float32 a, float_status *status) 7072 { 7073 int64_t v; 7074 uint32_t res; 7075 int old_exc_flags = get_float_exception_flags(status); 7076 7077 v = float32_to_int64(a, status); 7078 if (v < 0) { 7079 res = 0; 7080 } else if (v > 0xffffffff) { 7081 res = 0xffffffff; 7082 } else { 7083 return v; 7084 } 7085 set_float_exception_flags(old_exc_flags, status); 7086 float_raise(float_flag_invalid, status); 7087 return res; 7088 } 7089 7090 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7091 { 7092 int64_t v; 7093 uint32_t res; 7094 int old_exc_flags = get_float_exception_flags(status); 7095 7096 v = float32_to_int64_round_to_zero(a, status); 7097 if (v < 0) { 7098 res = 0; 7099 } else if (v > 0xffffffff) { 7100 res = 0xffffffff; 7101 } else { 7102 return v; 7103 } 7104 set_float_exception_flags(old_exc_flags, status); 7105 float_raise(float_flag_invalid, status); 7106 return res; 7107 } 7108 7109 int16_t float32_to_int16(float32 a, float_status *status) 7110 { 7111 int32_t v; 7112 int16_t res; 7113 int old_exc_flags = 
get_float_exception_flags(status); 7114 7115 v = float32_to_int32(a, status); 7116 if (v < -0x8000) { 7117 res = -0x8000; 7118 } else if (v > 0x7fff) { 7119 res = 0x7fff; 7120 } else { 7121 return v; 7122 } 7123 7124 set_float_exception_flags(old_exc_flags, status); 7125 float_raise(float_flag_invalid, status); 7126 return res; 7127 } 7128 7129 uint16_t float32_to_uint16(float32 a, float_status *status) 7130 { 7131 int32_t v; 7132 uint16_t res; 7133 int old_exc_flags = get_float_exception_flags(status); 7134 7135 v = float32_to_int32(a, status); 7136 if (v < 0) { 7137 res = 0; 7138 } else if (v > 0xffff) { 7139 res = 0xffff; 7140 } else { 7141 return v; 7142 } 7143 7144 set_float_exception_flags(old_exc_flags, status); 7145 float_raise(float_flag_invalid, status); 7146 return res; 7147 } 7148 7149 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7150 { 7151 int64_t v; 7152 uint16_t res; 7153 int old_exc_flags = get_float_exception_flags(status); 7154 7155 v = float32_to_int64_round_to_zero(a, status); 7156 if (v < 0) { 7157 res = 0; 7158 } else if (v > 0xffff) { 7159 res = 0xffff; 7160 } else { 7161 return v; 7162 } 7163 set_float_exception_flags(old_exc_flags, status); 7164 float_raise(float_flag_invalid, status); 7165 return res; 7166 } 7167 7168 uint32_t float64_to_uint32(float64 a, float_status *status) 7169 { 7170 uint64_t v; 7171 uint32_t res; 7172 int old_exc_flags = get_float_exception_flags(status); 7173 7174 v = float64_to_uint64(a, status); 7175 if (v > 0xffffffff) { 7176 res = 0xffffffff; 7177 } else { 7178 return v; 7179 } 7180 set_float_exception_flags(old_exc_flags, status); 7181 float_raise(float_flag_invalid, status); 7182 return res; 7183 } 7184 7185 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7186 { 7187 uint64_t v; 7188 uint32_t res; 7189 int old_exc_flags = get_float_exception_flags(status); 7190 7191 v = float64_to_uint64_round_to_zero(a, status); 7192 if (v > 0xffffffff) { 7193 res = 
0xffffffff; 7194 } else { 7195 return v; 7196 } 7197 set_float_exception_flags(old_exc_flags, status); 7198 float_raise(float_flag_invalid, status); 7199 return res; 7200 } 7201 7202 int16_t float64_to_int16(float64 a, float_status *status) 7203 { 7204 int64_t v; 7205 int16_t res; 7206 int old_exc_flags = get_float_exception_flags(status); 7207 7208 v = float64_to_int32(a, status); 7209 if (v < -0x8000) { 7210 res = -0x8000; 7211 } else if (v > 0x7fff) { 7212 res = 0x7fff; 7213 } else { 7214 return v; 7215 } 7216 7217 set_float_exception_flags(old_exc_flags, status); 7218 float_raise(float_flag_invalid, status); 7219 return res; 7220 } 7221 7222 uint16_t float64_to_uint16(float64 a, float_status *status) 7223 { 7224 int64_t v; 7225 uint16_t res; 7226 int old_exc_flags = get_float_exception_flags(status); 7227 7228 v = float64_to_int32(a, status); 7229 if (v < 0) { 7230 res = 0; 7231 } else if (v > 0xffff) { 7232 res = 0xffff; 7233 } else { 7234 return v; 7235 } 7236 7237 set_float_exception_flags(old_exc_flags, status); 7238 float_raise(float_flag_invalid, status); 7239 return res; 7240 } 7241 7242 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7243 { 7244 int64_t v; 7245 uint16_t res; 7246 int old_exc_flags = get_float_exception_flags(status); 7247 7248 v = float64_to_int64_round_to_zero(a, status); 7249 if (v < 0) { 7250 res = 0; 7251 } else if (v > 0xffff) { 7252 res = 0xffff; 7253 } else { 7254 return v; 7255 } 7256 set_float_exception_flags(old_exc_flags, status); 7257 float_raise(float_flag_invalid, status); 7258 return res; 7259 } 7260 7261 /*---------------------------------------------------------------------------- 7262 | Returns the result of converting the double-precision floating-point value 7263 | `a' to the 64-bit unsigned integer format. 
The conversion is 7264 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7265 | Arithmetic---which means in particular that the conversion is rounded 7266 | according to the current rounding mode. If `a' is a NaN, the largest 7267 | positive integer is returned. If the conversion overflows, the 7268 | largest unsigned integer is returned. If 'a' is negative, the value is 7269 | rounded and zero is returned; negative values that do not round to zero 7270 | will raise the inexact exception. 7271 *----------------------------------------------------------------------------*/ 7272 7273 uint64_t float64_to_uint64(float64 a, float_status *status) 7274 { 7275 flag aSign; 7276 int aExp; 7277 int shiftCount; 7278 uint64_t aSig, aSigExtra; 7279 a = float64_squash_input_denormal(a, status); 7280 7281 aSig = extractFloat64Frac(a); 7282 aExp = extractFloat64Exp(a); 7283 aSign = extractFloat64Sign(a); 7284 if (aSign && (aExp > 1022)) { 7285 float_raise(float_flag_invalid, status); 7286 if (float64_is_any_nan(a)) { 7287 return LIT64(0xFFFFFFFFFFFFFFFF); 7288 } else { 7289 return 0; 7290 } 7291 } 7292 if (aExp) { 7293 aSig |= LIT64(0x0010000000000000); 7294 } 7295 shiftCount = 0x433 - aExp; 7296 if (shiftCount <= 0) { 7297 if (0x43E < aExp) { 7298 float_raise(float_flag_invalid, status); 7299 return LIT64(0xFFFFFFFFFFFFFFFF); 7300 } 7301 aSigExtra = 0; 7302 aSig <<= -shiftCount; 7303 } else { 7304 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7305 } 7306 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7307 } 7308 7309 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7310 { 7311 signed char current_rounding_mode = status->float_rounding_mode; 7312 set_float_rounding_mode(float_round_to_zero, status); 7313 int64_t v = float64_to_uint64(a, status); 7314 set_float_rounding_mode(current_rounding_mode, status); 7315 return v; 7316 } 7317 7318 #define COMPARE(s, nan_exp) \ 7319 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7320 int is_quiet, float_status *status) \ 7321 { \ 7322 flag aSign, bSign; \ 7323 uint ## s ## _t av, bv; \ 7324 a = float ## s ## _squash_input_denormal(a, status); \ 7325 b = float ## s ## _squash_input_denormal(b, status); \ 7326 \ 7327 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7328 extractFloat ## s ## Frac( a ) ) || \ 7329 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7330 extractFloat ## s ## Frac( b ) )) { \ 7331 if (!is_quiet || \ 7332 float ## s ## _is_signaling_nan(a, status) || \ 7333 float ## s ## _is_signaling_nan(b, status)) { \ 7334 float_raise(float_flag_invalid, status); \ 7335 } \ 7336 return float_relation_unordered; \ 7337 } \ 7338 aSign = extractFloat ## s ## Sign( a ); \ 7339 bSign = extractFloat ## s ## Sign( b ); \ 7340 av = float ## s ## _val(a); \ 7341 bv = float ## s ## _val(b); \ 7342 if ( aSign != bSign ) { \ 7343 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7344 /* zero case */ \ 7345 return float_relation_equal; \ 7346 } else { \ 7347 return 1 - (2 * aSign); \ 7348 } \ 7349 } else { \ 7350 if (av == bv) { \ 7351 return float_relation_equal; \ 7352 } else { \ 7353 return 1 - 2 * (aSign ^ ( av < bv )); \ 7354 } \ 7355 } \ 7356 } \ 7357 \ 7358 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7359 { \ 7360 return float ## s ## _compare_internal(a, b, 0, status); \ 7361 } \ 7362 \ 7363 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7364 float_status *status) \ 7365 { \ 7366 return float ## s ## _compare_internal(a, b, 1, status); \ 7367 } 7368 7369 COMPARE(32, 0xff) 7370 COMPARE(64, 0x7ff) 7371 7372 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7373 int is_quiet, float_status *status) 7374 { 7375 flag aSign, bSign; 7376 7377 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7378 ( extractFloatx80Frac( a )<<1 ) ) || 7379 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7380 ( extractFloatx80Frac( b )<<1 ) )) 
{ 7381 if (!is_quiet || 7382 floatx80_is_signaling_nan(a, status) || 7383 floatx80_is_signaling_nan(b, status)) { 7384 float_raise(float_flag_invalid, status); 7385 } 7386 return float_relation_unordered; 7387 } 7388 aSign = extractFloatx80Sign( a ); 7389 bSign = extractFloatx80Sign( b ); 7390 if ( aSign != bSign ) { 7391 7392 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7393 ( ( a.low | b.low ) == 0 ) ) { 7394 /* zero case */ 7395 return float_relation_equal; 7396 } else { 7397 return 1 - (2 * aSign); 7398 } 7399 } else { 7400 if (a.low == b.low && a.high == b.high) { 7401 return float_relation_equal; 7402 } else { 7403 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7404 } 7405 } 7406 } 7407 7408 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7409 { 7410 return floatx80_compare_internal(a, b, 0, status); 7411 } 7412 7413 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7414 { 7415 return floatx80_compare_internal(a, b, 1, status); 7416 } 7417 7418 static inline int float128_compare_internal(float128 a, float128 b, 7419 int is_quiet, float_status *status) 7420 { 7421 flag aSign, bSign; 7422 7423 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7424 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7425 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7426 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7427 if (!is_quiet || 7428 float128_is_signaling_nan(a, status) || 7429 float128_is_signaling_nan(b, status)) { 7430 float_raise(float_flag_invalid, status); 7431 } 7432 return float_relation_unordered; 7433 } 7434 aSign = extractFloat128Sign( a ); 7435 bSign = extractFloat128Sign( b ); 7436 if ( aSign != bSign ) { 7437 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7438 /* zero case */ 7439 return float_relation_equal; 7440 } else { 7441 return 1 - (2 * aSign); 7442 } 7443 } else { 7444 if (a.low == b.low && a.high == b.high) { 7445 return 
float_relation_equal; 7446 } else { 7447 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7448 } 7449 } 7450 } 7451 7452 int float128_compare(float128 a, float128 b, float_status *status) 7453 { 7454 return float128_compare_internal(a, b, 0, status); 7455 } 7456 7457 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7458 { 7459 return float128_compare_internal(a, b, 1, status); 7460 } 7461 7462 /* min() and max() functions. These can't be implemented as 7463 * 'compare and pick one input' because that would mishandle 7464 * NaNs and +0 vs -0. 7465 * 7466 * minnum() and maxnum() functions. These are similar to the min() 7467 * and max() functions but if one of the arguments is a QNaN and 7468 * the other is numerical then the numerical argument is returned. 7469 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7470 * and maxNum() operations. min() and max() are the typical min/max 7471 * semantics provided by many CPUs which predate that specification. 7472 * 7473 * minnummag() and maxnummag() functions correspond to minNumMag() 7474 * and minNumMag() from the IEEE-754 2008. 
7475 */ 7476 #define MINMAX(s) \ 7477 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7478 int ismin, int isieee, \ 7479 int ismag, \ 7480 float_status *status) \ 7481 { \ 7482 flag aSign, bSign; \ 7483 uint ## s ## _t av, bv, aav, abv; \ 7484 a = float ## s ## _squash_input_denormal(a, status); \ 7485 b = float ## s ## _squash_input_denormal(b, status); \ 7486 if (float ## s ## _is_any_nan(a) || \ 7487 float ## s ## _is_any_nan(b)) { \ 7488 if (isieee) { \ 7489 if (float ## s ## _is_quiet_nan(a, status) && \ 7490 !float ## s ##_is_any_nan(b)) { \ 7491 return b; \ 7492 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7493 !float ## s ## _is_any_nan(a)) { \ 7494 return a; \ 7495 } \ 7496 } \ 7497 return propagateFloat ## s ## NaN(a, b, status); \ 7498 } \ 7499 aSign = extractFloat ## s ## Sign(a); \ 7500 bSign = extractFloat ## s ## Sign(b); \ 7501 av = float ## s ## _val(a); \ 7502 bv = float ## s ## _val(b); \ 7503 if (ismag) { \ 7504 aav = float ## s ## _abs(av); \ 7505 abv = float ## s ## _abs(bv); \ 7506 if (aav != abv) { \ 7507 if (ismin) { \ 7508 return (aav < abv) ? a : b; \ 7509 } else { \ 7510 return (aav < abv) ? b : a; \ 7511 } \ 7512 } \ 7513 } \ 7514 if (aSign != bSign) { \ 7515 if (ismin) { \ 7516 return aSign ? a : b; \ 7517 } else { \ 7518 return aSign ? b : a; \ 7519 } \ 7520 } else { \ 7521 if (ismin) { \ 7522 return (aSign ^ (av < bv)) ? a : b; \ 7523 } else { \ 7524 return (aSign ^ (av < bv)) ? 
b : a; \ 7525 } \ 7526 } \ 7527 } \ 7528 \ 7529 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7530 float_status *status) \ 7531 { \ 7532 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7533 } \ 7534 \ 7535 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7536 float_status *status) \ 7537 { \ 7538 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7539 } \ 7540 \ 7541 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7542 float_status *status) \ 7543 { \ 7544 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7545 } \ 7546 \ 7547 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7548 float_status *status) \ 7549 { \ 7550 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7551 } \ 7552 \ 7553 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7554 float_status *status) \ 7555 { \ 7556 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7557 } \ 7558 \ 7559 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7560 float_status *status) \ 7561 { \ 7562 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7563 } 7564 7565 MINMAX(32) 7566 MINMAX(64) 7567 7568 7569 /* Multiply A by 2 raised to the power N. 
*/ 7570 float32 float32_scalbn(float32 a, int n, float_status *status) 7571 { 7572 flag aSign; 7573 int16_t aExp; 7574 uint32_t aSig; 7575 7576 a = float32_squash_input_denormal(a, status); 7577 aSig = extractFloat32Frac( a ); 7578 aExp = extractFloat32Exp( a ); 7579 aSign = extractFloat32Sign( a ); 7580 7581 if ( aExp == 0xFF ) { 7582 if ( aSig ) { 7583 return propagateFloat32NaN(a, a, status); 7584 } 7585 return a; 7586 } 7587 if (aExp != 0) { 7588 aSig |= 0x00800000; 7589 } else if (aSig == 0) { 7590 return a; 7591 } else { 7592 aExp++; 7593 } 7594 7595 if (n > 0x200) { 7596 n = 0x200; 7597 } else if (n < -0x200) { 7598 n = -0x200; 7599 } 7600 7601 aExp += n - 1; 7602 aSig <<= 7; 7603 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7604 } 7605 7606 float64 float64_scalbn(float64 a, int n, float_status *status) 7607 { 7608 flag aSign; 7609 int16_t aExp; 7610 uint64_t aSig; 7611 7612 a = float64_squash_input_denormal(a, status); 7613 aSig = extractFloat64Frac( a ); 7614 aExp = extractFloat64Exp( a ); 7615 aSign = extractFloat64Sign( a ); 7616 7617 if ( aExp == 0x7FF ) { 7618 if ( aSig ) { 7619 return propagateFloat64NaN(a, a, status); 7620 } 7621 return a; 7622 } 7623 if (aExp != 0) { 7624 aSig |= LIT64( 0x0010000000000000 ); 7625 } else if (aSig == 0) { 7626 return a; 7627 } else { 7628 aExp++; 7629 } 7630 7631 if (n > 0x1000) { 7632 n = 0x1000; 7633 } else if (n < -0x1000) { 7634 n = -0x1000; 7635 } 7636 7637 aExp += n - 1; 7638 aSig <<= 10; 7639 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7640 } 7641 7642 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7643 { 7644 flag aSign; 7645 int32_t aExp; 7646 uint64_t aSig; 7647 7648 aSig = extractFloatx80Frac( a ); 7649 aExp = extractFloatx80Exp( a ); 7650 aSign = extractFloatx80Sign( a ); 7651 7652 if ( aExp == 0x7FFF ) { 7653 if ( aSig<<1 ) { 7654 return propagateFloatx80NaN(a, a, status); 7655 } 7656 return a; 7657 } 7658 7659 if (aExp == 0) { 7660 if (aSig == 0) 
{ 7661 return a; 7662 } 7663 aExp++; 7664 } 7665 7666 if (n > 0x10000) { 7667 n = 0x10000; 7668 } else if (n < -0x10000) { 7669 n = -0x10000; 7670 } 7671 7672 aExp += n; 7673 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7674 aSign, aExp, aSig, 0, status); 7675 } 7676 7677 float128 float128_scalbn(float128 a, int n, float_status *status) 7678 { 7679 flag aSign; 7680 int32_t aExp; 7681 uint64_t aSig0, aSig1; 7682 7683 aSig1 = extractFloat128Frac1( a ); 7684 aSig0 = extractFloat128Frac0( a ); 7685 aExp = extractFloat128Exp( a ); 7686 aSign = extractFloat128Sign( a ); 7687 if ( aExp == 0x7FFF ) { 7688 if ( aSig0 | aSig1 ) { 7689 return propagateFloat128NaN(a, a, status); 7690 } 7691 return a; 7692 } 7693 if (aExp != 0) { 7694 aSig0 |= LIT64( 0x0001000000000000 ); 7695 } else if (aSig0 == 0 && aSig1 == 0) { 7696 return a; 7697 } else { 7698 aExp++; 7699 } 7700 7701 if (n > 0x10000) { 7702 n = 0x10000; 7703 } else if (n < -0x10000) { 7704 n = -0x10000; 7705 } 7706 7707 aExp += n - 1; 7708 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7709 , status); 7710 7711 } 7712