1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 137 | and 7, and returns the properly rounded 32-bit integer corresponding to the 138 | input. If `zSign' is 1, the input is negated before being converted to an 139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 140 | is simply rounded to an integer, with the inexact exception raised if the 141 | input cannot be represented exactly as an integer. However, if the fixed- 142 | point input is too large, the invalid exception is raised and the largest 143 | positive or negative integer is returned. 
144 *----------------------------------------------------------------------------*/ 145 146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 147 { 148 int8_t roundingMode; 149 flag roundNearestEven; 150 int8_t roundIncrement, roundBits; 151 int32_t z; 152 153 roundingMode = status->float_rounding_mode; 154 roundNearestEven = ( roundingMode == float_round_nearest_even ); 155 switch (roundingMode) { 156 case float_round_nearest_even: 157 case float_round_ties_away: 158 roundIncrement = 0x40; 159 break; 160 case float_round_to_zero: 161 roundIncrement = 0; 162 break; 163 case float_round_up: 164 roundIncrement = zSign ? 0 : 0x7f; 165 break; 166 case float_round_down: 167 roundIncrement = zSign ? 0x7f : 0; 168 break; 169 default: 170 abort(); 171 } 172 roundBits = absZ & 0x7F; 173 absZ = ( absZ + roundIncrement )>>7; 174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 175 z = absZ; 176 if ( zSign ) z = - z; 177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 178 float_raise(float_flag_invalid, status); 179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 180 } 181 if (roundBits) { 182 status->float_exception_flags |= float_flag_inexact; 183 } 184 return z; 185 186 } 187 188 /*---------------------------------------------------------------------------- 189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 190 | `absZ1', with binary point between bits 63 and 64 (between the input words), 191 | and returns the properly rounded 64-bit integer corresponding to the input. 192 | If `zSign' is 1, the input is negated before being converted to an integer. 193 | Ordinarily, the fixed-point input is simply rounded to an integer, with 194 | the inexact exception raised if the input cannot be represented exactly as 195 | an integer. However, if the fixed-point input is too large, the invalid 196 | exception is raised and the largest positive or negative integer is 197 | returned. 
198 *----------------------------------------------------------------------------*/ 199 200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 201 float_status *status) 202 { 203 int8_t roundingMode; 204 flag roundNearestEven, increment; 205 int64_t z; 206 207 roundingMode = status->float_rounding_mode; 208 roundNearestEven = ( roundingMode == float_round_nearest_even ); 209 switch (roundingMode) { 210 case float_round_nearest_even: 211 case float_round_ties_away: 212 increment = ((int64_t) absZ1 < 0); 213 break; 214 case float_round_to_zero: 215 increment = 0; 216 break; 217 case float_round_up: 218 increment = !zSign && absZ1; 219 break; 220 case float_round_down: 221 increment = zSign && absZ1; 222 break; 223 default: 224 abort(); 225 } 226 if ( increment ) { 227 ++absZ0; 228 if ( absZ0 == 0 ) goto overflow; 229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 230 } 231 z = absZ0; 232 if ( zSign ) z = - z; 233 if ( z && ( ( z < 0 ) ^ zSign ) ) { 234 overflow: 235 float_raise(float_flag_invalid, status); 236 return 237 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 238 : LIT64( 0x7FFFFFFFFFFFFFFF ); 239 } 240 if (absZ1) { 241 status->float_exception_flags |= float_flag_inexact; 242 } 243 return z; 244 245 } 246 247 /*---------------------------------------------------------------------------- 248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 249 | `absZ1', with binary point between bits 63 and 64 (between the input words), 250 | and returns the properly rounded 64-bit unsigned integer corresponding to the 251 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 252 | with the inexact exception raised if the input cannot be represented exactly 253 | as an integer. However, if the fixed-point input is too large, the invalid 254 | exception is raised and the largest unsigned integer is returned. 
255 *----------------------------------------------------------------------------*/ 256 257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 258 uint64_t absZ1, float_status *status) 259 { 260 int8_t roundingMode; 261 flag roundNearestEven, increment; 262 263 roundingMode = status->float_rounding_mode; 264 roundNearestEven = (roundingMode == float_round_nearest_even); 265 switch (roundingMode) { 266 case float_round_nearest_even: 267 case float_round_ties_away: 268 increment = ((int64_t)absZ1 < 0); 269 break; 270 case float_round_to_zero: 271 increment = 0; 272 break; 273 case float_round_up: 274 increment = !zSign && absZ1; 275 break; 276 case float_round_down: 277 increment = zSign && absZ1; 278 break; 279 default: 280 abort(); 281 } 282 if (increment) { 283 ++absZ0; 284 if (absZ0 == 0) { 285 float_raise(float_flag_invalid, status); 286 return LIT64(0xFFFFFFFFFFFFFFFF); 287 } 288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 289 } 290 291 if (zSign && absZ0) { 292 float_raise(float_flag_invalid, status); 293 return 0; 294 } 295 296 if (absZ1) { 297 status->float_exception_flags |= float_flag_inexact; 298 } 299 return absZ0; 300 } 301 302 /*---------------------------------------------------------------------------- 303 | Returns the fraction bits of the single-precision floating-point value `a'. 304 *----------------------------------------------------------------------------*/ 305 306 static inline uint32_t extractFloat32Frac( float32 a ) 307 { 308 309 return float32_val(a) & 0x007FFFFF; 310 311 } 312 313 /*---------------------------------------------------------------------------- 314 | Returns the exponent bits of the single-precision floating-point value `a'. 
315 *----------------------------------------------------------------------------*/ 316 317 static inline int extractFloat32Exp(float32 a) 318 { 319 320 return ( float32_val(a)>>23 ) & 0xFF; 321 322 } 323 324 /*---------------------------------------------------------------------------- 325 | Returns the sign bit of the single-precision floating-point value `a'. 326 *----------------------------------------------------------------------------*/ 327 328 static inline flag extractFloat32Sign( float32 a ) 329 { 330 331 return float32_val(a)>>31; 332 333 } 334 335 /*---------------------------------------------------------------------------- 336 | If `a' is denormal and we are in flush-to-zero mode then set the 337 | input-denormal exception and return zero. Otherwise just return the value. 338 *----------------------------------------------------------------------------*/ 339 float32 float32_squash_input_denormal(float32 a, float_status *status) 340 { 341 if (status->flush_inputs_to_zero) { 342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 343 float_raise(float_flag_input_denormal, status); 344 return make_float32(float32_val(a) & 0x80000000); 345 } 346 } 347 return a; 348 } 349 350 /*---------------------------------------------------------------------------- 351 | Normalizes the subnormal single-precision floating-point value represented 352 | by the denormalized significand `aSig'. The normalized exponent and 353 | significand are stored at the locations pointed to by `zExpPtr' and 354 | `zSigPtr', respectively. 
355 *----------------------------------------------------------------------------*/ 356 357 static void 358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 359 { 360 int8_t shiftCount; 361 362 shiftCount = countLeadingZeros32( aSig ) - 8; 363 *zSigPtr = aSig<<shiftCount; 364 *zExpPtr = 1 - shiftCount; 365 366 } 367 368 /*---------------------------------------------------------------------------- 369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 370 | single-precision floating-point value, returning the result. After being 371 | shifted into the proper positions, the three fields are simply added 372 | together to form the result. This means that any integer portion of `zSig' 373 | will be added into the exponent. Since a properly normalized significand 374 | will have an integer portion equal to 1, the `zExp' input should be 1 less 375 | than the desired result exponent whenever `zSig' is a complete, normalized 376 | significand. 377 *----------------------------------------------------------------------------*/ 378 379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 380 { 381 382 return make_float32( 383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 384 385 } 386 387 /*---------------------------------------------------------------------------- 388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 389 | and significand `zSig', and returns the proper single-precision floating- 390 | point value corresponding to the abstract input. Ordinarily, the abstract 391 | value is simply rounded and packed into the single-precision format, with 392 | the inexact exception raised if the abstract input cannot be represented 393 | exactly. However, if the abstract value is too large, the overflow and 394 | inexact exceptions are raised and an infinity or maximal finite value is 395 | returned. 
| If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|     The rounding mode, the flush-to-zero setting and the tininess-detection
| setting are read from `status'; exception flags are reported through
| `status' as well.  Rounding modes other than nearest-even, ties-away,
| to-zero, up and down reach abort().
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Amount added to the 7 extra low-order bits before they are dropped. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /*
     * The unsigned 16-bit cast makes a negative zExp compare as a large
     * value, so this one test selects both the near-overflow exponents and
     * the negative (underflow) exponents.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * The significand argument is 0 when rounding moves away from
             * zero (plain exponent 0xFF, zero fraction) and all-ones when
             * roundIncrement is 0; packFloat32 adds its fields, so all-ones
             * yields the encoding just below exponent 0xFF, zero fraction.
             */
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or the exponent
             * is below -1, or adding the increment does not carry into
             * bit 31.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the low bits, so recompute them. */
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie in nearest-even mode: clear bit 0 to pick the even value. */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
481 *----------------------------------------------------------------------------*/ 482 483 static float32 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 485 float_status *status) 486 { 487 int8_t shiftCount; 488 489 shiftCount = countLeadingZeros32( zSig ) - 1; 490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 491 status); 492 493 } 494 495 /*---------------------------------------------------------------------------- 496 | Returns the fraction bits of the double-precision floating-point value `a'. 497 *----------------------------------------------------------------------------*/ 498 499 static inline uint64_t extractFloat64Frac( float64 a ) 500 { 501 502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 503 504 } 505 506 /*---------------------------------------------------------------------------- 507 | Returns the exponent bits of the double-precision floating-point value `a'. 508 *----------------------------------------------------------------------------*/ 509 510 static inline int extractFloat64Exp(float64 a) 511 { 512 513 return ( float64_val(a)>>52 ) & 0x7FF; 514 515 } 516 517 /*---------------------------------------------------------------------------- 518 | Returns the sign bit of the double-precision floating-point value `a'. 519 *----------------------------------------------------------------------------*/ 520 521 static inline flag extractFloat64Sign( float64 a ) 522 { 523 524 return float64_val(a)>>63; 525 526 } 527 528 /*---------------------------------------------------------------------------- 529 | If `a' is denormal and we are in flush-to-zero mode then set the 530 | input-denormal exception and return zero. Otherwise just return the value. 
531 *----------------------------------------------------------------------------*/ 532 float64 float64_squash_input_denormal(float64 a, float_status *status) 533 { 534 if (status->flush_inputs_to_zero) { 535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 536 float_raise(float_flag_input_denormal, status); 537 return make_float64(float64_val(a) & (1ULL << 63)); 538 } 539 } 540 return a; 541 } 542 543 /*---------------------------------------------------------------------------- 544 | Normalizes the subnormal double-precision floating-point value represented 545 | by the denormalized significand `aSig'. The normalized exponent and 546 | significand are stored at the locations pointed to by `zExpPtr' and 547 | `zSigPtr', respectively. 548 *----------------------------------------------------------------------------*/ 549 550 static void 551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 552 { 553 int8_t shiftCount; 554 555 shiftCount = countLeadingZeros64( aSig ) - 11; 556 *zSigPtr = aSig<<shiftCount; 557 *zExpPtr = 1 - shiftCount; 558 559 } 560 561 /*---------------------------------------------------------------------------- 562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 563 | double-precision floating-point value, returning the result. After being 564 | shifted into the proper positions, the three fields are simply added 565 | together to form the result. This means that any integer portion of `zSig' 566 | will be added into the exponent. Since a properly normalized significand 567 | will have an integer portion equal to 1, the `zExp' input should be 1 less 568 | than the desired result exponent whenever `zSig' is a complete, normalized 569 | significand. 
570 *----------------------------------------------------------------------------*/ 571 572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 573 { 574 575 return make_float64( 576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 577 578 } 579 580 /*---------------------------------------------------------------------------- 581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 582 | and significand `zSig', and returns the proper double-precision floating- 583 | point value corresponding to the abstract input. Ordinarily, the abstract 584 | value is simply rounded and packed into the double-precision format, with 585 | the inexact exception raised if the abstract input cannot be represented 586 | exactly. However, if the abstract value is too large, the overflow and 587 | inexact exceptions are raised and an infinity or maximal finite value is 588 | returned. If the abstract value is too small, the input value is rounded to 589 | a subnormal number, and the underflow and inexact exceptions are raised if 590 | the abstract input cannot be represented exactly as a subnormal double- 591 | precision floating-point number. 592 | The input significand `zSig' has its binary point between bits 62 593 | and 61, which is 10 bits to the left of the usual location. This shifted 594 | significand must be normalized or smaller. If `zSig' is not normalized, 595 | `zExp' must be 0; in that case, the result returned is a subnormal number, 596 | and it must not require rounding. In the usual case that `zSig' is 597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 598 | The handling of underflow and overflow follows the IEC/IEEE Standard for 599 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * The rounding mode, the flush-to-zero setting and the tininess-detection
 * setting are read from `status'; exception flags are reported through
 * `status' as well.  Besides nearest-even, ties-away, to-zero, up and down,
 * this routine also handles float_round_to_odd; any other mode reaches
 * abort().
 */
static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Amount added to the 10 extra low-order bits before they are dropped. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Bit 10 becomes bit 0 of the result: round up only if it is clear. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * The unsigned 16-bit cast makes a negative zExp compare as a large
     * value, so this one test selects both the near-overflow exponents and
     * the negative (underflow) exponents.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /*
             * Round-to-odd, and any mode whose increment is 0 here, returns
             * the encoding just below exponent 0x7FF with zero fraction:
             * packFloat64 is handed an all-ones significand and adds its
             * fields.  Otherwise the significand argument is 0.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, or the exponent
             * is below -1, or adding the increment does not carry into
             * bit 63.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the low bits, so recompute them. */
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie in nearest-even mode: clear bit 0 to pick the even value. */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Left shift that brings the leading one of zSig up to bit 62. */
    shiftCount = countLeadingZeros64( zSig ) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the extended double-precision floating-point
| value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloatx80Frac( floatx80 a )
{

    /* The whole `low' member is the significand. */
    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the extended double-precision floating-point
| value `a'.
714 *----------------------------------------------------------------------------*/ 715 716 static inline int32_t extractFloatx80Exp( floatx80 a ) 717 { 718 719 return a.high & 0x7FFF; 720 721 } 722 723 /*---------------------------------------------------------------------------- 724 | Returns the sign bit of the extended double-precision floating-point value 725 | `a'. 726 *----------------------------------------------------------------------------*/ 727 728 static inline flag extractFloatx80Sign( floatx80 a ) 729 { 730 731 return a.high>>15; 732 733 } 734 735 /*---------------------------------------------------------------------------- 736 | Normalizes the subnormal extended double-precision floating-point value 737 | represented by the denormalized significand `aSig'. The normalized exponent 738 | and significand are stored at the locations pointed to by `zExpPtr' and 739 | `zSigPtr', respectively. 740 *----------------------------------------------------------------------------*/ 741 742 static void 743 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 744 { 745 int8_t shiftCount; 746 747 shiftCount = countLeadingZeros64( aSig ); 748 *zSigPtr = aSig<<shiftCount; 749 *zExpPtr = 1 - shiftCount; 750 751 } 752 753 /*---------------------------------------------------------------------------- 754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 755 | extended double-precision floating-point value, returning the result. 
756 *----------------------------------------------------------------------------*/ 757 758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 759 { 760 floatx80 z; 761 762 z.low = zSig; 763 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 764 return z; 765 766 } 767 768 /*---------------------------------------------------------------------------- 769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 770 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 771 | and returns the proper extended double-precision floating-point value 772 | corresponding to the abstract input. Ordinarily, the abstract value is 773 | rounded and packed into the extended double-precision format, with the 774 | inexact exception raised if the abstract input cannot be represented 775 | exactly. However, if the abstract value is too large, the overflow and 776 | inexact exceptions are raised and an infinity or maximal finite value is 777 | returned. If the abstract value is too small, the input value is rounded to 778 | a subnormal number, and the underflow and inexact exceptions are raised if 779 | the abstract input cannot be represented exactly as a subnormal extended 780 | double-precision floating-point number. 781 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 782 | number of bits as single or double precision, respectively. Otherwise, the 783 | result is rounded to the full precision of the extended double-precision 784 | format. 785 | The input significand must be normalized or smaller. If the input 786 | significand is not normalized, `zExp' must be 0; in that case, the result 787 | returned is a subnormal number, and it must not require rounding. The 788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 789 | Floating-Point Arithmetic. 
790 *----------------------------------------------------------------------------*/ 791 792 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 793 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 794 float_status *status) 795 { 796 int8_t roundingMode; 797 flag roundNearestEven, increment, isTiny; 798 int64_t roundIncrement, roundMask, roundBits; 799 800 roundingMode = status->float_rounding_mode; 801 roundNearestEven = ( roundingMode == float_round_nearest_even ); 802 if ( roundingPrecision == 80 ) goto precision80; 803 if ( roundingPrecision == 64 ) { 804 roundIncrement = LIT64( 0x0000000000000400 ); 805 roundMask = LIT64( 0x00000000000007FF ); 806 } 807 else if ( roundingPrecision == 32 ) { 808 roundIncrement = LIT64( 0x0000008000000000 ); 809 roundMask = LIT64( 0x000000FFFFFFFFFF ); 810 } 811 else { 812 goto precision80; 813 } 814 zSig0 |= ( zSig1 != 0 ); 815 switch (roundingMode) { 816 case float_round_nearest_even: 817 case float_round_ties_away: 818 break; 819 case float_round_to_zero: 820 roundIncrement = 0; 821 break; 822 case float_round_up: 823 roundIncrement = zSign ? 0 : roundMask; 824 break; 825 case float_round_down: 826 roundIncrement = zSign ? 
roundMask : 0; 827 break; 828 default: 829 abort(); 830 } 831 roundBits = zSig0 & roundMask; 832 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 833 if ( ( 0x7FFE < zExp ) 834 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 835 ) { 836 goto overflow; 837 } 838 if ( zExp <= 0 ) { 839 if (status->flush_to_zero) { 840 float_raise(float_flag_output_denormal, status); 841 return packFloatx80(zSign, 0, 0); 842 } 843 isTiny = 844 (status->float_detect_tininess 845 == float_tininess_before_rounding) 846 || ( zExp < 0 ) 847 || ( zSig0 <= zSig0 + roundIncrement ); 848 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 849 zExp = 0; 850 roundBits = zSig0 & roundMask; 851 if (isTiny && roundBits) { 852 float_raise(float_flag_underflow, status); 853 } 854 if (roundBits) { 855 status->float_exception_flags |= float_flag_inexact; 856 } 857 zSig0 += roundIncrement; 858 if ( (int64_t) zSig0 < 0 ) zExp = 1; 859 roundIncrement = roundMask + 1; 860 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 861 roundMask |= roundIncrement; 862 } 863 zSig0 &= ~ roundMask; 864 return packFloatx80( zSign, zExp, zSig0 ); 865 } 866 } 867 if (roundBits) { 868 status->float_exception_flags |= float_flag_inexact; 869 } 870 zSig0 += roundIncrement; 871 if ( zSig0 < roundIncrement ) { 872 ++zExp; 873 zSig0 = LIT64( 0x8000000000000000 ); 874 } 875 roundIncrement = roundMask + 1; 876 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 877 roundMask |= roundIncrement; 878 } 879 zSig0 &= ~ roundMask; 880 if ( zSig0 == 0 ) zExp = 0; 881 return packFloatx80( zSign, zExp, zSig0 ); 882 precision80: 883 switch (roundingMode) { 884 case float_round_nearest_even: 885 case float_round_ties_away: 886 increment = ((int64_t)zSig1 < 0); 887 break; 888 case float_round_to_zero: 889 increment = 0; 890 break; 891 case float_round_up: 892 increment = !zSign && zSig1; 893 break; 894 case float_round_down: 895 increment = zSign && zSig1; 896 break; 897 default: 898 abort(); 899 } 900 if ( 
0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 901 if ( ( 0x7FFE < zExp ) 902 || ( ( zExp == 0x7FFE ) 903 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 904 && increment 905 ) 906 ) { 907 roundMask = 0; 908 overflow: 909 float_raise(float_flag_overflow | float_flag_inexact, status); 910 if ( ( roundingMode == float_round_to_zero ) 911 || ( zSign && ( roundingMode == float_round_up ) ) 912 || ( ! zSign && ( roundingMode == float_round_down ) ) 913 ) { 914 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 915 } 916 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 917 } 918 if ( zExp <= 0 ) { 919 isTiny = 920 (status->float_detect_tininess 921 == float_tininess_before_rounding) 922 || ( zExp < 0 ) 923 || ! increment 924 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 925 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 926 zExp = 0; 927 if (isTiny && zSig1) { 928 float_raise(float_flag_underflow, status); 929 } 930 if (zSig1) { 931 status->float_exception_flags |= float_flag_inexact; 932 } 933 switch (roundingMode) { 934 case float_round_nearest_even: 935 case float_round_ties_away: 936 increment = ((int64_t)zSig1 < 0); 937 break; 938 case float_round_to_zero: 939 increment = 0; 940 break; 941 case float_round_up: 942 increment = !zSign && zSig1; 943 break; 944 case float_round_down: 945 increment = zSign && zSig1; 946 break; 947 default: 948 abort(); 949 } 950 if ( increment ) { 951 ++zSig0; 952 zSig0 &= 953 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 954 if ( (int64_t) zSig0 < 0 ) zExp = 1; 955 } 956 return packFloatx80( zSign, zExp, zSig0 ); 957 } 958 } 959 if (zSig1) { 960 status->float_exception_flags |= float_flag_inexact; 961 } 962 if ( increment ) { 963 ++zSig0; 964 if ( zSig0 == 0 ) { 965 ++zExp; 966 zSig0 = LIT64( 0x8000000000000000 ); 967 } 968 else { 969 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 970 } 971 } 972 else { 973 if ( zSig0 == 0 ) zExp = 0; 974 } 975 return packFloatx80( zSign, zExp, 
zSig0 ); 976 977 } 978 979 /*---------------------------------------------------------------------------- 980 | Takes an abstract floating-point value having sign `zSign', exponent 981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 982 | and returns the proper extended double-precision floating-point value 983 | corresponding to the abstract input. This routine is just like 984 | `roundAndPackFloatx80' except that the input significand does not have to be 985 | normalized. 986 *----------------------------------------------------------------------------*/ 987 988 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 989 flag zSign, int32_t zExp, 990 uint64_t zSig0, uint64_t zSig1, 991 float_status *status) 992 { 993 int8_t shiftCount; 994 995 if ( zSig0 == 0 ) { 996 zSig0 = zSig1; 997 zSig1 = 0; 998 zExp -= 64; 999 } 1000 shiftCount = countLeadingZeros64( zSig0 ); 1001 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1002 zExp -= shiftCount; 1003 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 1004 zSig0, zSig1, status); 1005 1006 } 1007 1008 /*---------------------------------------------------------------------------- 1009 | Returns the least-significant 64 fraction bits of the quadruple-precision 1010 | floating-point value `a'. 1011 *----------------------------------------------------------------------------*/ 1012 1013 static inline uint64_t extractFloat128Frac1( float128 a ) 1014 { 1015 1016 return a.low; 1017 1018 } 1019 1020 /*---------------------------------------------------------------------------- 1021 | Returns the most-significant 48 fraction bits of the quadruple-precision 1022 | floating-point value `a'. 
1023 *----------------------------------------------------------------------------*/ 1024 1025 static inline uint64_t extractFloat128Frac0( float128 a ) 1026 { 1027 1028 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1029 1030 } 1031 1032 /*---------------------------------------------------------------------------- 1033 | Returns the exponent bits of the quadruple-precision floating-point value 1034 | `a'. 1035 *----------------------------------------------------------------------------*/ 1036 1037 static inline int32_t extractFloat128Exp( float128 a ) 1038 { 1039 1040 return ( a.high>>48 ) & 0x7FFF; 1041 1042 } 1043 1044 /*---------------------------------------------------------------------------- 1045 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1046 *----------------------------------------------------------------------------*/ 1047 1048 static inline flag extractFloat128Sign( float128 a ) 1049 { 1050 1051 return a.high>>63; 1052 1053 } 1054 1055 /*---------------------------------------------------------------------------- 1056 | Normalizes the subnormal quadruple-precision floating-point value 1057 | represented by the denormalized significand formed by the concatenation of 1058 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1059 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1060 | significand are stored at the location pointed to by `zSig0Ptr', and the 1061 | least significant 64 bits of the normalized significand are stored at the 1062 | location pointed to by `zSig1Ptr'. 
1063 *----------------------------------------------------------------------------*/ 1064 1065 static void 1066 normalizeFloat128Subnormal( 1067 uint64_t aSig0, 1068 uint64_t aSig1, 1069 int32_t *zExpPtr, 1070 uint64_t *zSig0Ptr, 1071 uint64_t *zSig1Ptr 1072 ) 1073 { 1074 int8_t shiftCount; 1075 1076 if ( aSig0 == 0 ) { 1077 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1078 if ( shiftCount < 0 ) { 1079 *zSig0Ptr = aSig1>>( - shiftCount ); 1080 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1081 } 1082 else { 1083 *zSig0Ptr = aSig1<<shiftCount; 1084 *zSig1Ptr = 0; 1085 } 1086 *zExpPtr = - shiftCount - 63; 1087 } 1088 else { 1089 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1090 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1091 *zExpPtr = 1 - shiftCount; 1092 } 1093 1094 } 1095 1096 /*---------------------------------------------------------------------------- 1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1099 | floating-point value, returning the result. After being shifted into the 1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1101 | added together to form the most significant 32 bits of the result. This 1102 | means that any integer portion of `zSig0' will be added into the exponent. 1103 | Since a properly normalized significand will have an integer portion equal 1104 | to 1, the `zExp' input should be 1 less than the desired result exponent 1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1106 | significand. 
1107 *----------------------------------------------------------------------------*/ 1108 1109 static inline float128 1110 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1111 { 1112 float128 z; 1113 1114 z.low = zSig1; 1115 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1116 return z; 1117 1118 } 1119 1120 /*---------------------------------------------------------------------------- 1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1122 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1123 | and `zSig2', and returns the proper quadruple-precision floating-point value 1124 | corresponding to the abstract input. Ordinarily, the abstract value is 1125 | simply rounded and packed into the quadruple-precision format, with the 1126 | inexact exception raised if the abstract input cannot be represented 1127 | exactly. However, if the abstract value is too large, the overflow and 1128 | inexact exceptions are raised and an infinity or maximal finite value is 1129 | returned. If the abstract value is too small, the input value is rounded to 1130 | a subnormal number, and the underflow and inexact exceptions are raised if 1131 | the abstract input cannot be represented exactly as a subnormal quadruple- 1132 | precision floating-point number. 1133 | The input significand must be normalized or smaller. If the input 1134 | significand is not normalized, `zExp' must be 0; in that case, the result 1135 | returned is a subnormal number, and it must not require rounding. In the 1136 | usual case that the input significand is normalized, `zExp' must be 1 less 1137 | than the ``true'' floating-point exponent. The handling of underflow and 1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1139 *----------------------------------------------------------------------------*/ 1140 1141 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 1142 uint64_t zSig0, uint64_t zSig1, 1143 uint64_t zSig2, float_status *status) 1144 { 1145 int8_t roundingMode; 1146 flag roundNearestEven, increment, isTiny; 1147 1148 roundingMode = status->float_rounding_mode; 1149 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1150 switch (roundingMode) { 1151 case float_round_nearest_even: 1152 case float_round_ties_away: 1153 increment = ((int64_t)zSig2 < 0); 1154 break; 1155 case float_round_to_zero: 1156 increment = 0; 1157 break; 1158 case float_round_up: 1159 increment = !zSign && zSig2; 1160 break; 1161 case float_round_down: 1162 increment = zSign && zSig2; 1163 break; 1164 case float_round_to_odd: 1165 increment = !(zSig1 & 0x1) && zSig2; 1166 break; 1167 default: 1168 abort(); 1169 } 1170 if ( 0x7FFD <= (uint32_t) zExp ) { 1171 if ( ( 0x7FFD < zExp ) 1172 || ( ( zExp == 0x7FFD ) 1173 && eq128( 1174 LIT64( 0x0001FFFFFFFFFFFF ), 1175 LIT64( 0xFFFFFFFFFFFFFFFF ), 1176 zSig0, 1177 zSig1 1178 ) 1179 && increment 1180 ) 1181 ) { 1182 float_raise(float_flag_overflow | float_flag_inexact, status); 1183 if ( ( roundingMode == float_round_to_zero ) 1184 || ( zSign && ( roundingMode == float_round_up ) ) 1185 || ( ! zSign && ( roundingMode == float_round_down ) ) 1186 || (roundingMode == float_round_to_odd) 1187 ) { 1188 return 1189 packFloat128( 1190 zSign, 1191 0x7FFE, 1192 LIT64( 0x0000FFFFFFFFFFFF ), 1193 LIT64( 0xFFFFFFFFFFFFFFFF ) 1194 ); 1195 } 1196 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1197 } 1198 if ( zExp < 0 ) { 1199 if (status->flush_to_zero) { 1200 float_raise(float_flag_output_denormal, status); 1201 return packFloat128(zSign, 0, 0, 0); 1202 } 1203 isTiny = 1204 (status->float_detect_tininess 1205 == float_tininess_before_rounding) 1206 || ( zExp < -1 ) 1207 || ! 
increment 1208 || lt128( 1209 zSig0, 1210 zSig1, 1211 LIT64( 0x0001FFFFFFFFFFFF ), 1212 LIT64( 0xFFFFFFFFFFFFFFFF ) 1213 ); 1214 shift128ExtraRightJamming( 1215 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1216 zExp = 0; 1217 if (isTiny && zSig2) { 1218 float_raise(float_flag_underflow, status); 1219 } 1220 switch (roundingMode) { 1221 case float_round_nearest_even: 1222 case float_round_ties_away: 1223 increment = ((int64_t)zSig2 < 0); 1224 break; 1225 case float_round_to_zero: 1226 increment = 0; 1227 break; 1228 case float_round_up: 1229 increment = !zSign && zSig2; 1230 break; 1231 case float_round_down: 1232 increment = zSign && zSig2; 1233 break; 1234 case float_round_to_odd: 1235 increment = !(zSig1 & 0x1) && zSig2; 1236 break; 1237 default: 1238 abort(); 1239 } 1240 } 1241 } 1242 if (zSig2) { 1243 status->float_exception_flags |= float_flag_inexact; 1244 } 1245 if ( increment ) { 1246 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1247 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1248 } 1249 else { 1250 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1251 } 1252 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1253 1254 } 1255 1256 /*---------------------------------------------------------------------------- 1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1259 | returns the proper quadruple-precision floating-point value corresponding 1260 | to the abstract input. This routine is just like `roundAndPackFloat128' 1261 | except that the input significand has fewer bits and does not have to be 1262 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1263 | point exponent. 
1264 *----------------------------------------------------------------------------*/ 1265 1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1267 uint64_t zSig0, uint64_t zSig1, 1268 float_status *status) 1269 { 1270 int8_t shiftCount; 1271 uint64_t zSig2; 1272 1273 if ( zSig0 == 0 ) { 1274 zSig0 = zSig1; 1275 zSig1 = 0; 1276 zExp -= 64; 1277 } 1278 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1279 if ( 0 <= shiftCount ) { 1280 zSig2 = 0; 1281 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1282 } 1283 else { 1284 shift128ExtraRightJamming( 1285 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1286 } 1287 zExp -= shiftCount; 1288 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1289 1290 } 1291 1292 /*---------------------------------------------------------------------------- 1293 | Returns the result of converting the 32-bit two's complement integer `a' 1294 | to the single-precision floating-point format. The conversion is performed 1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1296 *----------------------------------------------------------------------------*/ 1297 1298 float32 int32_to_float32(int32_t a, float_status *status) 1299 { 1300 flag zSign; 1301 1302 if ( a == 0 ) return float32_zero; 1303 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1304 zSign = ( a < 0 ); 1305 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1306 } 1307 1308 /*---------------------------------------------------------------------------- 1309 | Returns the result of converting the 32-bit two's complement integer `a' 1310 | to the double-precision floating-point format. The conversion is performed 1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1312 *----------------------------------------------------------------------------*/ 1313 1314 float64 int32_to_float64(int32_t a, float_status *status) 1315 { 1316 flag zSign; 1317 uint32_t absA; 1318 int8_t shiftCount; 1319 uint64_t zSig; 1320 1321 if ( a == 0 ) return float64_zero; 1322 zSign = ( a < 0 ); 1323 absA = zSign ? - a : a; 1324 shiftCount = countLeadingZeros32( absA ) + 21; 1325 zSig = absA; 1326 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1327 1328 } 1329 1330 /*---------------------------------------------------------------------------- 1331 | Returns the result of converting the 32-bit two's complement integer `a' 1332 | to the extended double-precision floating-point format. The conversion 1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1334 | Arithmetic. 1335 *----------------------------------------------------------------------------*/ 1336 1337 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1338 { 1339 flag zSign; 1340 uint32_t absA; 1341 int8_t shiftCount; 1342 uint64_t zSig; 1343 1344 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1345 zSign = ( a < 0 ); 1346 absA = zSign ? - a : a; 1347 shiftCount = countLeadingZeros32( absA ) + 32; 1348 zSig = absA; 1349 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1350 1351 } 1352 1353 /*---------------------------------------------------------------------------- 1354 | Returns the result of converting the 32-bit two's complement integer `a' to 1355 | the quadruple-precision floating-point format. The conversion is performed 1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1357 *----------------------------------------------------------------------------*/ 1358 1359 float128 int32_to_float128(int32_t a, float_status *status) 1360 { 1361 flag zSign; 1362 uint32_t absA; 1363 int8_t shiftCount; 1364 uint64_t zSig0; 1365 1366 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1367 zSign = ( a < 0 ); 1368 absA = zSign ? - a : a; 1369 shiftCount = countLeadingZeros32( absA ) + 17; 1370 zSig0 = absA; 1371 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1372 1373 } 1374 1375 /*---------------------------------------------------------------------------- 1376 | Returns the result of converting the 64-bit two's complement integer `a' 1377 | to the single-precision floating-point format. The conversion is performed 1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1379 *----------------------------------------------------------------------------*/ 1380 1381 float32 int64_to_float32(int64_t a, float_status *status) 1382 { 1383 flag zSign; 1384 uint64_t absA; 1385 int8_t shiftCount; 1386 1387 if ( a == 0 ) return float32_zero; 1388 zSign = ( a < 0 ); 1389 absA = zSign ? - a : a; 1390 shiftCount = countLeadingZeros64( absA ) - 40; 1391 if ( 0 <= shiftCount ) { 1392 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1393 } 1394 else { 1395 shiftCount += 7; 1396 if ( shiftCount < 0 ) { 1397 shift64RightJamming( absA, - shiftCount, &absA ); 1398 } 1399 else { 1400 absA <<= shiftCount; 1401 } 1402 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1403 } 1404 1405 } 1406 1407 /*---------------------------------------------------------------------------- 1408 | Returns the result of converting the 64-bit two's complement integer `a' 1409 | to the double-precision floating-point format. The conversion is performed 1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1411 *----------------------------------------------------------------------------*/ 1412 1413 float64 int64_to_float64(int64_t a, float_status *status) 1414 { 1415 flag zSign; 1416 1417 if ( a == 0 ) return float64_zero; 1418 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1419 return packFloat64( 1, 0x43E, 0 ); 1420 } 1421 zSign = ( a < 0 ); 1422 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1423 } 1424 1425 /*---------------------------------------------------------------------------- 1426 | Returns the result of converting the 64-bit two's complement integer `a' 1427 | to the extended double-precision floating-point format. The conversion 1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1429 | Arithmetic. 1430 *----------------------------------------------------------------------------*/ 1431 1432 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1433 { 1434 flag zSign; 1435 uint64_t absA; 1436 int8_t shiftCount; 1437 1438 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1439 zSign = ( a < 0 ); 1440 absA = zSign ? - a : a; 1441 shiftCount = countLeadingZeros64( absA ); 1442 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1443 1444 } 1445 1446 /*---------------------------------------------------------------------------- 1447 | Returns the result of converting the 64-bit two's complement integer `a' to 1448 | the quadruple-precision floating-point format. The conversion is performed 1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1450 *----------------------------------------------------------------------------*/ 1451 1452 float128 int64_to_float128(int64_t a, float_status *status) 1453 { 1454 flag zSign; 1455 uint64_t absA; 1456 int8_t shiftCount; 1457 int32_t zExp; 1458 uint64_t zSig0, zSig1; 1459 1460 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1461 zSign = ( a < 0 ); 1462 absA = zSign ? 
- a : a; 1463 shiftCount = countLeadingZeros64( absA ) + 49; 1464 zExp = 0x406E - shiftCount; 1465 if ( 64 <= shiftCount ) { 1466 zSig1 = 0; 1467 zSig0 = absA; 1468 shiftCount -= 64; 1469 } 1470 else { 1471 zSig1 = absA; 1472 zSig0 = 0; 1473 } 1474 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1475 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1476 1477 } 1478 1479 /*---------------------------------------------------------------------------- 1480 | Returns the result of converting the 64-bit unsigned integer `a' 1481 | to the single-precision floating-point format. The conversion is performed 1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1483 *----------------------------------------------------------------------------*/ 1484 1485 float32 uint64_to_float32(uint64_t a, float_status *status) 1486 { 1487 int shiftcount; 1488 1489 if (a == 0) { 1490 return float32_zero; 1491 } 1492 1493 /* Determine (left) shift needed to put first set bit into bit posn 23 1494 * (since packFloat32() expects the binary point between bits 23 and 22); 1495 * this is the fast case for smallish numbers. 1496 */ 1497 shiftcount = countLeadingZeros64(a) - 40; 1498 if (shiftcount >= 0) { 1499 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1500 } 1501 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1502 * expects the binary point between bits 30 and 29, hence the + 7. 1503 */ 1504 shiftcount += 7; 1505 if (shiftcount < 0) { 1506 shift64RightJamming(a, -shiftcount, &a); 1507 } else { 1508 a <<= shiftcount; 1509 } 1510 1511 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1512 } 1513 1514 /*---------------------------------------------------------------------------- 1515 | Returns the result of converting the 64-bit unsigned integer `a' 1516 | to the double-precision floating-point format. The conversion is performed 1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1518 *----------------------------------------------------------------------------*/ 1519 1520 float64 uint64_to_float64(uint64_t a, float_status *status) 1521 { 1522 int exp = 0x43C; 1523 int shiftcount; 1524 1525 if (a == 0) { 1526 return float64_zero; 1527 } 1528 1529 shiftcount = countLeadingZeros64(a) - 1; 1530 if (shiftcount < 0) { 1531 shift64RightJamming(a, -shiftcount, &a); 1532 } else { 1533 a <<= shiftcount; 1534 } 1535 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1536 } 1537 1538 /*---------------------------------------------------------------------------- 1539 | Returns the result of converting the 64-bit unsigned integer `a' 1540 | to the quadruple-precision floating-point format. The conversion is performed 1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1542 *----------------------------------------------------------------------------*/ 1543 1544 float128 uint64_to_float128(uint64_t a, float_status *status) 1545 { 1546 if (a == 0) { 1547 return float128_zero; 1548 } 1549 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1550 } 1551 1552 /*---------------------------------------------------------------------------- 1553 | Returns the result of converting the single-precision floating-point value 1554 | `a' to the 32-bit two's complement integer format. The conversion is 1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1556 | Arithmetic---which means in particular that the conversion is rounded 1557 | according to the current rounding mode. If `a' is a NaN, the largest 1558 | positive integer is returned. Otherwise, if the conversion overflows, the 1559 | largest integer with the same sign as `a' is returned. 
1560 *----------------------------------------------------------------------------*/ 1561 1562 int32_t float32_to_int32(float32 a, float_status *status) 1563 { 1564 flag aSign; 1565 int aExp; 1566 int shiftCount; 1567 uint32_t aSig; 1568 uint64_t aSig64; 1569 1570 a = float32_squash_input_denormal(a, status); 1571 aSig = extractFloat32Frac( a ); 1572 aExp = extractFloat32Exp( a ); 1573 aSign = extractFloat32Sign( a ); 1574 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1575 if ( aExp ) aSig |= 0x00800000; 1576 shiftCount = 0xAF - aExp; 1577 aSig64 = aSig; 1578 aSig64 <<= 32; 1579 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1580 return roundAndPackInt32(aSign, aSig64, status); 1581 1582 } 1583 1584 /*---------------------------------------------------------------------------- 1585 | Returns the result of converting the single-precision floating-point value 1586 | `a' to the 32-bit two's complement integer format. The conversion is 1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1588 | Arithmetic, except that the conversion is always rounded toward zero. 1589 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1590 | the conversion overflows, the largest integer with the same sign as `a' is 1591 | returned. 1592 *----------------------------------------------------------------------------*/ 1593 1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 1595 { 1596 flag aSign; 1597 int aExp; 1598 int shiftCount; 1599 uint32_t aSig; 1600 int32_t z; 1601 a = float32_squash_input_denormal(a, status); 1602 1603 aSig = extractFloat32Frac( a ); 1604 aExp = extractFloat32Exp( a ); 1605 aSign = extractFloat32Sign( a ); 1606 shiftCount = aExp - 0x9E; 1607 if ( 0 <= shiftCount ) { 1608 if ( float32_val(a) != 0xCF000000 ) { 1609 float_raise(float_flag_invalid, status); 1610 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1611 } 1612 return (int32_t) 0x80000000; 1613 } 1614 else if ( aExp <= 0x7E ) { 1615 if (aExp | aSig) { 1616 status->float_exception_flags |= float_flag_inexact; 1617 } 1618 return 0; 1619 } 1620 aSig = ( aSig | 0x00800000 )<<8; 1621 z = aSig>>( - shiftCount ); 1622 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1623 status->float_exception_flags |= float_flag_inexact; 1624 } 1625 if ( aSign ) z = - z; 1626 return z; 1627 1628 } 1629 1630 /*---------------------------------------------------------------------------- 1631 | Returns the result of converting the single-precision floating-point value 1632 | `a' to the 16-bit two's complement integer format. The conversion is 1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1634 | Arithmetic, except that the conversion is always rounded toward zero. 1635 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1636 | the conversion overflows, the largest integer with the same sign as `a' is 1637 | returned. 1638 *----------------------------------------------------------------------------*/ 1639 1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1641 { 1642 flag aSign; 1643 int aExp; 1644 int shiftCount; 1645 uint32_t aSig; 1646 int32_t z; 1647 1648 aSig = extractFloat32Frac( a ); 1649 aExp = extractFloat32Exp( a ); 1650 aSign = extractFloat32Sign( a ); 1651 shiftCount = aExp - 0x8E; 1652 if ( 0 <= shiftCount ) { 1653 if ( float32_val(a) != 0xC7000000 ) { 1654 float_raise(float_flag_invalid, status); 1655 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1656 return 0x7FFF; 1657 } 1658 } 1659 return (int32_t) 0xffff8000; 1660 } 1661 else if ( aExp <= 0x7E ) { 1662 if ( aExp | aSig ) { 1663 status->float_exception_flags |= float_flag_inexact; 1664 } 1665 return 0; 1666 } 1667 shiftCount -= 0x10; 1668 aSig = ( aSig | 0x00800000 )<<8; 1669 z = aSig>>( - shiftCount ); 1670 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1671 status->float_exception_flags |= float_flag_inexact; 1672 } 1673 if ( aSign ) { 1674 z = - z; 1675 } 1676 return z; 1677 1678 } 1679 1680 /*---------------------------------------------------------------------------- 1681 | Returns the result of converting the single-precision floating-point value 1682 | `a' to the 64-bit two's complement integer format. The conversion is 1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1684 | Arithmetic---which means in particular that the conversion is rounded 1685 | according to the current rounding mode. If `a' is a NaN, the largest 1686 | positive integer is returned. Otherwise, if the conversion overflows, the 1687 | largest integer with the same sign as `a' is returned. 1688 *----------------------------------------------------------------------------*/ 1689 1690 int64_t float32_to_int64(float32 a, float_status *status) 1691 { 1692 flag aSign; 1693 int aExp; 1694 int shiftCount; 1695 uint32_t aSig; 1696 uint64_t aSig64, aSigExtra; 1697 a = float32_squash_input_denormal(a, status); 1698 1699 aSig = extractFloat32Frac( a ); 1700 aExp = extractFloat32Exp( a ); 1701 aSign = extractFloat32Sign( a ); 1702 shiftCount = 0xBE - aExp; 1703 if ( shiftCount < 0 ) { 1704 float_raise(float_flag_invalid, status); 1705 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1706 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1707 } 1708 return (int64_t) LIT64( 0x8000000000000000 ); 1709 } 1710 if ( aExp ) aSig |= 0x00800000; 1711 aSig64 = aSig; 1712 aSig64 <<= 40; 1713 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1714 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1715 1716 } 1717 1718 /*---------------------------------------------------------------------------- 1719 | Returns the result of converting the single-precision floating-point value 1720 | `a' to the 64-bit unsigned integer format. The conversion is 1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1722 | Arithmetic---which means in particular that the conversion is rounded 1723 | according to the current rounding mode. If `a' is a NaN, the largest 1724 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1725 | largest unsigned integer is returned. If the 'a' is negative, the result 1726 | is rounded and zero is returned; values that do not round to zero will 1727 | raise the inexact exception flag. 
1728 *----------------------------------------------------------------------------*/ 1729 1730 uint64_t float32_to_uint64(float32 a, float_status *status) 1731 { 1732 flag aSign; 1733 int aExp; 1734 int shiftCount; 1735 uint32_t aSig; 1736 uint64_t aSig64, aSigExtra; 1737 a = float32_squash_input_denormal(a, status); 1738 1739 aSig = extractFloat32Frac(a); 1740 aExp = extractFloat32Exp(a); 1741 aSign = extractFloat32Sign(a); 1742 if ((aSign) && (aExp > 126)) { 1743 float_raise(float_flag_invalid, status); 1744 if (float32_is_any_nan(a)) { 1745 return LIT64(0xFFFFFFFFFFFFFFFF); 1746 } else { 1747 return 0; 1748 } 1749 } 1750 shiftCount = 0xBE - aExp; 1751 if (aExp) { 1752 aSig |= 0x00800000; 1753 } 1754 if (shiftCount < 0) { 1755 float_raise(float_flag_invalid, status); 1756 return LIT64(0xFFFFFFFFFFFFFFFF); 1757 } 1758 1759 aSig64 = aSig; 1760 aSig64 <<= 40; 1761 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1762 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1763 } 1764 1765 /*---------------------------------------------------------------------------- 1766 | Returns the result of converting the single-precision floating-point value 1767 | `a' to the 64-bit unsigned integer format. The conversion is 1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1769 | Arithmetic, except that the conversion is always rounded toward zero. If 1770 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1771 | conversion overflows, the largest unsigned integer is returned. If the 1772 | 'a' is negative, the result is rounded and zero is returned; values that do 1773 | not round to zero will raise the inexact flag. 
1774 *----------------------------------------------------------------------------*/ 1775 1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 1777 { 1778 signed char current_rounding_mode = status->float_rounding_mode; 1779 set_float_rounding_mode(float_round_to_zero, status); 1780 int64_t v = float32_to_uint64(a, status); 1781 set_float_rounding_mode(current_rounding_mode, status); 1782 return v; 1783 } 1784 1785 /*---------------------------------------------------------------------------- 1786 | Returns the result of converting the single-precision floating-point value 1787 | `a' to the 64-bit two's complement integer format. The conversion is 1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1789 | Arithmetic, except that the conversion is always rounded toward zero. If 1790 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1791 | conversion overflows, the largest integer with the same sign as `a' is 1792 | returned. 1793 *----------------------------------------------------------------------------*/ 1794 1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 1796 { 1797 flag aSign; 1798 int aExp; 1799 int shiftCount; 1800 uint32_t aSig; 1801 uint64_t aSig64; 1802 int64_t z; 1803 a = float32_squash_input_denormal(a, status); 1804 1805 aSig = extractFloat32Frac( a ); 1806 aExp = extractFloat32Exp( a ); 1807 aSign = extractFloat32Sign( a ); 1808 shiftCount = aExp - 0xBE; 1809 if ( 0 <= shiftCount ) { 1810 if ( float32_val(a) != 0xDF000000 ) { 1811 float_raise(float_flag_invalid, status); 1812 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1813 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1814 } 1815 } 1816 return (int64_t) LIT64( 0x8000000000000000 ); 1817 } 1818 else if ( aExp <= 0x7E ) { 1819 if (aExp | aSig) { 1820 status->float_exception_flags |= float_flag_inexact; 1821 } 1822 return 0; 1823 } 1824 aSig64 = aSig | 0x00800000; 1825 aSig64 <<= 40; 1826 z = aSig64>>( - shiftCount ); 1827 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1828 status->float_exception_flags |= float_flag_inexact; 1829 } 1830 if ( aSign ) z = - z; 1831 return z; 1832 1833 } 1834 1835 /*---------------------------------------------------------------------------- 1836 | Returns the result of converting the single-precision floating-point value 1837 | `a' to the double-precision floating-point format. The conversion is 1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1839 | Arithmetic. 1840 *----------------------------------------------------------------------------*/ 1841 1842 float64 float32_to_float64(float32 a, float_status *status) 1843 { 1844 flag aSign; 1845 int aExp; 1846 uint32_t aSig; 1847 a = float32_squash_input_denormal(a, status); 1848 1849 aSig = extractFloat32Frac( a ); 1850 aExp = extractFloat32Exp( a ); 1851 aSign = extractFloat32Sign( a ); 1852 if ( aExp == 0xFF ) { 1853 if (aSig) { 1854 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1855 } 1856 return packFloat64( aSign, 0x7FF, 0 ); 1857 } 1858 if ( aExp == 0 ) { 1859 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1860 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1861 --aExp; 1862 } 1863 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1864 1865 } 1866 1867 /*---------------------------------------------------------------------------- 1868 | Returns the result of converting the single-precision floating-point value 1869 | `a' to the extended double-precision floating-point format. 
The conversion 1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1871 | Arithmetic. 1872 *----------------------------------------------------------------------------*/ 1873 1874 floatx80 float32_to_floatx80(float32 a, float_status *status) 1875 { 1876 flag aSign; 1877 int aExp; 1878 uint32_t aSig; 1879 1880 a = float32_squash_input_denormal(a, status); 1881 aSig = extractFloat32Frac( a ); 1882 aExp = extractFloat32Exp( a ); 1883 aSign = extractFloat32Sign( a ); 1884 if ( aExp == 0xFF ) { 1885 if (aSig) { 1886 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1887 } 1888 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1889 } 1890 if ( aExp == 0 ) { 1891 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1892 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1893 } 1894 aSig |= 0x00800000; 1895 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1896 1897 } 1898 1899 /*---------------------------------------------------------------------------- 1900 | Returns the result of converting the single-precision floating-point value 1901 | `a' to the double-precision floating-point format. The conversion is 1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1903 | Arithmetic. 
1904 *----------------------------------------------------------------------------*/ 1905 1906 float128 float32_to_float128(float32 a, float_status *status) 1907 { 1908 flag aSign; 1909 int aExp; 1910 uint32_t aSig; 1911 1912 a = float32_squash_input_denormal(a, status); 1913 aSig = extractFloat32Frac( a ); 1914 aExp = extractFloat32Exp( a ); 1915 aSign = extractFloat32Sign( a ); 1916 if ( aExp == 0xFF ) { 1917 if (aSig) { 1918 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1919 } 1920 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1921 } 1922 if ( aExp == 0 ) { 1923 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1924 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1925 --aExp; 1926 } 1927 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1928 1929 } 1930 1931 /*---------------------------------------------------------------------------- 1932 | Rounds the single-precision floating-point value `a' to an integer, and 1933 | returns the result as a single-precision floating-point value. The 1934 | operation is performed according to the IEC/IEEE Standard for Binary 1935 | Floating-Point Arithmetic. 
1936 *----------------------------------------------------------------------------*/ 1937 1938 float32 float32_round_to_int(float32 a, float_status *status) 1939 { 1940 flag aSign; 1941 int aExp; 1942 uint32_t lastBitMask, roundBitsMask; 1943 uint32_t z; 1944 a = float32_squash_input_denormal(a, status); 1945 1946 aExp = extractFloat32Exp( a ); 1947 if ( 0x96 <= aExp ) { 1948 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1949 return propagateFloat32NaN(a, a, status); 1950 } 1951 return a; 1952 } 1953 if ( aExp <= 0x7E ) { 1954 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1955 status->float_exception_flags |= float_flag_inexact; 1956 aSign = extractFloat32Sign( a ); 1957 switch (status->float_rounding_mode) { 1958 case float_round_nearest_even: 1959 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1960 return packFloat32( aSign, 0x7F, 0 ); 1961 } 1962 break; 1963 case float_round_ties_away: 1964 if (aExp == 0x7E) { 1965 return packFloat32(aSign, 0x7F, 0); 1966 } 1967 break; 1968 case float_round_down: 1969 return make_float32(aSign ? 0xBF800000 : 0); 1970 case float_round_up: 1971 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 1972 } 1973 return packFloat32( aSign, 0, 0 ); 1974 } 1975 lastBitMask = 1; 1976 lastBitMask <<= 0x96 - aExp; 1977 roundBitsMask = lastBitMask - 1; 1978 z = float32_val(a); 1979 switch (status->float_rounding_mode) { 1980 case float_round_nearest_even: 1981 z += lastBitMask>>1; 1982 if ((z & roundBitsMask) == 0) { 1983 z &= ~lastBitMask; 1984 } 1985 break; 1986 case float_round_ties_away: 1987 z += lastBitMask >> 1; 1988 break; 1989 case float_round_to_zero: 1990 break; 1991 case float_round_up: 1992 if (!extractFloat32Sign(make_float32(z))) { 1993 z += roundBitsMask; 1994 } 1995 break; 1996 case float_round_down: 1997 if (extractFloat32Sign(make_float32(z))) { 1998 z += roundBitsMask; 1999 } 2000 break; 2001 default: 2002 abort(); 2003 } 2004 z &= ~ roundBitsMask; 2005 if (z != float32_val(a)) { 2006 status->float_exception_flags |= float_flag_inexact; 2007 } 2008 return make_float32(z); 2009 2010 } 2011 2012 /*---------------------------------------------------------------------------- 2013 | Returns the result of adding the absolute values of the single-precision 2014 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2015 | before being returned. `zSign' is ignored if the result is a NaN. 2016 | The addition is performed according to the IEC/IEEE Standard for Binary 2017 | Floating-Point Arithmetic. 
2018 *----------------------------------------------------------------------------*/ 2019 2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, 2021 float_status *status) 2022 { 2023 int aExp, bExp, zExp; 2024 uint32_t aSig, bSig, zSig; 2025 int expDiff; 2026 2027 aSig = extractFloat32Frac( a ); 2028 aExp = extractFloat32Exp( a ); 2029 bSig = extractFloat32Frac( b ); 2030 bExp = extractFloat32Exp( b ); 2031 expDiff = aExp - bExp; 2032 aSig <<= 6; 2033 bSig <<= 6; 2034 if ( 0 < expDiff ) { 2035 if ( aExp == 0xFF ) { 2036 if (aSig) { 2037 return propagateFloat32NaN(a, b, status); 2038 } 2039 return a; 2040 } 2041 if ( bExp == 0 ) { 2042 --expDiff; 2043 } 2044 else { 2045 bSig |= 0x20000000; 2046 } 2047 shift32RightJamming( bSig, expDiff, &bSig ); 2048 zExp = aExp; 2049 } 2050 else if ( expDiff < 0 ) { 2051 if ( bExp == 0xFF ) { 2052 if (bSig) { 2053 return propagateFloat32NaN(a, b, status); 2054 } 2055 return packFloat32( zSign, 0xFF, 0 ); 2056 } 2057 if ( aExp == 0 ) { 2058 ++expDiff; 2059 } 2060 else { 2061 aSig |= 0x20000000; 2062 } 2063 shift32RightJamming( aSig, - expDiff, &aSig ); 2064 zExp = bExp; 2065 } 2066 else { 2067 if ( aExp == 0xFF ) { 2068 if (aSig | bSig) { 2069 return propagateFloat32NaN(a, b, status); 2070 } 2071 return a; 2072 } 2073 if ( aExp == 0 ) { 2074 if (status->flush_to_zero) { 2075 if (aSig | bSig) { 2076 float_raise(float_flag_output_denormal, status); 2077 } 2078 return packFloat32(zSign, 0, 0); 2079 } 2080 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 2081 } 2082 zSig = 0x40000000 + aSig + bSig; 2083 zExp = aExp; 2084 goto roundAndPack; 2085 } 2086 aSig |= 0x20000000; 2087 zSig = ( aSig + bSig )<<1; 2088 --zExp; 2089 if ( (int32_t) zSig < 0 ) { 2090 zSig = aSig + bSig; 2091 ++zExp; 2092 } 2093 roundAndPack: 2094 return roundAndPackFloat32(zSign, zExp, zSig, status); 2095 2096 } 2097 2098 /*---------------------------------------------------------------------------- 2099 | Returns the result of subtracting the 
absolute values of the single- 2100 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2101 | difference is negated before being returned. `zSign' is ignored if the 2102 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2103 | Standard for Binary Floating-Point Arithmetic. 2104 *----------------------------------------------------------------------------*/ 2105 2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, 2107 float_status *status) 2108 { 2109 int aExp, bExp, zExp; 2110 uint32_t aSig, bSig, zSig; 2111 int expDiff; 2112 2113 aSig = extractFloat32Frac( a ); 2114 aExp = extractFloat32Exp( a ); 2115 bSig = extractFloat32Frac( b ); 2116 bExp = extractFloat32Exp( b ); 2117 expDiff = aExp - bExp; 2118 aSig <<= 7; 2119 bSig <<= 7; 2120 if ( 0 < expDiff ) goto aExpBigger; 2121 if ( expDiff < 0 ) goto bExpBigger; 2122 if ( aExp == 0xFF ) { 2123 if (aSig | bSig) { 2124 return propagateFloat32NaN(a, b, status); 2125 } 2126 float_raise(float_flag_invalid, status); 2127 return float32_default_nan(status); 2128 } 2129 if ( aExp == 0 ) { 2130 aExp = 1; 2131 bExp = 1; 2132 } 2133 if ( bSig < aSig ) goto aBigger; 2134 if ( aSig < bSig ) goto bBigger; 2135 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); 2136 bExpBigger: 2137 if ( bExp == 0xFF ) { 2138 if (bSig) { 2139 return propagateFloat32NaN(a, b, status); 2140 } 2141 return packFloat32( zSign ^ 1, 0xFF, 0 ); 2142 } 2143 if ( aExp == 0 ) { 2144 ++expDiff; 2145 } 2146 else { 2147 aSig |= 0x40000000; 2148 } 2149 shift32RightJamming( aSig, - expDiff, &aSig ); 2150 bSig |= 0x40000000; 2151 bBigger: 2152 zSig = bSig - aSig; 2153 zExp = bExp; 2154 zSign ^= 1; 2155 goto normalizeRoundAndPack; 2156 aExpBigger: 2157 if ( aExp == 0xFF ) { 2158 if (aSig) { 2159 return propagateFloat32NaN(a, b, status); 2160 } 2161 return a; 2162 } 2163 if ( bExp == 0 ) { 2164 --expDiff; 2165 } 2166 else { 2167 bSig |= 0x40000000; 2168 } 2169 shift32RightJamming( bSig, 
expDiff, &bSig ); 2170 aSig |= 0x40000000; 2171 aBigger: 2172 zSig = aSig - bSig; 2173 zExp = aExp; 2174 normalizeRoundAndPack: 2175 --zExp; 2176 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); 2177 2178 } 2179 2180 /*---------------------------------------------------------------------------- 2181 | Returns the result of adding the single-precision floating-point values `a' 2182 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2183 | Binary Floating-Point Arithmetic. 2184 *----------------------------------------------------------------------------*/ 2185 2186 float32 float32_add(float32 a, float32 b, float_status *status) 2187 { 2188 flag aSign, bSign; 2189 a = float32_squash_input_denormal(a, status); 2190 b = float32_squash_input_denormal(b, status); 2191 2192 aSign = extractFloat32Sign( a ); 2193 bSign = extractFloat32Sign( b ); 2194 if ( aSign == bSign ) { 2195 return addFloat32Sigs(a, b, aSign, status); 2196 } 2197 else { 2198 return subFloat32Sigs(a, b, aSign, status); 2199 } 2200 2201 } 2202 2203 /*---------------------------------------------------------------------------- 2204 | Returns the result of subtracting the single-precision floating-point values 2205 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2206 | for Binary Floating-Point Arithmetic. 
2207 *----------------------------------------------------------------------------*/ 2208 2209 float32 float32_sub(float32 a, float32 b, float_status *status) 2210 { 2211 flag aSign, bSign; 2212 a = float32_squash_input_denormal(a, status); 2213 b = float32_squash_input_denormal(b, status); 2214 2215 aSign = extractFloat32Sign( a ); 2216 bSign = extractFloat32Sign( b ); 2217 if ( aSign == bSign ) { 2218 return subFloat32Sigs(a, b, aSign, status); 2219 } 2220 else { 2221 return addFloat32Sigs(a, b, aSign, status); 2222 } 2223 2224 } 2225 2226 /*---------------------------------------------------------------------------- 2227 | Returns the result of multiplying the single-precision floating-point values 2228 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2229 | for Binary Floating-Point Arithmetic. 2230 *----------------------------------------------------------------------------*/ 2231 2232 float32 float32_mul(float32 a, float32 b, float_status *status) 2233 { 2234 flag aSign, bSign, zSign; 2235 int aExp, bExp, zExp; 2236 uint32_t aSig, bSig; 2237 uint64_t zSig64; 2238 uint32_t zSig; 2239 2240 a = float32_squash_input_denormal(a, status); 2241 b = float32_squash_input_denormal(b, status); 2242 2243 aSig = extractFloat32Frac( a ); 2244 aExp = extractFloat32Exp( a ); 2245 aSign = extractFloat32Sign( a ); 2246 bSig = extractFloat32Frac( b ); 2247 bExp = extractFloat32Exp( b ); 2248 bSign = extractFloat32Sign( b ); 2249 zSign = aSign ^ bSign; 2250 if ( aExp == 0xFF ) { 2251 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2252 return propagateFloat32NaN(a, b, status); 2253 } 2254 if ( ( bExp | bSig ) == 0 ) { 2255 float_raise(float_flag_invalid, status); 2256 return float32_default_nan(status); 2257 } 2258 return packFloat32( zSign, 0xFF, 0 ); 2259 } 2260 if ( bExp == 0xFF ) { 2261 if (bSig) { 2262 return propagateFloat32NaN(a, b, status); 2263 } 2264 if ( ( aExp | aSig ) == 0 ) { 2265 float_raise(float_flag_invalid, status); 2266 return 
float32_default_nan(status); 2267 } 2268 return packFloat32( zSign, 0xFF, 0 ); 2269 } 2270 if ( aExp == 0 ) { 2271 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2272 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2273 } 2274 if ( bExp == 0 ) { 2275 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2276 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2277 } 2278 zExp = aExp + bExp - 0x7F; 2279 aSig = ( aSig | 0x00800000 )<<7; 2280 bSig = ( bSig | 0x00800000 )<<8; 2281 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2282 zSig = zSig64; 2283 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2284 zSig <<= 1; 2285 --zExp; 2286 } 2287 return roundAndPackFloat32(zSign, zExp, zSig, status); 2288 2289 } 2290 2291 /*---------------------------------------------------------------------------- 2292 | Returns the result of dividing the single-precision floating-point value `a' 2293 | by the corresponding value `b'. The operation is performed according to the 2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2295 *----------------------------------------------------------------------------*/ 2296 2297 float32 float32_div(float32 a, float32 b, float_status *status) 2298 { 2299 flag aSign, bSign, zSign; 2300 int aExp, bExp, zExp; 2301 uint32_t aSig, bSig, zSig; 2302 a = float32_squash_input_denormal(a, status); 2303 b = float32_squash_input_denormal(b, status); 2304 2305 aSig = extractFloat32Frac( a ); 2306 aExp = extractFloat32Exp( a ); 2307 aSign = extractFloat32Sign( a ); 2308 bSig = extractFloat32Frac( b ); 2309 bExp = extractFloat32Exp( b ); 2310 bSign = extractFloat32Sign( b ); 2311 zSign = aSign ^ bSign; 2312 if ( aExp == 0xFF ) { 2313 if (aSig) { 2314 return propagateFloat32NaN(a, b, status); 2315 } 2316 if ( bExp == 0xFF ) { 2317 if (bSig) { 2318 return propagateFloat32NaN(a, b, status); 2319 } 2320 float_raise(float_flag_invalid, status); 2321 return float32_default_nan(status); 2322 } 2323 return packFloat32( zSign, 0xFF, 0 ); 2324 } 2325 if ( bExp == 0xFF ) { 2326 if (bSig) { 2327 return propagateFloat32NaN(a, b, status); 2328 } 2329 return packFloat32( zSign, 0, 0 ); 2330 } 2331 if ( bExp == 0 ) { 2332 if ( bSig == 0 ) { 2333 if ( ( aExp | aSig ) == 0 ) { 2334 float_raise(float_flag_invalid, status); 2335 return float32_default_nan(status); 2336 } 2337 float_raise(float_flag_divbyzero, status); 2338 return packFloat32( zSign, 0xFF, 0 ); 2339 } 2340 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2341 } 2342 if ( aExp == 0 ) { 2343 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2344 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2345 } 2346 zExp = aExp - bExp + 0x7D; 2347 aSig = ( aSig | 0x00800000 )<<7; 2348 bSig = ( bSig | 0x00800000 )<<8; 2349 if ( bSig <= ( aSig + aSig ) ) { 2350 aSig >>= 1; 2351 ++zExp; 2352 } 2353 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2354 if ( ( zSig & 0x3F ) == 0 ) { 2355 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2356 } 2357 return roundAndPackFloat32(zSign, zExp, zSig, status); 2358 2359 } 2360 2361 
/*---------------------------------------------------------------------------- 2362 | Returns the remainder of the single-precision floating-point value `a' 2363 | with respect to the corresponding value `b'. The operation is performed 2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2365 *----------------------------------------------------------------------------*/ 2366 2367 float32 float32_rem(float32 a, float32 b, float_status *status) 2368 { 2369 flag aSign, zSign; 2370 int aExp, bExp, expDiff; 2371 uint32_t aSig, bSig; 2372 uint32_t q; 2373 uint64_t aSig64, bSig64, q64; 2374 uint32_t alternateASig; 2375 int32_t sigMean; 2376 a = float32_squash_input_denormal(a, status); 2377 b = float32_squash_input_denormal(b, status); 2378 2379 aSig = extractFloat32Frac( a ); 2380 aExp = extractFloat32Exp( a ); 2381 aSign = extractFloat32Sign( a ); 2382 bSig = extractFloat32Frac( b ); 2383 bExp = extractFloat32Exp( b ); 2384 if ( aExp == 0xFF ) { 2385 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2386 return propagateFloat32NaN(a, b, status); 2387 } 2388 float_raise(float_flag_invalid, status); 2389 return float32_default_nan(status); 2390 } 2391 if ( bExp == 0xFF ) { 2392 if (bSig) { 2393 return propagateFloat32NaN(a, b, status); 2394 } 2395 return a; 2396 } 2397 if ( bExp == 0 ) { 2398 if ( bSig == 0 ) { 2399 float_raise(float_flag_invalid, status); 2400 return float32_default_nan(status); 2401 } 2402 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2403 } 2404 if ( aExp == 0 ) { 2405 if ( aSig == 0 ) return a; 2406 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2407 } 2408 expDiff = aExp - bExp; 2409 aSig |= 0x00800000; 2410 bSig |= 0x00800000; 2411 if ( expDiff < 32 ) { 2412 aSig <<= 8; 2413 bSig <<= 8; 2414 if ( expDiff < 0 ) { 2415 if ( expDiff < -1 ) return a; 2416 aSig >>= 1; 2417 } 2418 q = ( bSig <= aSig ); 2419 if ( q ) aSig -= bSig; 2420 if ( 0 < expDiff ) { 2421 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2422 q >>= 32 - expDiff; 
2423 bSig >>= 2; 2424 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2425 } 2426 else { 2427 aSig >>= 2; 2428 bSig >>= 2; 2429 } 2430 } 2431 else { 2432 if ( bSig <= aSig ) aSig -= bSig; 2433 aSig64 = ( (uint64_t) aSig )<<40; 2434 bSig64 = ( (uint64_t) bSig )<<40; 2435 expDiff -= 64; 2436 while ( 0 < expDiff ) { 2437 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2438 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2439 aSig64 = - ( ( bSig * q64 )<<38 ); 2440 expDiff -= 62; 2441 } 2442 expDiff += 64; 2443 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2444 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2445 q = q64>>( 64 - expDiff ); 2446 bSig <<= 6; 2447 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2448 } 2449 do { 2450 alternateASig = aSig; 2451 ++q; 2452 aSig -= bSig; 2453 } while ( 0 <= (int32_t) aSig ); 2454 sigMean = aSig + alternateASig; 2455 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2456 aSig = alternateASig; 2457 } 2458 zSign = ( (int32_t) aSig < 0 ); 2459 if ( zSign ) aSig = - aSig; 2460 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 2461 } 2462 2463 /*---------------------------------------------------------------------------- 2464 | Returns the result of multiplying the single-precision floating-point values 2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the 2466 | multiplication. The operation is performed according to the IEC/IEEE 2467 | Standard for Binary Floating-Point Arithmetic 754-2008. 2468 | The flags argument allows the caller to select negation of the 2469 | addend, the intermediate product, or the final result. (The difference 2470 | between this and having the caller do a separate negation is that negating 2471 | externally will flip the sign bit on NaNs.) 
2472 *----------------------------------------------------------------------------*/ 2473 2474 float32 float32_muladd(float32 a, float32 b, float32 c, int flags, 2475 float_status *status) 2476 { 2477 flag aSign, bSign, cSign, zSign; 2478 int aExp, bExp, cExp, pExp, zExp, expDiff; 2479 uint32_t aSig, bSig, cSig; 2480 flag pInf, pZero, pSign; 2481 uint64_t pSig64, cSig64, zSig64; 2482 uint32_t pSig; 2483 int shiftcount; 2484 flag signflip, infzero; 2485 2486 a = float32_squash_input_denormal(a, status); 2487 b = float32_squash_input_denormal(b, status); 2488 c = float32_squash_input_denormal(c, status); 2489 aSig = extractFloat32Frac(a); 2490 aExp = extractFloat32Exp(a); 2491 aSign = extractFloat32Sign(a); 2492 bSig = extractFloat32Frac(b); 2493 bExp = extractFloat32Exp(b); 2494 bSign = extractFloat32Sign(b); 2495 cSig = extractFloat32Frac(c); 2496 cExp = extractFloat32Exp(c); 2497 cSign = extractFloat32Sign(c); 2498 2499 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || 2500 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); 2501 2502 /* It is implementation-defined whether the cases of (0,inf,qnan) 2503 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 2504 * they return if they do), so we have to hand this information 2505 * off to the target-specific pick-a-NaN routine. 2506 */ 2507 if (((aExp == 0xff) && aSig) || 2508 ((bExp == 0xff) && bSig) || 2509 ((cExp == 0xff) && cSig)) { 2510 return propagateFloat32MulAddNaN(a, b, c, infzero, status); 2511 } 2512 2513 if (infzero) { 2514 float_raise(float_flag_invalid, status); 2515 return float32_default_nan(status); 2516 } 2517 2518 if (flags & float_muladd_negate_c) { 2519 cSign ^= 1; 2520 } 2521 2522 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 2523 2524 /* Work out the sign and type of the product */ 2525 pSign = aSign ^ bSign; 2526 if (flags & float_muladd_negate_product) { 2527 pSign ^= 1; 2528 } 2529 pInf = (aExp == 0xff) || (bExp == 0xff); 2530 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 2531 2532 if (cExp == 0xff) { 2533 if (pInf && (pSign ^ cSign)) { 2534 /* addition of opposite-signed infinities => InvalidOperation */ 2535 float_raise(float_flag_invalid, status); 2536 return float32_default_nan(status); 2537 } 2538 /* Otherwise generate an infinity of the same sign */ 2539 return packFloat32(cSign ^ signflip, 0xff, 0); 2540 } 2541 2542 if (pInf) { 2543 return packFloat32(pSign ^ signflip, 0xff, 0); 2544 } 2545 2546 if (pZero) { 2547 if (cExp == 0) { 2548 if (cSig == 0) { 2549 /* Adding two exact zeroes */ 2550 if (pSign == cSign) { 2551 zSign = pSign; 2552 } else if (status->float_rounding_mode == float_round_down) { 2553 zSign = 1; 2554 } else { 2555 zSign = 0; 2556 } 2557 return packFloat32(zSign ^ signflip, 0, 0); 2558 } 2559 /* Exact zero plus a denorm */ 2560 if (status->flush_to_zero) { 2561 float_raise(float_flag_output_denormal, status); 2562 return packFloat32(cSign ^ signflip, 0, 0); 2563 } 2564 } 2565 /* Zero plus something non-zero : just return the something */ 2566 if (flags & float_muladd_halve_result) { 2567 if (cExp == 0) { 2568 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2569 } 2570 /* Subtract one to halve, and one again because roundAndPackFloat32 2571 * wants one less than the true exponent. 2572 */ 2573 cExp -= 2; 2574 cSig = (cSig | 0x00800000) << 7; 2575 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status); 2576 } 2577 return packFloat32(cSign ^ signflip, cExp, cSig); 2578 } 2579 2580 if (aExp == 0) { 2581 normalizeFloat32Subnormal(aSig, &aExp, &aSig); 2582 } 2583 if (bExp == 0) { 2584 normalizeFloat32Subnormal(bSig, &bExp, &bSig); 2585 } 2586 2587 /* Calculate the actual result a * b + c */ 2588 2589 /* Multiply first; this is easy. 
*/ 2590 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f 2591 * because we want the true exponent, not the "one-less-than" 2592 * flavour that roundAndPackFloat32() takes. 2593 */ 2594 pExp = aExp + bExp - 0x7e; 2595 aSig = (aSig | 0x00800000) << 7; 2596 bSig = (bSig | 0x00800000) << 8; 2597 pSig64 = (uint64_t)aSig * bSig; 2598 if ((int64_t)(pSig64 << 1) >= 0) { 2599 pSig64 <<= 1; 2600 pExp--; 2601 } 2602 2603 zSign = pSign ^ signflip; 2604 2605 /* Now pSig64 is the significand of the multiply, with the explicit bit in 2606 * position 62. 2607 */ 2608 if (cExp == 0) { 2609 if (!cSig) { 2610 /* Throw out the special case of c being an exact zero now */ 2611 shift64RightJamming(pSig64, 32, &pSig64); 2612 pSig = pSig64; 2613 if (flags & float_muladd_halve_result) { 2614 pExp--; 2615 } 2616 return roundAndPackFloat32(zSign, pExp - 1, 2617 pSig, status); 2618 } 2619 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2620 } 2621 2622 cSig64 = (uint64_t)cSig << (62 - 23); 2623 cSig64 |= LIT64(0x4000000000000000); 2624 expDiff = pExp - cExp; 2625 2626 if (pSign == cSign) { 2627 /* Addition */ 2628 if (expDiff > 0) { 2629 /* scale c to match p */ 2630 shift64RightJamming(cSig64, expDiff, &cSig64); 2631 zExp = pExp; 2632 } else if (expDiff < 0) { 2633 /* scale p to match c */ 2634 shift64RightJamming(pSig64, -expDiff, &pSig64); 2635 zExp = cExp; 2636 } else { 2637 /* no scaling needed */ 2638 zExp = cExp; 2639 } 2640 /* Add significands and make sure explicit bit ends up in posn 62 */ 2641 zSig64 = pSig64 + cSig64; 2642 if ((int64_t)zSig64 < 0) { 2643 shift64RightJamming(zSig64, 1, &zSig64); 2644 } else { 2645 zExp--; 2646 } 2647 } else { 2648 /* Subtraction */ 2649 if (expDiff > 0) { 2650 shift64RightJamming(cSig64, expDiff, &cSig64); 2651 zSig64 = pSig64 - cSig64; 2652 zExp = pExp; 2653 } else if (expDiff < 0) { 2654 shift64RightJamming(pSig64, -expDiff, &pSig64); 2655 zSig64 = cSig64 - pSig64; 2656 zExp = cExp; 2657 zSign ^= 1; 2658 } else { 2659 zExp = pExp; 2660 
if (cSig64 < pSig64) { 2661 zSig64 = pSig64 - cSig64; 2662 } else if (pSig64 < cSig64) { 2663 zSig64 = cSig64 - pSig64; 2664 zSign ^= 1; 2665 } else { 2666 /* Exact zero */ 2667 zSign = signflip; 2668 if (status->float_rounding_mode == float_round_down) { 2669 zSign ^= 1; 2670 } 2671 return packFloat32(zSign, 0, 0); 2672 } 2673 } 2674 --zExp; 2675 /* Normalize to put the explicit bit back into bit 62. */ 2676 shiftcount = countLeadingZeros64(zSig64) - 1; 2677 zSig64 <<= shiftcount; 2678 zExp -= shiftcount; 2679 } 2680 if (flags & float_muladd_halve_result) { 2681 zExp--; 2682 } 2683 2684 shift64RightJamming(zSig64, 32, &zSig64); 2685 return roundAndPackFloat32(zSign, zExp, zSig64, status); 2686 } 2687 2688 2689 /*---------------------------------------------------------------------------- 2690 | Returns the square root of the single-precision floating-point value `a'. 2691 | The operation is performed according to the IEC/IEEE Standard for Binary 2692 | Floating-Point Arithmetic. 2693 *----------------------------------------------------------------------------*/ 2694 2695 float32 float32_sqrt(float32 a, float_status *status) 2696 { 2697 flag aSign; 2698 int aExp, zExp; 2699 uint32_t aSig, zSig; 2700 uint64_t rem, term; 2701 a = float32_squash_input_denormal(a, status); 2702 2703 aSig = extractFloat32Frac( a ); 2704 aExp = extractFloat32Exp( a ); 2705 aSign = extractFloat32Sign( a ); 2706 if ( aExp == 0xFF ) { 2707 if (aSig) { 2708 return propagateFloat32NaN(a, float32_zero, status); 2709 } 2710 if ( ! 
aSign ) return a; 2711 float_raise(float_flag_invalid, status); 2712 return float32_default_nan(status); 2713 } 2714 if ( aSign ) { 2715 if ( ( aExp | aSig ) == 0 ) return a; 2716 float_raise(float_flag_invalid, status); 2717 return float32_default_nan(status); 2718 } 2719 if ( aExp == 0 ) { 2720 if ( aSig == 0 ) return float32_zero; 2721 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2722 } 2723 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2724 aSig = ( aSig | 0x00800000 )<<8; 2725 zSig = estimateSqrt32( aExp, aSig ) + 2; 2726 if ( ( zSig & 0x7F ) <= 5 ) { 2727 if ( zSig < 2 ) { 2728 zSig = 0x7FFFFFFF; 2729 goto roundAndPack; 2730 } 2731 aSig >>= aExp & 1; 2732 term = ( (uint64_t) zSig ) * zSig; 2733 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2734 while ( (int64_t) rem < 0 ) { 2735 --zSig; 2736 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2737 } 2738 zSig |= ( rem != 0 ); 2739 } 2740 shift32RightJamming( zSig, 1, &zSig ); 2741 roundAndPack: 2742 return roundAndPackFloat32(0, zExp, zSig, status); 2743 2744 } 2745 2746 /*---------------------------------------------------------------------------- 2747 | Returns the binary exponential of the single-precision floating-point value 2748 | `a'. The operation is performed according to the IEC/IEEE Standard for 2749 | Binary Floating-Point Arithmetic. 2750 | 2751 | Uses the following identities: 2752 | 2753 | 1. ------------------------------------------------------------------------- 2754 | x x*ln(2) 2755 | 2 = e 2756 | 2757 | 2. ------------------------------------------------------------------------- 2758 | 2 3 4 5 n 2759 | x x x x x x x 2760 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2761 | 1! 2! 3! 4! 5! n! 
2762 *----------------------------------------------------------------------------*/ 2763 2764 static const float64 float32_exp2_coefficients[15] = 2765 { 2766 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2767 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2768 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2769 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2770 const_float64( 0x3f81111111111111ll ), /* 5 */ 2771 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2772 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2773 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2774 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2775 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2776 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2777 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2778 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2779 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2780 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2781 }; 2782 2783 float32 float32_exp2(float32 a, float_status *status) 2784 { 2785 flag aSign; 2786 int aExp; 2787 uint32_t aSig; 2788 float64 r, x, xn; 2789 int i; 2790 a = float32_squash_input_denormal(a, status); 2791 2792 aSig = extractFloat32Frac( a ); 2793 aExp = extractFloat32Exp( a ); 2794 aSign = extractFloat32Sign( a ); 2795 2796 if ( aExp == 0xFF) { 2797 if (aSig) { 2798 return propagateFloat32NaN(a, float32_zero, status); 2799 } 2800 return (aSign) ? 
float32_zero : a; 2801 } 2802 if (aExp == 0) { 2803 if (aSig == 0) return float32_one; 2804 } 2805 2806 float_raise(float_flag_inexact, status); 2807 2808 /* ******************************* */ 2809 /* using float64 for approximation */ 2810 /* ******************************* */ 2811 x = float32_to_float64(a, status); 2812 x = float64_mul(x, float64_ln2, status); 2813 2814 xn = x; 2815 r = float64_one; 2816 for (i = 0 ; i < 15 ; i++) { 2817 float64 f; 2818 2819 f = float64_mul(xn, float32_exp2_coefficients[i], status); 2820 r = float64_add(r, f, status); 2821 2822 xn = float64_mul(xn, x, status); 2823 } 2824 2825 return float64_to_float32(r, status); 2826 } 2827 2828 /*---------------------------------------------------------------------------- 2829 | Returns the binary log of the single-precision floating-point value `a'. 2830 | The operation is performed according to the IEC/IEEE Standard for Binary 2831 | Floating-Point Arithmetic. 2832 *----------------------------------------------------------------------------*/ 2833 float32 float32_log2(float32 a, float_status *status) 2834 { 2835 flag aSign, zSign; 2836 int aExp; 2837 uint32_t aSig, zSig, i; 2838 2839 a = float32_squash_input_denormal(a, status); 2840 aSig = extractFloat32Frac( a ); 2841 aExp = extractFloat32Exp( a ); 2842 aSign = extractFloat32Sign( a ); 2843 2844 if ( aExp == 0 ) { 2845 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2846 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2847 } 2848 if ( aSign ) { 2849 float_raise(float_flag_invalid, status); 2850 return float32_default_nan(status); 2851 } 2852 if ( aExp == 0xFF ) { 2853 if (aSig) { 2854 return propagateFloat32NaN(a, float32_zero, status); 2855 } 2856 return a; 2857 } 2858 2859 aExp -= 0x7F; 2860 aSig |= 0x00800000; 2861 zSign = aExp < 0; 2862 zSig = aExp << 23; 2863 2864 for (i = 1 << 22; i > 0; i >>= 1) { 2865 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2866 if ( aSig & 0x01000000 ) { 2867 aSig >>= 1; 2868 zSig |= i; 2869 } 2870 } 2871 
2872 if ( zSign ) 2873 zSig = -zSig; 2874 2875 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 2876 } 2877 2878 /*---------------------------------------------------------------------------- 2879 | Returns 1 if the single-precision floating-point value `a' is equal to 2880 | the corresponding value `b', and 0 otherwise. The invalid exception is 2881 | raised if either operand is a NaN. Otherwise, the comparison is performed 2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2883 *----------------------------------------------------------------------------*/ 2884 2885 int float32_eq(float32 a, float32 b, float_status *status) 2886 { 2887 uint32_t av, bv; 2888 a = float32_squash_input_denormal(a, status); 2889 b = float32_squash_input_denormal(b, status); 2890 2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2893 ) { 2894 float_raise(float_flag_invalid, status); 2895 return 0; 2896 } 2897 av = float32_val(a); 2898 bv = float32_val(b); 2899 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2900 } 2901 2902 /*---------------------------------------------------------------------------- 2903 | Returns 1 if the single-precision floating-point value `a' is less than 2904 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2905 | exception is raised if either operand is a NaN. The comparison is performed 2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2907 *----------------------------------------------------------------------------*/ 2908 2909 int float32_le(float32 a, float32 b, float_status *status) 2910 { 2911 flag aSign, bSign; 2912 uint32_t av, bv; 2913 a = float32_squash_input_denormal(a, status); 2914 b = float32_squash_input_denormal(b, status); 2915 2916 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2917 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2918 ) { 2919 float_raise(float_flag_invalid, status); 2920 return 0; 2921 } 2922 aSign = extractFloat32Sign( a ); 2923 bSign = extractFloat32Sign( b ); 2924 av = float32_val(a); 2925 bv = float32_val(b); 2926 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2927 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2928 2929 } 2930 2931 /*---------------------------------------------------------------------------- 2932 | Returns 1 if the single-precision floating-point value `a' is less than 2933 | the corresponding value `b', and 0 otherwise. The invalid exception is 2934 | raised if either operand is a NaN. The comparison is performed according 2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2936 *----------------------------------------------------------------------------*/ 2937 2938 int float32_lt(float32 a, float32 b, float_status *status) 2939 { 2940 flag aSign, bSign; 2941 uint32_t av, bv; 2942 a = float32_squash_input_denormal(a, status); 2943 b = float32_squash_input_denormal(b, status); 2944 2945 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2946 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2947 ) { 2948 float_raise(float_flag_invalid, status); 2949 return 0; 2950 } 2951 aSign = extractFloat32Sign( a ); 2952 bSign = extractFloat32Sign( b ); 2953 av = float32_val(a); 2954 bv = float32_val(b); 2955 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2956 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2957 2958 } 2959 2960 /*---------------------------------------------------------------------------- 2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2962 | be compared, and 0 otherwise. The invalid exception is raised if either 2963 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2964 | Standard for Binary Floating-Point Arithmetic. 2965 *----------------------------------------------------------------------------*/ 2966 2967 int float32_unordered(float32 a, float32 b, float_status *status) 2968 { 2969 a = float32_squash_input_denormal(a, status); 2970 b = float32_squash_input_denormal(b, status); 2971 2972 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2973 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2974 ) { 2975 float_raise(float_flag_invalid, status); 2976 return 1; 2977 } 2978 return 0; 2979 } 2980 2981 /*---------------------------------------------------------------------------- 2982 | Returns 1 if the single-precision floating-point value `a' is equal to 2983 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2984 | exception. 
The comparison is performed according to the IEC/IEEE Standard 2985 | for Binary Floating-Point Arithmetic. 2986 *----------------------------------------------------------------------------*/ 2987 2988 int float32_eq_quiet(float32 a, float32 b, float_status *status) 2989 { 2990 a = float32_squash_input_denormal(a, status); 2991 b = float32_squash_input_denormal(b, status); 2992 2993 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2994 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2995 ) { 2996 if (float32_is_signaling_nan(a, status) 2997 || float32_is_signaling_nan(b, status)) { 2998 float_raise(float_flag_invalid, status); 2999 } 3000 return 0; 3001 } 3002 return ( float32_val(a) == float32_val(b) ) || 3003 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3004 } 3005 3006 /*---------------------------------------------------------------------------- 3007 | Returns 1 if the single-precision floating-point value `a' is less than or 3008 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3009 | cause an exception. Otherwise, the comparison is performed according to the 3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3011 *----------------------------------------------------------------------------*/ 3012 3013 int float32_le_quiet(float32 a, float32 b, float_status *status) 3014 { 3015 flag aSign, bSign; 3016 uint32_t av, bv; 3017 a = float32_squash_input_denormal(a, status); 3018 b = float32_squash_input_denormal(b, status); 3019 3020 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3021 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3022 ) { 3023 if (float32_is_signaling_nan(a, status) 3024 || float32_is_signaling_nan(b, status)) { 3025 float_raise(float_flag_invalid, status); 3026 } 3027 return 0; 3028 } 3029 aSign = extractFloat32Sign( a ); 3030 bSign = extractFloat32Sign( b ); 3031 av = float32_val(a); 3032 bv = float32_val(b); 3033 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3034 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3035 3036 } 3037 3038 /*---------------------------------------------------------------------------- 3039 | Returns 1 if the single-precision floating-point value `a' is less than 3040 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3041 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3042 | Standard for Binary Floating-Point Arithmetic. 
3043 *----------------------------------------------------------------------------*/ 3044 3045 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3046 { 3047 flag aSign, bSign; 3048 uint32_t av, bv; 3049 a = float32_squash_input_denormal(a, status); 3050 b = float32_squash_input_denormal(b, status); 3051 3052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3054 ) { 3055 if (float32_is_signaling_nan(a, status) 3056 || float32_is_signaling_nan(b, status)) { 3057 float_raise(float_flag_invalid, status); 3058 } 3059 return 0; 3060 } 3061 aSign = extractFloat32Sign( a ); 3062 bSign = extractFloat32Sign( b ); 3063 av = float32_val(a); 3064 bv = float32_val(b); 3065 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3066 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3067 3068 } 3069 3070 /*---------------------------------------------------------------------------- 3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3072 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3073 | comparison is performed according to the IEC/IEEE Standard for Binary 3074 | Floating-Point Arithmetic. 
3075 *----------------------------------------------------------------------------*/ 3076 3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3078 { 3079 a = float32_squash_input_denormal(a, status); 3080 b = float32_squash_input_denormal(b, status); 3081 3082 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3083 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3084 ) { 3085 if (float32_is_signaling_nan(a, status) 3086 || float32_is_signaling_nan(b, status)) { 3087 float_raise(float_flag_invalid, status); 3088 } 3089 return 1; 3090 } 3091 return 0; 3092 } 3093 3094 /*---------------------------------------------------------------------------- 3095 | Returns the result of converting the double-precision floating-point value 3096 | `a' to the 32-bit two's complement integer format. The conversion is 3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3098 | Arithmetic---which means in particular that the conversion is rounded 3099 | according to the current rounding mode. If `a' is a NaN, the largest 3100 | positive integer is returned. Otherwise, if the conversion overflows, the 3101 | largest integer with the same sign as `a' is returned. 
3102 *----------------------------------------------------------------------------*/ 3103 3104 int32_t float64_to_int32(float64 a, float_status *status) 3105 { 3106 flag aSign; 3107 int aExp; 3108 int shiftCount; 3109 uint64_t aSig; 3110 a = float64_squash_input_denormal(a, status); 3111 3112 aSig = extractFloat64Frac( a ); 3113 aExp = extractFloat64Exp( a ); 3114 aSign = extractFloat64Sign( a ); 3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3116 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3117 shiftCount = 0x42C - aExp; 3118 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3119 return roundAndPackInt32(aSign, aSig, status); 3120 3121 } 3122 3123 /*---------------------------------------------------------------------------- 3124 | Returns the result of converting the double-precision floating-point value 3125 | `a' to the 32-bit two's complement integer format. The conversion is 3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3127 | Arithmetic, except that the conversion is always rounded toward zero. 3128 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3129 | the conversion overflows, the largest integer with the same sign as `a' is 3130 | returned. 
3131 *----------------------------------------------------------------------------*/ 3132 3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3134 { 3135 flag aSign; 3136 int aExp; 3137 int shiftCount; 3138 uint64_t aSig, savedASig; 3139 int32_t z; 3140 a = float64_squash_input_denormal(a, status); 3141 3142 aSig = extractFloat64Frac( a ); 3143 aExp = extractFloat64Exp( a ); 3144 aSign = extractFloat64Sign( a ); 3145 if ( 0x41E < aExp ) { 3146 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3147 goto invalid; 3148 } 3149 else if ( aExp < 0x3FF ) { 3150 if (aExp || aSig) { 3151 status->float_exception_flags |= float_flag_inexact; 3152 } 3153 return 0; 3154 } 3155 aSig |= LIT64( 0x0010000000000000 ); 3156 shiftCount = 0x433 - aExp; 3157 savedASig = aSig; 3158 aSig >>= shiftCount; 3159 z = aSig; 3160 if ( aSign ) z = - z; 3161 if ( ( z < 0 ) ^ aSign ) { 3162 invalid: 3163 float_raise(float_flag_invalid, status); 3164 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3165 } 3166 if ( ( aSig<<shiftCount ) != savedASig ) { 3167 status->float_exception_flags |= float_flag_inexact; 3168 } 3169 return z; 3170 3171 } 3172 3173 /*---------------------------------------------------------------------------- 3174 | Returns the result of converting the double-precision floating-point value 3175 | `a' to the 16-bit two's complement integer format. The conversion is 3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3177 | Arithmetic, except that the conversion is always rounded toward zero. 3178 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3179 | the conversion overflows, the largest integer with the same sign as `a' is 3180 | returned. 
3181 *----------------------------------------------------------------------------*/ 3182 3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3184 { 3185 flag aSign; 3186 int aExp; 3187 int shiftCount; 3188 uint64_t aSig, savedASig; 3189 int32_t z; 3190 3191 aSig = extractFloat64Frac( a ); 3192 aExp = extractFloat64Exp( a ); 3193 aSign = extractFloat64Sign( a ); 3194 if ( 0x40E < aExp ) { 3195 if ( ( aExp == 0x7FF ) && aSig ) { 3196 aSign = 0; 3197 } 3198 goto invalid; 3199 } 3200 else if ( aExp < 0x3FF ) { 3201 if ( aExp || aSig ) { 3202 status->float_exception_flags |= float_flag_inexact; 3203 } 3204 return 0; 3205 } 3206 aSig |= LIT64( 0x0010000000000000 ); 3207 shiftCount = 0x433 - aExp; 3208 savedASig = aSig; 3209 aSig >>= shiftCount; 3210 z = aSig; 3211 if ( aSign ) { 3212 z = - z; 3213 } 3214 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3215 invalid: 3216 float_raise(float_flag_invalid, status); 3217 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3218 } 3219 if ( ( aSig<<shiftCount ) != savedASig ) { 3220 status->float_exception_flags |= float_flag_inexact; 3221 } 3222 return z; 3223 } 3224 3225 /*---------------------------------------------------------------------------- 3226 | Returns the result of converting the double-precision floating-point value 3227 | `a' to the 64-bit two's complement integer format. The conversion is 3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3229 | Arithmetic---which means in particular that the conversion is rounded 3230 | according to the current rounding mode. If `a' is a NaN, the largest 3231 | positive integer is returned. Otherwise, if the conversion overflows, the 3232 | largest integer with the same sign as `a' is returned. 
3233 *----------------------------------------------------------------------------*/ 3234 3235 int64_t float64_to_int64(float64 a, float_status *status) 3236 { 3237 flag aSign; 3238 int aExp; 3239 int shiftCount; 3240 uint64_t aSig, aSigExtra; 3241 a = float64_squash_input_denormal(a, status); 3242 3243 aSig = extractFloat64Frac( a ); 3244 aExp = extractFloat64Exp( a ); 3245 aSign = extractFloat64Sign( a ); 3246 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3247 shiftCount = 0x433 - aExp; 3248 if ( shiftCount <= 0 ) { 3249 if ( 0x43E < aExp ) { 3250 float_raise(float_flag_invalid, status); 3251 if ( ! aSign 3252 || ( ( aExp == 0x7FF ) 3253 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3254 ) { 3255 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3256 } 3257 return (int64_t) LIT64( 0x8000000000000000 ); 3258 } 3259 aSigExtra = 0; 3260 aSig <<= - shiftCount; 3261 } 3262 else { 3263 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3264 } 3265 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3266 3267 } 3268 3269 /*---------------------------------------------------------------------------- 3270 | Returns the result of converting the double-precision floating-point value 3271 | `a' to the 64-bit two's complement integer format. The conversion is 3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3273 | Arithmetic, except that the conversion is always rounded toward zero. 3274 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3275 | the conversion overflows, the largest integer with the same sign as `a' is 3276 | returned. 
3277 *----------------------------------------------------------------------------*/ 3278 3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3280 { 3281 flag aSign; 3282 int aExp; 3283 int shiftCount; 3284 uint64_t aSig; 3285 int64_t z; 3286 a = float64_squash_input_denormal(a, status); 3287 3288 aSig = extractFloat64Frac( a ); 3289 aExp = extractFloat64Exp( a ); 3290 aSign = extractFloat64Sign( a ); 3291 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3292 shiftCount = aExp - 0x433; 3293 if ( 0 <= shiftCount ) { 3294 if ( 0x43E <= aExp ) { 3295 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3296 float_raise(float_flag_invalid, status); 3297 if ( ! aSign 3298 || ( ( aExp == 0x7FF ) 3299 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3300 ) { 3301 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3302 } 3303 } 3304 return (int64_t) LIT64( 0x8000000000000000 ); 3305 } 3306 z = aSig<<shiftCount; 3307 } 3308 else { 3309 if ( aExp < 0x3FE ) { 3310 if (aExp | aSig) { 3311 status->float_exception_flags |= float_flag_inexact; 3312 } 3313 return 0; 3314 } 3315 z = aSig>>( - shiftCount ); 3316 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3317 status->float_exception_flags |= float_flag_inexact; 3318 } 3319 } 3320 if ( aSign ) z = - z; 3321 return z; 3322 3323 } 3324 3325 /*---------------------------------------------------------------------------- 3326 | Returns the result of converting the double-precision floating-point value 3327 | `a' to the single-precision floating-point format. The conversion is 3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3329 | Arithmetic. 
3330 *----------------------------------------------------------------------------*/ 3331 3332 float32 float64_to_float32(float64 a, float_status *status) 3333 { 3334 flag aSign; 3335 int aExp; 3336 uint64_t aSig; 3337 uint32_t zSig; 3338 a = float64_squash_input_denormal(a, status); 3339 3340 aSig = extractFloat64Frac( a ); 3341 aExp = extractFloat64Exp( a ); 3342 aSign = extractFloat64Sign( a ); 3343 if ( aExp == 0x7FF ) { 3344 if (aSig) { 3345 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3346 } 3347 return packFloat32( aSign, 0xFF, 0 ); 3348 } 3349 shift64RightJamming( aSig, 22, &aSig ); 3350 zSig = aSig; 3351 if ( aExp || zSig ) { 3352 zSig |= 0x40000000; 3353 aExp -= 0x381; 3354 } 3355 return roundAndPackFloat32(aSign, aExp, zSig, status); 3356 3357 } 3358 3359 3360 /*---------------------------------------------------------------------------- 3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3362 | half-precision floating-point value, returning the result. After being 3363 | shifted into the proper positions, the three fields are simply added 3364 | together to form the result. This means that any integer portion of `zSig' 3365 | will be added into the exponent. Since a properly normalized significand 3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3367 | than the desired result exponent whenever `zSig' is a complete, normalized 3368 | significand. 
3369 *----------------------------------------------------------------------------*/ 3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3371 { 3372 return make_float16( 3373 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3374 } 3375 3376 /*---------------------------------------------------------------------------- 3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3378 | and significand `zSig', and returns the proper half-precision floating- 3379 | point value corresponding to the abstract input. Ordinarily, the abstract 3380 | value is simply rounded and packed into the half-precision format, with 3381 | the inexact exception raised if the abstract input cannot be represented 3382 | exactly. However, if the abstract value is too large, the overflow and 3383 | inexact exceptions are raised and an infinity or maximal finite value is 3384 | returned. If the abstract value is too small, the input value is rounded to 3385 | a subnormal number, and the underflow and inexact exceptions are raised if 3386 | the abstract input cannot be represented exactly as a subnormal half- 3387 | precision floating-point number. 3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3389 | ARM-style "alternative representation", which omits the NaN and Inf 3390 | encodings in order to raise the maximum representable exponent by one. 3391 | The input significand `zSig' has its binary point between bits 22 3392 | and 23, which is 13 bits to the left of the usual location. This shifted 3393 | significand must be normalized or smaller. If `zSig' is not normalized, 3394 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3395 | and it must not require rounding. In the usual case that `zSig' is 3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3397 | Note the slightly odd position of the binary point in zSig compared with the 3398 | other roundAndPackFloat functions. This should probably be fixed if we 3399 | need to implement more float16 routines than just conversion. 3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3401 | Binary Floating-Point Arithmetic. 3402 *----------------------------------------------------------------------------*/ 3403 3404 static float16 roundAndPackFloat16(flag zSign, int zExp, 3405 uint32_t zSig, flag ieee, 3406 float_status *status) 3407 { 3408 int maxexp = ieee ? 29 : 30; 3409 uint32_t mask; 3410 uint32_t increment; 3411 bool rounding_bumps_exp; 3412 bool is_tiny = false; 3413 3414 /* Calculate the mask of bits of the mantissa which are not 3415 * representable in half-precision and will be lost. 3416 */ 3417 if (zExp < 1) { 3418 /* Will be denormal in halfprec */ 3419 mask = 0x00ffffff; 3420 if (zExp >= -11) { 3421 mask >>= 11 + zExp; 3422 } 3423 } else { 3424 /* Normal number in halfprec */ 3425 mask = 0x00001fff; 3426 } 3427 3428 switch (status->float_rounding_mode) { 3429 case float_round_nearest_even: 3430 increment = (mask + 1) >> 1; 3431 if ((zSig & mask) == increment) { 3432 increment = zSig & (increment << 1); 3433 } 3434 break; 3435 case float_round_ties_away: 3436 increment = (mask + 1) >> 1; 3437 break; 3438 case float_round_up: 3439 increment = zSign ? 0 : mask; 3440 break; 3441 case float_round_down: 3442 increment = zSign ? 
mask : 0; 3443 break; 3444 default: /* round_to_zero */ 3445 increment = 0; 3446 break; 3447 } 3448 3449 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3450 3451 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3452 if (ieee) { 3453 float_raise(float_flag_overflow | float_flag_inexact, status); 3454 return packFloat16(zSign, 0x1f, 0); 3455 } else { 3456 float_raise(float_flag_invalid, status); 3457 return packFloat16(zSign, 0x1f, 0x3ff); 3458 } 3459 } 3460 3461 if (zExp < 0) { 3462 /* Note that flush-to-zero does not affect half-precision results */ 3463 is_tiny = 3464 (status->float_detect_tininess == float_tininess_before_rounding) 3465 || (zExp < -1) 3466 || (!rounding_bumps_exp); 3467 } 3468 if (zSig & mask) { 3469 float_raise(float_flag_inexact, status); 3470 if (is_tiny) { 3471 float_raise(float_flag_underflow, status); 3472 } 3473 } 3474 3475 zSig += increment; 3476 if (rounding_bumps_exp) { 3477 zSig >>= 1; 3478 zExp++; 3479 } 3480 3481 if (zExp < -10) { 3482 return packFloat16(zSign, 0, 0); 3483 } 3484 if (zExp < 0) { 3485 zSig >>= -zExp; 3486 zExp = 0; 3487 } 3488 return packFloat16(zSign, zExp, zSig >> 13); 3489 } 3490 3491 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3492 uint32_t *zSigPtr) 3493 { 3494 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3495 *zSigPtr = aSig << shiftCount; 3496 *zExpPtr = 1 - shiftCount; 3497 } 3498 3499 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3500 The latter gains extra exponent range by omitting the NaN/Inf encodings. 
*/ 3501 3502 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3503 { 3504 flag aSign; 3505 int aExp; 3506 uint32_t aSig; 3507 3508 aSign = extractFloat16Sign(a); 3509 aExp = extractFloat16Exp(a); 3510 aSig = extractFloat16Frac(a); 3511 3512 if (aExp == 0x1f && ieee) { 3513 if (aSig) { 3514 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3515 } 3516 return packFloat32(aSign, 0xff, 0); 3517 } 3518 if (aExp == 0) { 3519 if (aSig == 0) { 3520 return packFloat32(aSign, 0, 0); 3521 } 3522 3523 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3524 aExp--; 3525 } 3526 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3527 } 3528 3529 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3530 { 3531 flag aSign; 3532 int aExp; 3533 uint32_t aSig; 3534 3535 a = float32_squash_input_denormal(a, status); 3536 3537 aSig = extractFloat32Frac( a ); 3538 aExp = extractFloat32Exp( a ); 3539 aSign = extractFloat32Sign( a ); 3540 if ( aExp == 0xFF ) { 3541 if (aSig) { 3542 /* Input is a NaN */ 3543 if (!ieee) { 3544 float_raise(float_flag_invalid, status); 3545 return packFloat16(aSign, 0, 0); 3546 } 3547 return commonNaNToFloat16( 3548 float32ToCommonNaN(a, status), status); 3549 } 3550 /* Infinity */ 3551 if (!ieee) { 3552 float_raise(float_flag_invalid, status); 3553 return packFloat16(aSign, 0x1f, 0x3ff); 3554 } 3555 return packFloat16(aSign, 0x1f, 0); 3556 } 3557 if (aExp == 0 && aSig == 0) { 3558 return packFloat16(aSign, 0, 0); 3559 } 3560 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3561 * even if the input is denormal; however this is harmless because 3562 * the largest possible single-precision denormal is still smaller 3563 * than the smallest representable half-precision denormal, and so we 3564 * will end up ignoring aSig and returning via the "always return zero" 3565 * codepath. 
3566 */ 3567 aSig |= 0x00800000; 3568 aExp -= 0x71; 3569 3570 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3571 } 3572 3573 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3574 { 3575 flag aSign; 3576 int aExp; 3577 uint32_t aSig; 3578 3579 aSign = extractFloat16Sign(a); 3580 aExp = extractFloat16Exp(a); 3581 aSig = extractFloat16Frac(a); 3582 3583 if (aExp == 0x1f && ieee) { 3584 if (aSig) { 3585 return commonNaNToFloat64( 3586 float16ToCommonNaN(a, status), status); 3587 } 3588 return packFloat64(aSign, 0x7ff, 0); 3589 } 3590 if (aExp == 0) { 3591 if (aSig == 0) { 3592 return packFloat64(aSign, 0, 0); 3593 } 3594 3595 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3596 aExp--; 3597 } 3598 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3599 } 3600 3601 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3602 { 3603 flag aSign; 3604 int aExp; 3605 uint64_t aSig; 3606 uint32_t zSig; 3607 3608 a = float64_squash_input_denormal(a, status); 3609 3610 aSig = extractFloat64Frac(a); 3611 aExp = extractFloat64Exp(a); 3612 aSign = extractFloat64Sign(a); 3613 if (aExp == 0x7FF) { 3614 if (aSig) { 3615 /* Input is a NaN */ 3616 if (!ieee) { 3617 float_raise(float_flag_invalid, status); 3618 return packFloat16(aSign, 0, 0); 3619 } 3620 return commonNaNToFloat16( 3621 float64ToCommonNaN(a, status), status); 3622 } 3623 /* Infinity */ 3624 if (!ieee) { 3625 float_raise(float_flag_invalid, status); 3626 return packFloat16(aSign, 0x1f, 0x3ff); 3627 } 3628 return packFloat16(aSign, 0x1f, 0); 3629 } 3630 shift64RightJamming(aSig, 29, &aSig); 3631 zSig = aSig; 3632 if (aExp == 0 && zSig == 0) { 3633 return packFloat16(aSign, 0, 0); 3634 } 3635 /* Decimal point between bits 22 and 23. 
Note that we add the 1 bit 3636 * even if the input is denormal; however this is harmless because 3637 * the largest possible single-precision denormal is still smaller 3638 * than the smallest representable half-precision denormal, and so we 3639 * will end up ignoring aSig and returning via the "always return zero" 3640 * codepath. 3641 */ 3642 zSig |= 0x00800000; 3643 aExp -= 0x3F1; 3644 3645 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3646 } 3647 3648 /*---------------------------------------------------------------------------- 3649 | Returns the result of converting the double-precision floating-point value 3650 | `a' to the extended double-precision floating-point format. The conversion 3651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3652 | Arithmetic. 3653 *----------------------------------------------------------------------------*/ 3654 3655 floatx80 float64_to_floatx80(float64 a, float_status *status) 3656 { 3657 flag aSign; 3658 int aExp; 3659 uint64_t aSig; 3660 3661 a = float64_squash_input_denormal(a, status); 3662 aSig = extractFloat64Frac( a ); 3663 aExp = extractFloat64Exp( a ); 3664 aSign = extractFloat64Sign( a ); 3665 if ( aExp == 0x7FF ) { 3666 if (aSig) { 3667 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3668 } 3669 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3670 } 3671 if ( aExp == 0 ) { 3672 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3673 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3674 } 3675 return 3676 packFloatx80( 3677 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3678 3679 } 3680 3681 /*---------------------------------------------------------------------------- 3682 | Returns the result of converting the double-precision floating-point value 3683 | `a' to the quadruple-precision floating-point format. 
The conversion is 3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3685 | Arithmetic. 3686 *----------------------------------------------------------------------------*/ 3687 3688 float128 float64_to_float128(float64 a, float_status *status) 3689 { 3690 flag aSign; 3691 int aExp; 3692 uint64_t aSig, zSig0, zSig1; 3693 3694 a = float64_squash_input_denormal(a, status); 3695 aSig = extractFloat64Frac( a ); 3696 aExp = extractFloat64Exp( a ); 3697 aSign = extractFloat64Sign( a ); 3698 if ( aExp == 0x7FF ) { 3699 if (aSig) { 3700 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3701 } 3702 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3703 } 3704 if ( aExp == 0 ) { 3705 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3706 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3707 --aExp; 3708 } 3709 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3710 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3711 3712 } 3713 3714 /*---------------------------------------------------------------------------- 3715 | Rounds the double-precision floating-point value `a' to an integer, and 3716 | returns the result as a double-precision floating-point value. The 3717 | operation is performed according to the IEC/IEEE Standard for Binary 3718 | Floating-Point Arithmetic. 
3719 *----------------------------------------------------------------------------*/ 3720 3721 float64 float64_round_to_int(float64 a, float_status *status) 3722 { 3723 flag aSign; 3724 int aExp; 3725 uint64_t lastBitMask, roundBitsMask; 3726 uint64_t z; 3727 a = float64_squash_input_denormal(a, status); 3728 3729 aExp = extractFloat64Exp( a ); 3730 if ( 0x433 <= aExp ) { 3731 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3732 return propagateFloat64NaN(a, a, status); 3733 } 3734 return a; 3735 } 3736 if ( aExp < 0x3FF ) { 3737 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3738 status->float_exception_flags |= float_flag_inexact; 3739 aSign = extractFloat64Sign( a ); 3740 switch (status->float_rounding_mode) { 3741 case float_round_nearest_even: 3742 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3743 return packFloat64( aSign, 0x3FF, 0 ); 3744 } 3745 break; 3746 case float_round_ties_away: 3747 if (aExp == 0x3FE) { 3748 return packFloat64(aSign, 0x3ff, 0); 3749 } 3750 break; 3751 case float_round_down: 3752 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3753 case float_round_up: 3754 return make_float64( 3755 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3756 } 3757 return packFloat64( aSign, 0, 0 ); 3758 } 3759 lastBitMask = 1; 3760 lastBitMask <<= 0x433 - aExp; 3761 roundBitsMask = lastBitMask - 1; 3762 z = float64_val(a); 3763 switch (status->float_rounding_mode) { 3764 case float_round_nearest_even: 3765 z += lastBitMask >> 1; 3766 if ((z & roundBitsMask) == 0) { 3767 z &= ~lastBitMask; 3768 } 3769 break; 3770 case float_round_ties_away: 3771 z += lastBitMask >> 1; 3772 break; 3773 case float_round_to_zero: 3774 break; 3775 case float_round_up: 3776 if (!extractFloat64Sign(make_float64(z))) { 3777 z += roundBitsMask; 3778 } 3779 break; 3780 case float_round_down: 3781 if (extractFloat64Sign(make_float64(z))) { 3782 z += roundBitsMask; 3783 } 3784 break; 3785 default: 3786 abort(); 3787 } 3788 z &= ~ roundBitsMask; 3789 if (z != float64_val(a)) { 3790 status->float_exception_flags |= float_flag_inexact; 3791 } 3792 return make_float64(z); 3793 3794 } 3795 3796 float64 float64_trunc_to_int(float64 a, float_status *status) 3797 { 3798 int oldmode; 3799 float64 res; 3800 oldmode = status->float_rounding_mode; 3801 status->float_rounding_mode = float_round_to_zero; 3802 res = float64_round_to_int(a, status); 3803 status->float_rounding_mode = oldmode; 3804 return res; 3805 } 3806 3807 /*---------------------------------------------------------------------------- 3808 | Returns the result of adding the absolute values of the double-precision 3809 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3810 | before being returned. `zSign' is ignored if the result is a NaN. 3811 | The addition is performed according to the IEC/IEEE Standard for Binary 3812 | Floating-Point Arithmetic. 
3813 *----------------------------------------------------------------------------*/ 3814 3815 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3816 float_status *status) 3817 { 3818 int aExp, bExp, zExp; 3819 uint64_t aSig, bSig, zSig; 3820 int expDiff; 3821 3822 aSig = extractFloat64Frac( a ); 3823 aExp = extractFloat64Exp( a ); 3824 bSig = extractFloat64Frac( b ); 3825 bExp = extractFloat64Exp( b ); 3826 expDiff = aExp - bExp; 3827 aSig <<= 9; 3828 bSig <<= 9; 3829 if ( 0 < expDiff ) { 3830 if ( aExp == 0x7FF ) { 3831 if (aSig) { 3832 return propagateFloat64NaN(a, b, status); 3833 } 3834 return a; 3835 } 3836 if ( bExp == 0 ) { 3837 --expDiff; 3838 } 3839 else { 3840 bSig |= LIT64( 0x2000000000000000 ); 3841 } 3842 shift64RightJamming( bSig, expDiff, &bSig ); 3843 zExp = aExp; 3844 } 3845 else if ( expDiff < 0 ) { 3846 if ( bExp == 0x7FF ) { 3847 if (bSig) { 3848 return propagateFloat64NaN(a, b, status); 3849 } 3850 return packFloat64( zSign, 0x7FF, 0 ); 3851 } 3852 if ( aExp == 0 ) { 3853 ++expDiff; 3854 } 3855 else { 3856 aSig |= LIT64( 0x2000000000000000 ); 3857 } 3858 shift64RightJamming( aSig, - expDiff, &aSig ); 3859 zExp = bExp; 3860 } 3861 else { 3862 if ( aExp == 0x7FF ) { 3863 if (aSig | bSig) { 3864 return propagateFloat64NaN(a, b, status); 3865 } 3866 return a; 3867 } 3868 if ( aExp == 0 ) { 3869 if (status->flush_to_zero) { 3870 if (aSig | bSig) { 3871 float_raise(float_flag_output_denormal, status); 3872 } 3873 return packFloat64(zSign, 0, 0); 3874 } 3875 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3876 } 3877 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3878 zExp = aExp; 3879 goto roundAndPack; 3880 } 3881 aSig |= LIT64( 0x2000000000000000 ); 3882 zSig = ( aSig + bSig )<<1; 3883 --zExp; 3884 if ( (int64_t) zSig < 0 ) { 3885 zSig = aSig + bSig; 3886 ++zExp; 3887 } 3888 roundAndPack: 3889 return roundAndPackFloat64(zSign, zExp, zSig, status); 3890 3891 } 3892 3893 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Shift both fractions up by 10 bits before aligning them. */
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FF ) {
        if (aSig | bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Both fractions are zero with the maximum exponent (inf - inf):
         * invalid operation. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero: use exponent 1 for both so the
         * common tail below sees matching exponents. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* b is infinite: result is infinity with the opposite sign. */
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );
    }
    /* Align a to b; the jamming shift ORs shifted-out bits into the LSB. */
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    /* Align b to a. */
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    /* |a| > |b|: compute a - b; the sign stays zSign. */
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the double-precision floating-point values `a'
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_add(float64 a, float64 b, float_status *status)
{
    flag aSign, bSign;
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    aSign = extractFloat64Sign( a );
    bSign = extractFloat64Sign( b );
    /* Same signs: add the magnitudes.  Different signs: subtract them.
     * Either way a's sign is passed as the tentative result sign. */
    if ( aSign == bSign ) {
        return addFloat64Sigs(a, b, aSign, status);
    }
    else {
        return subFloat64Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the double-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
4002 *----------------------------------------------------------------------------*/ 4003 4004 float64 float64_sub(float64 a, float64 b, float_status *status) 4005 { 4006 flag aSign, bSign; 4007 a = float64_squash_input_denormal(a, status); 4008 b = float64_squash_input_denormal(b, status); 4009 4010 aSign = extractFloat64Sign( a ); 4011 bSign = extractFloat64Sign( b ); 4012 if ( aSign == bSign ) { 4013 return subFloat64Sigs(a, b, aSign, status); 4014 } 4015 else { 4016 return addFloat64Sigs(a, b, aSign, status); 4017 } 4018 4019 } 4020 4021 /*---------------------------------------------------------------------------- 4022 | Returns the result of multiplying the double-precision floating-point values 4023 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4024 | for Binary Floating-Point Arithmetic. 4025 *----------------------------------------------------------------------------*/ 4026 4027 float64 float64_mul(float64 a, float64 b, float_status *status) 4028 { 4029 flag aSign, bSign, zSign; 4030 int aExp, bExp, zExp; 4031 uint64_t aSig, bSig, zSig0, zSig1; 4032 4033 a = float64_squash_input_denormal(a, status); 4034 b = float64_squash_input_denormal(b, status); 4035 4036 aSig = extractFloat64Frac( a ); 4037 aExp = extractFloat64Exp( a ); 4038 aSign = extractFloat64Sign( a ); 4039 bSig = extractFloat64Frac( b ); 4040 bExp = extractFloat64Exp( b ); 4041 bSign = extractFloat64Sign( b ); 4042 zSign = aSign ^ bSign; 4043 if ( aExp == 0x7FF ) { 4044 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4045 return propagateFloat64NaN(a, b, status); 4046 } 4047 if ( ( bExp | bSig ) == 0 ) { 4048 float_raise(float_flag_invalid, status); 4049 return float64_default_nan(status); 4050 } 4051 return packFloat64( zSign, 0x7FF, 0 ); 4052 } 4053 if ( bExp == 0x7FF ) { 4054 if (bSig) { 4055 return propagateFloat64NaN(a, b, status); 4056 } 4057 if ( ( aExp | aSig ) == 0 ) { 4058 float_raise(float_flag_invalid, status); 4059 return float64_default_nan(status); 
4060 } 4061 return packFloat64( zSign, 0x7FF, 0 ); 4062 } 4063 if ( aExp == 0 ) { 4064 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4065 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4066 } 4067 if ( bExp == 0 ) { 4068 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4069 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4070 } 4071 zExp = aExp + bExp - 0x3FF; 4072 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4073 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4074 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4075 zSig0 |= ( zSig1 != 0 ); 4076 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4077 zSig0 <<= 1; 4078 --zExp; 4079 } 4080 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4081 4082 } 4083 4084 /*---------------------------------------------------------------------------- 4085 | Returns the result of dividing the double-precision floating-point value `a' 4086 | by the corresponding value `b'. The operation is performed according to 4087 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float64 float64_div(float64 a, float64 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    uint64_t rem0, rem1;
    uint64_t term0, term1;
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    zSign = aSign ^ bSign;
    /* Dividend is NaN or infinity. */
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        if ( bExp == 0x7FF ) {
            if (bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            /* inf / inf */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        return packFloat64( zSign, 0x7FF, 0 );
    }
    /* Divisor is NaN or infinity (dividend is finite here). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return packFloat64( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Division by zero: 0/0 is invalid, anything else raises
             * divbyzero and returns a signed infinity. */
            if ( ( aExp | aSig ) == 0 ) {
                float_raise(float_flag_invalid, status);
                return float64_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat64( zSign, 0x7FF, 0 );
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FD;
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    /* When 2*aSig >= bSig, halve the dividend and bump the exponent. */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    zSig = estimateDiv128To64( aSig, 0, bSig );
    /* Only when the low 9 bits of the estimate are <= 2 is the exact
     * remainder computed: the quotient is stepped down until the remainder
     * is non-negative, and a non-zero remainder sets the LSB. */
    if ( ( zSig & 0x1FF ) <= 2 ) {
        mul64To128( bSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
        }
        zSig |= ( rem1 != 0 );
    }
    return roundAndPackFloat64(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Remainder of an infinite dividend is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Finite a with infinite b: a is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* a's exponent is more than one below b's: a is returned as is. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce the exponent gap 62 bits per iteration.  Each quotient
     * estimate is lowered by 2 (floored at 0) before use. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step for the remaining expDiff bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Subtract bSig until the remainder goes negative, keeping the last
     * non-negative value in alternateASig. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Choose between the two candidates by the sign of their sum; on an
     * exact tie (sum == 0) an odd q selects the non-negative one. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the double-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    c = float64_squash_input_denormal(c, status);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* infzero: one multiplicand is a zero and the other an infinity. */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
    }

    /* No NaN operand: 0 * inf is an invalid operation. */
    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every non-NaN result below. */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat64
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x0010000000000000ULL) << 10;
            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* Both multiplicands are finite and non-zero here; normalize any with a
     * zero exponent field. */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1, status);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            zExp--;
        }
        /* Collapse to 64 bits; the jamming shift keeps lost bits sticky. */
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig1, status);
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            /* p has the larger exponent: compute p - c. */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c has the larger exponent: compute c - p and flip the sign. */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            /* Equal exponents: subtract the smaller significand from the
             * larger. */
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                /* Non-zero low word becomes a sticky bit. */
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                /* Top bit of zSig1 already set: shift right by one, keeping
                 * the shifted-out bit sticky. */
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig0, status);
    }
}

/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, a, status);
        }
        /* +inf is returned as is; -inf is invalid. */
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned as is; any other negative input is invalid. */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* Start from a 32-bit estimate and refine it with one division step. */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    aSig <<= 9 - ( aExp & 1 );
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* Only when the low 9 bits of the estimate are <= 5 is the exact
     * remainder aSig - zSig^2 computed: zSig is stepped down until the
     * remainder is non-negative, and a non-zero remainder sets the LSB. */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/ 4556 float64 float64_log2(float64 a, float_status *status) 4557 { 4558 flag aSign, zSign; 4559 int aExp; 4560 uint64_t aSig, aSig0, aSig1, zSig, i; 4561 a = float64_squash_input_denormal(a, status); 4562 4563 aSig = extractFloat64Frac( a ); 4564 aExp = extractFloat64Exp( a ); 4565 aSign = extractFloat64Sign( a ); 4566 4567 if ( aExp == 0 ) { 4568 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4569 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4570 } 4571 if ( aSign ) { 4572 float_raise(float_flag_invalid, status); 4573 return float64_default_nan(status); 4574 } 4575 if ( aExp == 0x7FF ) { 4576 if (aSig) { 4577 return propagateFloat64NaN(a, float64_zero, status); 4578 } 4579 return a; 4580 } 4581 4582 aExp -= 0x3FF; 4583 aSig |= LIT64( 0x0010000000000000 ); 4584 zSign = aExp < 0; 4585 zSig = (uint64_t)aExp << 52; 4586 for (i = 1LL << 51; i > 0; i >>= 1) { 4587 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4588 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4589 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4590 aSig >>= 1; 4591 zSig |= i; 4592 } 4593 } 4594 4595 if ( zSign ) 4596 zSig = -zSig; 4597 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4598 } 4599 4600 /*---------------------------------------------------------------------------- 4601 | Returns 1 if the double-precision floating-point value `a' is equal to the 4602 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4603 | if either operand is a NaN. Otherwise, the comparison is performed 4604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4605 *----------------------------------------------------------------------------*/ 4606 4607 int float64_eq(float64 a, float64 b, float_status *status) 4608 { 4609 uint64_t av, bv; 4610 a = float64_squash_input_denormal(a, status); 4611 b = float64_squash_input_denormal(b, status); 4612 4613 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4614 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4615 ) { 4616 float_raise(float_flag_invalid, status); 4617 return 0; 4618 } 4619 av = float64_val(a); 4620 bv = float64_val(b); 4621 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4622 4623 } 4624 4625 /*---------------------------------------------------------------------------- 4626 | Returns 1 if the double-precision floating-point value `a' is less than or 4627 | equal to the corresponding value `b', and 0 otherwise. The invalid 4628 | exception is raised if either operand is a NaN. The comparison is performed 4629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4630 *----------------------------------------------------------------------------*/ 4631 4632 int float64_le(float64 a, float64 b, float_status *status) 4633 { 4634 flag aSign, bSign; 4635 uint64_t av, bv; 4636 a = float64_squash_input_denormal(a, status); 4637 b = float64_squash_input_denormal(b, status); 4638 4639 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4640 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4641 ) { 4642 float_raise(float_flag_invalid, status); 4643 return 0; 4644 } 4645 aSign = extractFloat64Sign( a ); 4646 bSign = extractFloat64Sign( b ); 4647 av = float64_val(a); 4648 bv = float64_val(b); 4649 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4650 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4651 4652 } 4653 4654 /*---------------------------------------------------------------------------- 4655 | Returns 1 if the double-precision floating-point value `a' is less than 4656 | the corresponding value `b', and 0 otherwise. The invalid exception is 4657 | raised if either operand is a NaN. The comparison is performed according 4658 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4659 *----------------------------------------------------------------------------*/ 4660 4661 int float64_lt(float64 a, float64 b, float_status *status) 4662 { 4663 flag aSign, bSign; 4664 uint64_t av, bv; 4665 4666 a = float64_squash_input_denormal(a, status); 4667 b = float64_squash_input_denormal(b, status); 4668 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4669 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4670 ) { 4671 float_raise(float_flag_invalid, status); 4672 return 0; 4673 } 4674 aSign = extractFloat64Sign( a ); 4675 bSign = extractFloat64Sign( b ); 4676 av = float64_val(a); 4677 bv = float64_val(b); 4678 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4679 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4680 4681 } 4682 4683 /*---------------------------------------------------------------------------- 4684 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4685 | be compared, and 0 otherwise. The invalid exception is raised if either 4686 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4687 | Standard for Binary Floating-Point Arithmetic. 4688 *----------------------------------------------------------------------------*/ 4689 4690 int float64_unordered(float64 a, float64 b, float_status *status) 4691 { 4692 a = float64_squash_input_denormal(a, status); 4693 b = float64_squash_input_denormal(b, status); 4694 4695 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4696 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4697 ) { 4698 float_raise(float_flag_invalid, status); 4699 return 1; 4700 } 4701 return 0; 4702 } 4703 4704 /*---------------------------------------------------------------------------- 4705 | Returns 1 if the double-precision floating-point value `a' is equal to the 4706 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4707 | exception.The comparison is performed according to the IEC/IEEE Standard 4708 | for Binary Floating-Point Arithmetic. 4709 *----------------------------------------------------------------------------*/ 4710 4711 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4712 { 4713 uint64_t av, bv; 4714 a = float64_squash_input_denormal(a, status); 4715 b = float64_squash_input_denormal(b, status); 4716 4717 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4718 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4719 ) { 4720 if (float64_is_signaling_nan(a, status) 4721 || float64_is_signaling_nan(b, status)) { 4722 float_raise(float_flag_invalid, status); 4723 } 4724 return 0; 4725 } 4726 av = float64_val(a); 4727 bv = float64_val(b); 4728 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4729 4730 } 4731 4732 /*---------------------------------------------------------------------------- 4733 | Returns 1 if the double-precision floating-point value `a' is less than or 4734 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4735 | cause an exception. Otherwise, the comparison is performed according to the 4736 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4737 *----------------------------------------------------------------------------*/ 4738 4739 int float64_le_quiet(float64 a, float64 b, float_status *status) 4740 { 4741 flag aSign, bSign; 4742 uint64_t av, bv; 4743 a = float64_squash_input_denormal(a, status); 4744 b = float64_squash_input_denormal(b, status); 4745 4746 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4747 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4748 ) { 4749 if (float64_is_signaling_nan(a, status) 4750 || float64_is_signaling_nan(b, status)) { 4751 float_raise(float_flag_invalid, status); 4752 } 4753 return 0; 4754 } 4755 aSign = extractFloat64Sign( a ); 4756 bSign = extractFloat64Sign( b ); 4757 av = float64_val(a); 4758 bv = float64_val(b); 4759 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4760 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4761 4762 } 4763 4764 /*---------------------------------------------------------------------------- 4765 | Returns 1 if the double-precision floating-point value `a' is less than 4766 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4767 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4768 | Standard for Binary Floating-Point Arithmetic. 
4769 *----------------------------------------------------------------------------*/ 4770 4771 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4772 { 4773 flag aSign, bSign; 4774 uint64_t av, bv; 4775 a = float64_squash_input_denormal(a, status); 4776 b = float64_squash_input_denormal(b, status); 4777 4778 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4779 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4780 ) { 4781 if (float64_is_signaling_nan(a, status) 4782 || float64_is_signaling_nan(b, status)) { 4783 float_raise(float_flag_invalid, status); 4784 } 4785 return 0; 4786 } 4787 aSign = extractFloat64Sign( a ); 4788 bSign = extractFloat64Sign( b ); 4789 av = float64_val(a); 4790 bv = float64_val(b); 4791 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4792 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4793 4794 } 4795 4796 /*---------------------------------------------------------------------------- 4797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4798 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4799 | comparison is performed according to the IEC/IEEE Standard for Binary 4800 | Floating-Point Arithmetic. 
4801 *----------------------------------------------------------------------------*/ 4802 4803 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4804 { 4805 a = float64_squash_input_denormal(a, status); 4806 b = float64_squash_input_denormal(b, status); 4807 4808 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4809 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4810 ) { 4811 if (float64_is_signaling_nan(a, status) 4812 || float64_is_signaling_nan(b, status)) { 4813 float_raise(float_flag_invalid, status); 4814 } 4815 return 1; 4816 } 4817 return 0; 4818 } 4819 4820 /*---------------------------------------------------------------------------- 4821 | Returns the result of converting the extended double-precision floating- 4822 | point value `a' to the 32-bit two's complement integer format. The 4823 | conversion is performed according to the IEC/IEEE Standard for Binary 4824 | Floating-Point Arithmetic---which means in particular that the conversion 4825 | is rounded according to the current rounding mode. If `a' is a NaN, the 4826 | largest positive integer is returned. Otherwise, if the conversion 4827 | overflows, the largest integer with the same sign as `a' is returned. 
4828 *----------------------------------------------------------------------------*/ 4829 4830 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4831 { 4832 flag aSign; 4833 int32_t aExp, shiftCount; 4834 uint64_t aSig; 4835 4836 if (floatx80_invalid_encoding(a)) { 4837 float_raise(float_flag_invalid, status); 4838 return 1 << 31; 4839 } 4840 aSig = extractFloatx80Frac( a ); 4841 aExp = extractFloatx80Exp( a ); 4842 aSign = extractFloatx80Sign( a ); 4843 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4844 shiftCount = 0x4037 - aExp; 4845 if ( shiftCount <= 0 ) shiftCount = 1; 4846 shift64RightJamming( aSig, shiftCount, &aSig ); 4847 return roundAndPackInt32(aSign, aSig, status); 4848 4849 } 4850 4851 /*---------------------------------------------------------------------------- 4852 | Returns the result of converting the extended double-precision floating- 4853 | point value `a' to the 32-bit two's complement integer format. The 4854 | conversion is performed according to the IEC/IEEE Standard for Binary 4855 | Floating-Point Arithmetic, except that the conversion is always rounded 4856 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4857 | Otherwise, if the conversion overflows, the largest integer with the same 4858 | sign as `a' is returned. 
4859 *----------------------------------------------------------------------------*/ 4860 4861 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4862 { 4863 flag aSign; 4864 int32_t aExp, shiftCount; 4865 uint64_t aSig, savedASig; 4866 int32_t z; 4867 4868 if (floatx80_invalid_encoding(a)) { 4869 float_raise(float_flag_invalid, status); 4870 return 1 << 31; 4871 } 4872 aSig = extractFloatx80Frac( a ); 4873 aExp = extractFloatx80Exp( a ); 4874 aSign = extractFloatx80Sign( a ); 4875 if ( 0x401E < aExp ) { 4876 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4877 goto invalid; 4878 } 4879 else if ( aExp < 0x3FFF ) { 4880 if (aExp || aSig) { 4881 status->float_exception_flags |= float_flag_inexact; 4882 } 4883 return 0; 4884 } 4885 shiftCount = 0x403E - aExp; 4886 savedASig = aSig; 4887 aSig >>= shiftCount; 4888 z = aSig; 4889 if ( aSign ) z = - z; 4890 if ( ( z < 0 ) ^ aSign ) { 4891 invalid: 4892 float_raise(float_flag_invalid, status); 4893 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4894 } 4895 if ( ( aSig<<shiftCount ) != savedASig ) { 4896 status->float_exception_flags |= float_flag_inexact; 4897 } 4898 return z; 4899 4900 } 4901 4902 /*---------------------------------------------------------------------------- 4903 | Returns the result of converting the extended double-precision floating- 4904 | point value `a' to the 64-bit two's complement integer format. The 4905 | conversion is performed according to the IEC/IEEE Standard for Binary 4906 | Floating-Point Arithmetic---which means in particular that the conversion 4907 | is rounded according to the current rounding mode. If `a' is a NaN, 4908 | the largest positive integer is returned. Otherwise, if the conversion 4909 | overflows, the largest integer with the same sign as `a' is returned. 
4910 *----------------------------------------------------------------------------*/ 4911 4912 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4913 { 4914 flag aSign; 4915 int32_t aExp, shiftCount; 4916 uint64_t aSig, aSigExtra; 4917 4918 if (floatx80_invalid_encoding(a)) { 4919 float_raise(float_flag_invalid, status); 4920 return 1ULL << 63; 4921 } 4922 aSig = extractFloatx80Frac( a ); 4923 aExp = extractFloatx80Exp( a ); 4924 aSign = extractFloatx80Sign( a ); 4925 shiftCount = 0x403E - aExp; 4926 if ( shiftCount <= 0 ) { 4927 if ( shiftCount ) { 4928 float_raise(float_flag_invalid, status); 4929 if ( ! aSign 4930 || ( ( aExp == 0x7FFF ) 4931 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4932 ) { 4933 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4934 } 4935 return (int64_t) LIT64( 0x8000000000000000 ); 4936 } 4937 aSigExtra = 0; 4938 } 4939 else { 4940 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4941 } 4942 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4943 4944 } 4945 4946 /*---------------------------------------------------------------------------- 4947 | Returns the result of converting the extended double-precision floating- 4948 | point value `a' to the 64-bit two's complement integer format. The 4949 | conversion is performed according to the IEC/IEEE Standard for Binary 4950 | Floating-Point Arithmetic, except that the conversion is always rounded 4951 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4952 | Otherwise, if the conversion overflows, the largest integer with the same 4953 | sign as `a' is returned. 
4954 *----------------------------------------------------------------------------*/ 4955 4956 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4957 { 4958 flag aSign; 4959 int32_t aExp, shiftCount; 4960 uint64_t aSig; 4961 int64_t z; 4962 4963 if (floatx80_invalid_encoding(a)) { 4964 float_raise(float_flag_invalid, status); 4965 return 1ULL << 63; 4966 } 4967 aSig = extractFloatx80Frac( a ); 4968 aExp = extractFloatx80Exp( a ); 4969 aSign = extractFloatx80Sign( a ); 4970 shiftCount = aExp - 0x403E; 4971 if ( 0 <= shiftCount ) { 4972 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4973 if ( ( a.high != 0xC03E ) || aSig ) { 4974 float_raise(float_flag_invalid, status); 4975 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4976 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4977 } 4978 } 4979 return (int64_t) LIT64( 0x8000000000000000 ); 4980 } 4981 else if ( aExp < 0x3FFF ) { 4982 if (aExp | aSig) { 4983 status->float_exception_flags |= float_flag_inexact; 4984 } 4985 return 0; 4986 } 4987 z = aSig>>( - shiftCount ); 4988 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4989 status->float_exception_flags |= float_flag_inexact; 4990 } 4991 if ( aSign ) z = - z; 4992 return z; 4993 4994 } 4995 4996 /*---------------------------------------------------------------------------- 4997 | Returns the result of converting the extended double-precision floating- 4998 | point value `a' to the single-precision floating-point format. The 4999 | conversion is performed according to the IEC/IEEE Standard for Binary 5000 | Floating-Point Arithmetic. 
5001 *----------------------------------------------------------------------------*/ 5002 5003 float32 floatx80_to_float32(floatx80 a, float_status *status) 5004 { 5005 flag aSign; 5006 int32_t aExp; 5007 uint64_t aSig; 5008 5009 if (floatx80_invalid_encoding(a)) { 5010 float_raise(float_flag_invalid, status); 5011 return float32_default_nan(status); 5012 } 5013 aSig = extractFloatx80Frac( a ); 5014 aExp = extractFloatx80Exp( a ); 5015 aSign = extractFloatx80Sign( a ); 5016 if ( aExp == 0x7FFF ) { 5017 if ( (uint64_t) ( aSig<<1 ) ) { 5018 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5019 } 5020 return packFloat32( aSign, 0xFF, 0 ); 5021 } 5022 shift64RightJamming( aSig, 33, &aSig ); 5023 if ( aExp || aSig ) aExp -= 0x3F81; 5024 return roundAndPackFloat32(aSign, aExp, aSig, status); 5025 5026 } 5027 5028 /*---------------------------------------------------------------------------- 5029 | Returns the result of converting the extended double-precision floating- 5030 | point value `a' to the double-precision floating-point format. The 5031 | conversion is performed according to the IEC/IEEE Standard for Binary 5032 | Floating-Point Arithmetic. 
5033 *----------------------------------------------------------------------------*/ 5034 5035 float64 floatx80_to_float64(floatx80 a, float_status *status) 5036 { 5037 flag aSign; 5038 int32_t aExp; 5039 uint64_t aSig, zSig; 5040 5041 if (floatx80_invalid_encoding(a)) { 5042 float_raise(float_flag_invalid, status); 5043 return float64_default_nan(status); 5044 } 5045 aSig = extractFloatx80Frac( a ); 5046 aExp = extractFloatx80Exp( a ); 5047 aSign = extractFloatx80Sign( a ); 5048 if ( aExp == 0x7FFF ) { 5049 if ( (uint64_t) ( aSig<<1 ) ) { 5050 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5051 } 5052 return packFloat64( aSign, 0x7FF, 0 ); 5053 } 5054 shift64RightJamming( aSig, 1, &zSig ); 5055 if ( aExp || aSig ) aExp -= 0x3C01; 5056 return roundAndPackFloat64(aSign, aExp, zSig, status); 5057 5058 } 5059 5060 /*---------------------------------------------------------------------------- 5061 | Returns the result of converting the extended double-precision floating- 5062 | point value `a' to the quadruple-precision floating-point format. The 5063 | conversion is performed according to the IEC/IEEE Standard for Binary 5064 | Floating-Point Arithmetic. 
5065 *----------------------------------------------------------------------------*/ 5066 5067 float128 floatx80_to_float128(floatx80 a, float_status *status) 5068 { 5069 flag aSign; 5070 int aExp; 5071 uint64_t aSig, zSig0, zSig1; 5072 5073 if (floatx80_invalid_encoding(a)) { 5074 float_raise(float_flag_invalid, status); 5075 return float128_default_nan(status); 5076 } 5077 aSig = extractFloatx80Frac( a ); 5078 aExp = extractFloatx80Exp( a ); 5079 aSign = extractFloatx80Sign( a ); 5080 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5081 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5082 } 5083 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5084 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5085 5086 } 5087 5088 /*---------------------------------------------------------------------------- 5089 | Rounds the extended double-precision floating-point value `a' to an integer, 5090 | and returns the result as an extended quadruple-precision floating-point 5091 | value. The operation is performed according to the IEC/IEEE Standard for 5092 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    /* Exponent at or above 0x403E: `a' is returned unchanged, except that a
     * NaN (exponent 0x7FFF with non-zero bits below the top significand
     * bit) goes through NaN propagation. */
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    /* Exponent below 0x3FFF: the result is either a signed zero or a value
     * packed with exponent 0x3FFF and only the top significand bit set,
     * chosen by the rounding mode. */
    if ( aExp < 0x3FFF ) {
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
            /* Returned as is, without raising inexact. */
            return a;
        }
        status->float_exception_flags |= float_flag_inexact;
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
         case float_round_nearest_even:
            /* Exponent 0x3FFE with extra low bits set rounds away from
             * zero; everything else falls through to the signed zero. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
            }
            break;
        case float_round_ties_away:
            /* Any value with exponent 0x3FFE rounds away from zero. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
            }
            break;
         case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
                : packFloatx80( 0, 0, 0 );
         case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
        }
        /* Remaining modes, and the cases that broke out above. */
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: lastBitMask selects the lowest significand bit that is
     * kept, roundBitsMask covers all bits below it. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        /* Add half of the last kept bit; if the discarded bits are then all
         * zero, clear the last kept bit as well. */
        z.low += lastBitMask>>1;
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        /* Only non-negative values are bumped before truncation. */
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        /* Only negative values are bumped before truncation. */
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    /* Discard everything below the last kept bit. */
    z.low &= ~ roundBitsMask;
    /* A zero significand here means the additions above wrapped around:
     * bump the exponent and restore the top significand bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = LIT64( 0x8000000000000000 );
    }
    if (z.low != a.low) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* A zero exponent in `b' shortens the alignment shift by one. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* `b' is an infinity: the result is an infinity signed zSign. */
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            /* Both exponents are zero: normalize the sum and round. */
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit still set: use the sum as is.  Otherwise fall through and
     * shift it right by one, forcing the top bit back on. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= LIT64( 0x8000000000000000 );
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the
| extended double-precision floating-point values `a' and `b'.  If `zSign'
| is 1, the difference is negated before being returned.  `zSign' is ignored
| if the result is a NaN.  The subtraction is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Infinity minus infinity: invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact zero result: negative only when rounding down. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* `b' is an infinity: the result is an infinity of opposite sign. */
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    /* A zero exponent in `a' shortens the alignment shift by one. */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b, keeping zSign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of adding the extended double-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Same signs add magnitudes; different signs subtract them. */
    if ( aSign == bSign ) {
        return addFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return subFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Same signs subtract magnitudes; different signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN or an infinity. */
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Infinity times zero is invalid. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( bExp == 0x7FFF ) {
        /* `b' is a NaN or an infinity. */
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    /* Zero operands give a signed zero; other zero-exponent operands are
     * normalized before multiplying. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* If the top bit of the product is clear, shift left by one and
     * compensate in the exponent. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN or an infinity. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Infinity divided by infinity is invalid. */
            goto invalid;
        }
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Finite divided by infinity: signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Zero divisor: 0/0 is invalid, anything else raises
             * divide-by-zero and returns a signed infinity. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* When the dividend significand is not smaller than the divisor's,
     * halve it (128-bit shift) and compensate in the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then decrement until the remainder is
     * no longer negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word: the estimate is only corrected exactly when its
     * value shifted left by one is at most 8 (presumably the only case
     * where the estimate's error can affect rounding -- TODO confirm).
     * In that case any non-zero remainder is recorded in its lowest bit. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Remainder of an infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Finite `a' with infinite `b': the remainder is `a' itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    bSig |= LIT64( 0x8000000000000000 );
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* `a' has the smaller exponent.  More than one step apart it is
         * returned as is; exactly one step apart its significand is halved
         * so the exponents line up. */
        if ( expDiff < -1 ) return a;
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Reduce 62 bits at a time.  The quotient estimate is lowered by 2
     * (clamped at 0) before use, presumably so the partial remainder
     * cannot go negative -- TODO confirm against estimateDiv128To64(). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step covering the remaining expDiff bits, followed
         * by an exact correction loop. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    /* Compare the remainder with (divisor - remainder).  Take the
     * alternate, with the sign flipped, when it is smaller, or on a tie
     * when the quotient is odd. */
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
              && ( q & 1 ) )
       ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
        zSign = ! zSign;
    }
    /* Rounding precision is fixed at 80 here rather than read from status. */
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Invalid encodings raise the invalid flag and yield the default NaN. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* Positive infinity is returned as is; negative is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent (bias 0x3FFF). */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Start from a 32-bit root estimate, then refine it to 64 bits using a
     * 128-by-64 division estimate. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Decrement the estimate until a - zSig0^2 is no longer negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low result word: corrected exactly only when its low 62 bits are at
     * most 5 (presumably the only case where the estimate's error can
     * affect rounding -- TODO confirm).  In that case any non-zero
     * remainder is recorded in the lowest bit. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Assemble the final 128-bit significand from the two words. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise.
The invalid exception is 5691 | raised if either operand is a NaN. Otherwise, the comparison is performed 5692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5693 *----------------------------------------------------------------------------*/ 5694 5695 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5696 { 5697 5698 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5699 || (extractFloatx80Exp(a) == 0x7FFF 5700 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5701 || (extractFloatx80Exp(b) == 0x7FFF 5702 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5703 ) { 5704 float_raise(float_flag_invalid, status); 5705 return 0; 5706 } 5707 return 5708 ( a.low == b.low ) 5709 && ( ( a.high == b.high ) 5710 || ( ( a.low == 0 ) 5711 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5712 ); 5713 5714 } 5715 5716 /*---------------------------------------------------------------------------- 5717 | Returns 1 if the extended double-precision floating-point value `a' is 5718 | less than or equal to the corresponding value `b', and 0 otherwise. The 5719 | invalid exception is raised if either operand is a NaN. The comparison is 5720 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5721 | Arithmetic. 
5722 *----------------------------------------------------------------------------*/ 5723 5724 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5725 { 5726 flag aSign, bSign; 5727 5728 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5729 || (extractFloatx80Exp(a) == 0x7FFF 5730 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5731 || (extractFloatx80Exp(b) == 0x7FFF 5732 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5733 ) { 5734 float_raise(float_flag_invalid, status); 5735 return 0; 5736 } 5737 aSign = extractFloatx80Sign( a ); 5738 bSign = extractFloatx80Sign( b ); 5739 if ( aSign != bSign ) { 5740 return 5741 aSign 5742 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5743 == 0 ); 5744 } 5745 return 5746 aSign ? le128( b.high, b.low, a.high, a.low ) 5747 : le128( a.high, a.low, b.high, b.low ); 5748 5749 } 5750 5751 /*---------------------------------------------------------------------------- 5752 | Returns 1 if the extended double-precision floating-point value `a' is 5753 | less than the corresponding value `b', and 0 otherwise. The invalid 5754 | exception is raised if either operand is a NaN. The comparison is performed 5755 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5756 *----------------------------------------------------------------------------*/ 5757 5758 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5759 { 5760 flag aSign, bSign; 5761 5762 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5763 || (extractFloatx80Exp(a) == 0x7FFF 5764 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5765 || (extractFloatx80Exp(b) == 0x7FFF 5766 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5767 ) { 5768 float_raise(float_flag_invalid, status); 5769 return 0; 5770 } 5771 aSign = extractFloatx80Sign( a ); 5772 bSign = extractFloatx80Sign( b ); 5773 if ( aSign != bSign ) { 5774 return 5775 aSign 5776 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5777 != 0 ); 5778 } 5779 return 5780 aSign ? lt128( b.high, b.low, a.high, a.low ) 5781 : lt128( a.high, a.low, b.high, b.low ); 5782 5783 } 5784 5785 /*---------------------------------------------------------------------------- 5786 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5787 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5788 | either operand is a NaN. The comparison is performed according to the 5789 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5790 *----------------------------------------------------------------------------*/ 5791 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5792 { 5793 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5794 || (extractFloatx80Exp(a) == 0x7FFF 5795 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5796 || (extractFloatx80Exp(b) == 0x7FFF 5797 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5798 ) { 5799 float_raise(float_flag_invalid, status); 5800 return 1; 5801 } 5802 return 0; 5803 } 5804 5805 /*---------------------------------------------------------------------------- 5806 | Returns 1 if the extended double-precision floating-point value `a' is 5807 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5808 | cause an exception. The comparison is performed according to the IEC/IEEE 5809 | Standard for Binary Floating-Point Arithmetic. 5810 *----------------------------------------------------------------------------*/ 5811 5812 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5813 { 5814 5815 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5816 float_raise(float_flag_invalid, status); 5817 return 0; 5818 } 5819 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5820 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5821 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5822 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5823 ) { 5824 if (floatx80_is_signaling_nan(a, status) 5825 || floatx80_is_signaling_nan(b, status)) { 5826 float_raise(float_flag_invalid, status); 5827 } 5828 return 0; 5829 } 5830 return 5831 ( a.low == b.low ) 5832 && ( ( a.high == b.high ) 5833 || ( ( a.low == 0 ) 5834 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5835 ); 5836 5837 } 5838 5839 /*---------------------------------------------------------------------------- 5840 | Returns 1 if the extended double-precision floating-point value `a' is less 5841 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs 5842 | do not cause an exception. Otherwise, the comparison is performed according 5843 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5844 *----------------------------------------------------------------------------*/ 5845 5846 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5847 { 5848 flag aSign, bSign; 5849 5850 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5851 float_raise(float_flag_invalid, status); 5852 return 0; 5853 } 5854 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5855 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5856 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5857 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5858 ) { 5859 if (floatx80_is_signaling_nan(a, status) 5860 || floatx80_is_signaling_nan(b, status)) { 5861 float_raise(float_flag_invalid, status); 5862 } 5863 return 0; 5864 } 5865 aSign = extractFloatx80Sign( a ); 5866 bSign = extractFloatx80Sign( b ); 5867 if ( aSign != bSign ) { 5868 return 5869 aSign 5870 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5871 == 0 ); 5872 } 5873 return 5874 aSign ? le128( b.high, b.low, a.high, a.low ) 5875 : le128( a.high, a.low, b.high, b.low ); 5876 5877 } 5878 5879 /*---------------------------------------------------------------------------- 5880 | Returns 1 if the extended double-precision floating-point value `a' is less 5881 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5882 | an exception. Otherwise, the comparison is performed according to the 5883 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5884 *----------------------------------------------------------------------------*/ 5885 5886 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5887 { 5888 flag aSign, bSign; 5889 5890 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5891 float_raise(float_flag_invalid, status); 5892 return 0; 5893 } 5894 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5895 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5896 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5897 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5898 ) { 5899 if (floatx80_is_signaling_nan(a, status) 5900 || floatx80_is_signaling_nan(b, status)) { 5901 float_raise(float_flag_invalid, status); 5902 } 5903 return 0; 5904 } 5905 aSign = extractFloatx80Sign( a ); 5906 bSign = extractFloatx80Sign( b ); 5907 if ( aSign != bSign ) { 5908 return 5909 aSign 5910 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5911 != 0 ); 5912 } 5913 return 5914 aSign ? lt128( b.high, b.low, a.high, a.low ) 5915 : lt128( a.high, a.low, b.high, b.low ); 5916 5917 } 5918 5919 /*---------------------------------------------------------------------------- 5920 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5921 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5922 | The comparison is performed according to the IEC/IEEE Standard for Binary 5923 | Floating-Point Arithmetic. 
5924 *----------------------------------------------------------------------------*/ 5925 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5926 { 5927 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5928 float_raise(float_flag_invalid, status); 5929 return 1; 5930 } 5931 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5932 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5933 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5934 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5935 ) { 5936 if (floatx80_is_signaling_nan(a, status) 5937 || floatx80_is_signaling_nan(b, status)) { 5938 float_raise(float_flag_invalid, status); 5939 } 5940 return 1; 5941 } 5942 return 0; 5943 } 5944 5945 /*---------------------------------------------------------------------------- 5946 | Returns the result of converting the quadruple-precision floating-point 5947 | value `a' to the 32-bit two's complement integer format. The conversion 5948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5949 | Arithmetic---which means in particular that the conversion is rounded 5950 | according to the current rounding mode. If `a' is a NaN, the largest 5951 | positive integer is returned. Otherwise, if the conversion overflows, the 5952 | largest integer with the same sign as `a' is returned. 
5953 *----------------------------------------------------------------------------*/ 5954 5955 int32_t float128_to_int32(float128 a, float_status *status) 5956 { 5957 flag aSign; 5958 int32_t aExp, shiftCount; 5959 uint64_t aSig0, aSig1; 5960 5961 aSig1 = extractFloat128Frac1( a ); 5962 aSig0 = extractFloat128Frac0( a ); 5963 aExp = extractFloat128Exp( a ); 5964 aSign = extractFloat128Sign( a ); 5965 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5966 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5967 aSig0 |= ( aSig1 != 0 ); 5968 shiftCount = 0x4028 - aExp; 5969 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5970 return roundAndPackInt32(aSign, aSig0, status); 5971 5972 } 5973 5974 /*---------------------------------------------------------------------------- 5975 | Returns the result of converting the quadruple-precision floating-point 5976 | value `a' to the 32-bit two's complement integer format. The conversion 5977 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5978 | Arithmetic, except that the conversion is always rounded toward zero. If 5979 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5980 | conversion overflows, the largest integer with the same sign as `a' is 5981 | returned. 
5982 *----------------------------------------------------------------------------*/ 5983 5984 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5985 { 5986 flag aSign; 5987 int32_t aExp, shiftCount; 5988 uint64_t aSig0, aSig1, savedASig; 5989 int32_t z; 5990 5991 aSig1 = extractFloat128Frac1( a ); 5992 aSig0 = extractFloat128Frac0( a ); 5993 aExp = extractFloat128Exp( a ); 5994 aSign = extractFloat128Sign( a ); 5995 aSig0 |= ( aSig1 != 0 ); 5996 if ( 0x401E < aExp ) { 5997 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5998 goto invalid; 5999 } 6000 else if ( aExp < 0x3FFF ) { 6001 if (aExp || aSig0) { 6002 status->float_exception_flags |= float_flag_inexact; 6003 } 6004 return 0; 6005 } 6006 aSig0 |= LIT64( 0x0001000000000000 ); 6007 shiftCount = 0x402F - aExp; 6008 savedASig = aSig0; 6009 aSig0 >>= shiftCount; 6010 z = aSig0; 6011 if ( aSign ) z = - z; 6012 if ( ( z < 0 ) ^ aSign ) { 6013 invalid: 6014 float_raise(float_flag_invalid, status); 6015 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6016 } 6017 if ( ( aSig0<<shiftCount ) != savedASig ) { 6018 status->float_exception_flags |= float_flag_inexact; 6019 } 6020 return z; 6021 6022 } 6023 6024 /*---------------------------------------------------------------------------- 6025 | Returns the result of converting the quadruple-precision floating-point 6026 | value `a' to the 64-bit two's complement integer format. The conversion 6027 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6028 | Arithmetic---which means in particular that the conversion is rounded 6029 | according to the current rounding mode. If `a' is a NaN, the largest 6030 | positive integer is returned. Otherwise, if the conversion overflows, the 6031 | largest integer with the same sign as `a' is returned. 
6032 *----------------------------------------------------------------------------*/ 6033 6034 int64_t float128_to_int64(float128 a, float_status *status) 6035 { 6036 flag aSign; 6037 int32_t aExp, shiftCount; 6038 uint64_t aSig0, aSig1; 6039 6040 aSig1 = extractFloat128Frac1( a ); 6041 aSig0 = extractFloat128Frac0( a ); 6042 aExp = extractFloat128Exp( a ); 6043 aSign = extractFloat128Sign( a ); 6044 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6045 shiftCount = 0x402F - aExp; 6046 if ( shiftCount <= 0 ) { 6047 if ( 0x403E < aExp ) { 6048 float_raise(float_flag_invalid, status); 6049 if ( ! aSign 6050 || ( ( aExp == 0x7FFF ) 6051 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6052 ) 6053 ) { 6054 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6055 } 6056 return (int64_t) LIT64( 0x8000000000000000 ); 6057 } 6058 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6059 } 6060 else { 6061 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6062 } 6063 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6064 6065 } 6066 6067 /*---------------------------------------------------------------------------- 6068 | Returns the result of converting the quadruple-precision floating-point 6069 | value `a' to the 64-bit two's complement integer format. The conversion 6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6071 | Arithmetic, except that the conversion is always rounded toward zero. 6072 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6073 | the conversion overflows, the largest integer with the same sign as `a' is 6074 | returned. 
6075 *----------------------------------------------------------------------------*/ 6076 6077 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6078 { 6079 flag aSign; 6080 int32_t aExp, shiftCount; 6081 uint64_t aSig0, aSig1; 6082 int64_t z; 6083 6084 aSig1 = extractFloat128Frac1( a ); 6085 aSig0 = extractFloat128Frac0( a ); 6086 aExp = extractFloat128Exp( a ); 6087 aSign = extractFloat128Sign( a ); 6088 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6089 shiftCount = aExp - 0x402F; 6090 if ( 0 < shiftCount ) { 6091 if ( 0x403E <= aExp ) { 6092 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6093 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6094 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6095 if (aSig1) { 6096 status->float_exception_flags |= float_flag_inexact; 6097 } 6098 } 6099 else { 6100 float_raise(float_flag_invalid, status); 6101 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6102 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6103 } 6104 } 6105 return (int64_t) LIT64( 0x8000000000000000 ); 6106 } 6107 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6108 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6109 status->float_exception_flags |= float_flag_inexact; 6110 } 6111 } 6112 else { 6113 if ( aExp < 0x3FFF ) { 6114 if ( aExp | aSig0 | aSig1 ) { 6115 status->float_exception_flags |= float_flag_inexact; 6116 } 6117 return 0; 6118 } 6119 z = aSig0>>( - shiftCount ); 6120 if ( aSig1 6121 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6122 status->float_exception_flags |= float_flag_inexact; 6123 } 6124 } 6125 if ( aSign ) z = - z; 6126 return z; 6127 6128 } 6129 6130 /*---------------------------------------------------------------------------- 6131 | Returns the result of converting the quadruple-precision floating-point value 6132 | `a' to the 64-bit unsigned integer format. 
The conversion is 6133 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6134 | Arithmetic---which means in particular that the conversion is rounded 6135 | according to the current rounding mode. If `a' is a NaN, the largest 6136 | positive integer is returned. If the conversion overflows, the 6137 | largest unsigned integer is returned. If 'a' is negative, the value is 6138 | rounded and zero is returned; negative values that do not round to zero 6139 | will raise the inexact exception. 6140 *----------------------------------------------------------------------------*/ 6141 6142 uint64_t float128_to_uint64(float128 a, float_status *status) 6143 { 6144 flag aSign; 6145 int aExp; 6146 int shiftCount; 6147 uint64_t aSig0, aSig1; 6148 6149 aSig0 = extractFloat128Frac0(a); 6150 aSig1 = extractFloat128Frac1(a); 6151 aExp = extractFloat128Exp(a); 6152 aSign = extractFloat128Sign(a); 6153 if (aSign && (aExp > 0x3FFE)) { 6154 float_raise(float_flag_invalid, status); 6155 if (float128_is_any_nan(a)) { 6156 return LIT64(0xFFFFFFFFFFFFFFFF); 6157 } else { 6158 return 0; 6159 } 6160 } 6161 if (aExp) { 6162 aSig0 |= LIT64(0x0001000000000000); 6163 } 6164 shiftCount = 0x402F - aExp; 6165 if (shiftCount <= 0) { 6166 if (0x403E < aExp) { 6167 float_raise(float_flag_invalid, status); 6168 return LIT64(0xFFFFFFFFFFFFFFFF); 6169 } 6170 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6171 } else { 6172 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6173 } 6174 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6175 } 6176 6177 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6178 { 6179 uint64_t v; 6180 signed char current_rounding_mode = status->float_rounding_mode; 6181 6182 set_float_rounding_mode(float_round_to_zero, status); 6183 v = float128_to_uint64(a, status); 6184 set_float_rounding_mode(current_rounding_mode, status); 6185 6186 return v; 6187 } 6188 6189 
/*---------------------------------------------------------------------------- 6190 | Returns the result of converting the quadruple-precision floating-point 6191 | value `a' to the 32-bit unsigned integer format. The conversion 6192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6193 | Arithmetic except that the conversion is always rounded toward zero. 6194 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6195 | if the conversion overflows, the largest unsigned integer is returned. 6196 | If 'a' is negative, the value is rounded and zero is returned; negative 6197 | values that do not round to zero will raise the inexact exception. 6198 *----------------------------------------------------------------------------*/ 6199 6200 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6201 { 6202 uint64_t v; 6203 uint32_t res; 6204 int old_exc_flags = get_float_exception_flags(status); 6205 6206 v = float128_to_uint64_round_to_zero(a, status); 6207 if (v > 0xffffffff) { 6208 res = 0xffffffff; 6209 } else { 6210 return v; 6211 } 6212 set_float_exception_flags(old_exc_flags, status); 6213 float_raise(float_flag_invalid, status); 6214 return res; 6215 } 6216 6217 /*---------------------------------------------------------------------------- 6218 | Returns the result of converting the quadruple-precision floating-point 6219 | value `a' to the single-precision floating-point format. The conversion 6220 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6221 | Arithmetic. 
6222 *----------------------------------------------------------------------------*/ 6223 6224 float32 float128_to_float32(float128 a, float_status *status) 6225 { 6226 flag aSign; 6227 int32_t aExp; 6228 uint64_t aSig0, aSig1; 6229 uint32_t zSig; 6230 6231 aSig1 = extractFloat128Frac1( a ); 6232 aSig0 = extractFloat128Frac0( a ); 6233 aExp = extractFloat128Exp( a ); 6234 aSign = extractFloat128Sign( a ); 6235 if ( aExp == 0x7FFF ) { 6236 if ( aSig0 | aSig1 ) { 6237 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6238 } 6239 return packFloat32( aSign, 0xFF, 0 ); 6240 } 6241 aSig0 |= ( aSig1 != 0 ); 6242 shift64RightJamming( aSig0, 18, &aSig0 ); 6243 zSig = aSig0; 6244 if ( aExp || zSig ) { 6245 zSig |= 0x40000000; 6246 aExp -= 0x3F81; 6247 } 6248 return roundAndPackFloat32(aSign, aExp, zSig, status); 6249 6250 } 6251 6252 /*---------------------------------------------------------------------------- 6253 | Returns the result of converting the quadruple-precision floating-point 6254 | value `a' to the double-precision floating-point format. The conversion 6255 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6256 | Arithmetic. 
6257 *----------------------------------------------------------------------------*/ 6258 6259 float64 float128_to_float64(float128 a, float_status *status) 6260 { 6261 flag aSign; 6262 int32_t aExp; 6263 uint64_t aSig0, aSig1; 6264 6265 aSig1 = extractFloat128Frac1( a ); 6266 aSig0 = extractFloat128Frac0( a ); 6267 aExp = extractFloat128Exp( a ); 6268 aSign = extractFloat128Sign( a ); 6269 if ( aExp == 0x7FFF ) { 6270 if ( aSig0 | aSig1 ) { 6271 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6272 } 6273 return packFloat64( aSign, 0x7FF, 0 ); 6274 } 6275 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6276 aSig0 |= ( aSig1 != 0 ); 6277 if ( aExp || aSig0 ) { 6278 aSig0 |= LIT64( 0x4000000000000000 ); 6279 aExp -= 0x3C01; 6280 } 6281 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6282 6283 } 6284 6285 /*---------------------------------------------------------------------------- 6286 | Returns the result of converting the quadruple-precision floating-point 6287 | value `a' to the extended double-precision floating-point format. The 6288 | conversion is performed according to the IEC/IEEE Standard for Binary 6289 | Floating-Point Arithmetic. 
6290 *----------------------------------------------------------------------------*/ 6291 6292 floatx80 float128_to_floatx80(float128 a, float_status *status) 6293 { 6294 flag aSign; 6295 int32_t aExp; 6296 uint64_t aSig0, aSig1; 6297 6298 aSig1 = extractFloat128Frac1( a ); 6299 aSig0 = extractFloat128Frac0( a ); 6300 aExp = extractFloat128Exp( a ); 6301 aSign = extractFloat128Sign( a ); 6302 if ( aExp == 0x7FFF ) { 6303 if ( aSig0 | aSig1 ) { 6304 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6305 } 6306 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6307 } 6308 if ( aExp == 0 ) { 6309 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6310 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6311 } 6312 else { 6313 aSig0 |= LIT64( 0x0001000000000000 ); 6314 } 6315 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6316 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6317 6318 } 6319 6320 /*---------------------------------------------------------------------------- 6321 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6322 | returns the result as a quadruple-precision floating-point value. The 6323 | operation is performed according to the IEC/IEEE Standard for Binary 6324 | Floating-Point Arithmetic. 
6325 *----------------------------------------------------------------------------*/ 6326 6327 float128 float128_round_to_int(float128 a, float_status *status) 6328 { 6329 flag aSign; 6330 int32_t aExp; 6331 uint64_t lastBitMask, roundBitsMask; 6332 float128 z; 6333 6334 aExp = extractFloat128Exp( a ); 6335 if ( 0x402F <= aExp ) { 6336 if ( 0x406F <= aExp ) { 6337 if ( ( aExp == 0x7FFF ) 6338 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6339 ) { 6340 return propagateFloat128NaN(a, a, status); 6341 } 6342 return a; 6343 } 6344 lastBitMask = 1; 6345 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6346 roundBitsMask = lastBitMask - 1; 6347 z = a; 6348 switch (status->float_rounding_mode) { 6349 case float_round_nearest_even: 6350 if ( lastBitMask ) { 6351 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6352 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6353 } 6354 else { 6355 if ( (int64_t) z.low < 0 ) { 6356 ++z.high; 6357 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6358 } 6359 } 6360 break; 6361 case float_round_ties_away: 6362 if (lastBitMask) { 6363 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6364 } else { 6365 if ((int64_t) z.low < 0) { 6366 ++z.high; 6367 } 6368 } 6369 break; 6370 case float_round_to_zero: 6371 break; 6372 case float_round_up: 6373 if (!extractFloat128Sign(z)) { 6374 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6375 } 6376 break; 6377 case float_round_down: 6378 if (extractFloat128Sign(z)) { 6379 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6380 } 6381 break; 6382 default: 6383 abort(); 6384 } 6385 z.low &= ~ roundBitsMask; 6386 } 6387 else { 6388 if ( aExp < 0x3FFF ) { 6389 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6390 status->float_exception_flags |= float_flag_inexact; 6391 aSign = extractFloat128Sign( a ); 6392 switch (status->float_rounding_mode) { 6393 case float_round_nearest_even: 6394 if ( ( aExp == 0x3FFE ) 6395 && ( 
extractFloat128Frac0( a ) 6396 | extractFloat128Frac1( a ) ) 6397 ) { 6398 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6399 } 6400 break; 6401 case float_round_ties_away: 6402 if (aExp == 0x3FFE) { 6403 return packFloat128(aSign, 0x3FFF, 0, 0); 6404 } 6405 break; 6406 case float_round_down: 6407 return 6408 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6409 : packFloat128( 0, 0, 0, 0 ); 6410 case float_round_up: 6411 return 6412 aSign ? packFloat128( 1, 0, 0, 0 ) 6413 : packFloat128( 0, 0x3FFF, 0, 0 ); 6414 } 6415 return packFloat128( aSign, 0, 0, 0 ); 6416 } 6417 lastBitMask = 1; 6418 lastBitMask <<= 0x402F - aExp; 6419 roundBitsMask = lastBitMask - 1; 6420 z.low = 0; 6421 z.high = a.high; 6422 switch (status->float_rounding_mode) { 6423 case float_round_nearest_even: 6424 z.high += lastBitMask>>1; 6425 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6426 z.high &= ~ lastBitMask; 6427 } 6428 break; 6429 case float_round_ties_away: 6430 z.high += lastBitMask>>1; 6431 break; 6432 case float_round_to_zero: 6433 break; 6434 case float_round_up: 6435 if (!extractFloat128Sign(z)) { 6436 z.high |= ( a.low != 0 ); 6437 z.high += roundBitsMask; 6438 } 6439 break; 6440 case float_round_down: 6441 if (extractFloat128Sign(z)) { 6442 z.high |= (a.low != 0); 6443 z.high += roundBitsMask; 6444 } 6445 break; 6446 default: 6447 abort(); 6448 } 6449 z.high &= ~ roundBitsMask; 6450 } 6451 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6452 status->float_exception_flags |= float_flag_inexact; 6453 } 6454 return z; 6455 6456 } 6457 6458 /*---------------------------------------------------------------------------- 6459 | Returns the result of adding the absolute values of the quadruple-precision 6460 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6461 | before being returned. `zSign' is ignored if the result is a NaN. 6462 | The addition is performed according to the IEC/IEEE Standard for Binary 6463 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    /* Pull the two fraction words and the exponent field out of each operand.
     * The operands' own sign bits are never read here; only zSign is used. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: b is shifted right to line up with a. */
        if ( aExp == 0x7FFF ) {
            /* Maximum exponent: nonzero fraction goes through NaN
             * propagation, otherwise a is returned unchanged. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Exponent field 0: no leading bit is OR-ed in and the shift
             * distance is reduced by one instead. */
            --expDiff;
        }
        else {
            /* NOTE(review): presumably the implicit leading one of a
             * normal number -- confirm against extractFloat128Frac0. */
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Third output word (zSig2) is later handed to the rounding
         * routine together with the sum. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* Mirror image: b has the larger exponent, a is shifted right. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Result is built from zSign rather than returning b itself. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are 0: the raw fraction sum is packed
             * directly with exponent field 0, unless flush_to_zero is set,
             * in which case a signed zero is returned and a nonzero sum
             * raises float_flag_output_denormal. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Neither operand had its 0x0001... bit set in this branch, so the
         * combined contribution is OR-ed in at the next bit up. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: the larger operand's fraction is still in aSig
     * only when a was larger; in both cases bit 48 is OR-ed into aSig0
     * here (a no-op if it was already set above) before adding. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past bit 48: round with the decremented exponent. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up 14 bits before any alignment; the final
     * call compensates by passing zExp - 14. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Both have maximum exponent and zero fraction: invalid operation,
         * the default NaN is returned. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Exponent field 0 is treated as exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes word by word to decide the subtraction order. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero whose sign bit is set only when the current
     * rounding mode is float_round_down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b dominates, so the result carries the opposite of zSign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Same leading-bit position as 0x0001... after the 14-bit shift. */
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Undo the initial 14-bit left shift through the exponent argument. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6641 *----------------------------------------------------------------------------*/ 6642 6643 float128 float128_add(float128 a, float128 b, float_status *status) 6644 { 6645 flag aSign, bSign; 6646 6647 aSign = extractFloat128Sign( a ); 6648 bSign = extractFloat128Sign( b ); 6649 if ( aSign == bSign ) { 6650 return addFloat128Sigs(a, b, aSign, status); 6651 } 6652 else { 6653 return subFloat128Sigs(a, b, aSign, status); 6654 } 6655 6656 } 6657 6658 /*---------------------------------------------------------------------------- 6659 | Returns the result of subtracting the quadruple-precision floating-point 6660 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6661 | Standard for Binary Floating-Point Arithmetic. 6662 *----------------------------------------------------------------------------*/ 6663 6664 float128 float128_sub(float128 a, float128 b, float_status *status) 6665 { 6666 flag aSign, bSign; 6667 6668 aSign = extractFloat128Sign( a ); 6669 bSign = extractFloat128Sign( b ); 6670 if ( aSign == bSign ) { 6671 return subFloat128Sigs(a, b, aSign, status); 6672 } 6673 else { 6674 return addFloat128Sigs(a, b, aSign, status); 6675 } 6676 6677 } 6678 6679 /*---------------------------------------------------------------------------- 6680 | Returns the result of multiplying the quadruple-precision floating-point 6681 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6682 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The product is negative exactly when the operand signs differ. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a has the maximum exponent.  If a has a nonzero fraction, or b
         * has maximum exponent with a nonzero fraction, propagate a NaN. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a has zero fraction here; multiplying it by an all-zero b is
         * the invalid case. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            /* Also reached by the goto above. */
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* All-zero a gives a signed zero; otherwise normalize first. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    /* Only a gets the 0x0001... bit; b is shifted left 16 without one. */
    aSig0 |= LIT64( 0x0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* NOTE(review): adding aSig into the upper 128 bits looks like the
     * contribution of b's omitted leading bit -- confirm. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Collapse the lowest product word into a single sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* Product reached bit 49: shift right once and bump the exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Both operands have maximum exponent and zero fraction. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Divisor has maximum exponent, zero fraction: signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is all zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                /* Zero dividend as well, or reached via the goto above. */
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* OR in the 0x0001... bit and move both significands up 15 bits. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Keep the dividend below the divisor; pay with the exponent. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64 quotient bits: estimate, then step the estimate down while
     * the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Next 64 quotient bits, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        /* Only when the low 14 bits of the estimate are <= 4 is the
         * estimate corrected exactly and a sticky bit derived from the
         * final remainder. */
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* b's sign bit is never extracted: it does not affect the result. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a has maximum exponent and zero fraction, b is not a NaN. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b has maximum exponent and zero fraction: a is returned as is. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            /* All-zero b, or reached via the goto above. */
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* a's exponent is at least two below b's: a is its own remainder. */
    if ( expDiff < -1 ) return a;
    /* a is shifted one bit less than b when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* q starts as 1 if b fits into a once, else 0. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    while ( 0 < expDiff ) {
        /* Reduce 61 exponent steps per pass.  The quotient estimate is
         * lowered by 4 (floored at 0) before use.
         * NOTE(review): presumably so the partial remainder never goes
         * negative -- confirm against estimateDiv128To64's error bound. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step covering the remaining exponent difference. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Keep subtracting b until the remainder goes negative, remembering
     * the last value before each subtraction. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* The sign of (negative remainder + last non-negative remainder) tells
     * which of the two is smaller in magnitude; an exact tie is broken by
     * the low bit of q. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Nonzero fraction: NaN propagation with a as both operands. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* Zero fraction: positive input is returned, negative is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative input with all other bits zero is returned unchanged;
         * any other negative input is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias it. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Seed from a 32-bit estimate, then widen it with one division. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift distance depends on the parity of the exponent. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step zSig0 down while it is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 result bits, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        /* Only when the low 13 bits of the estimate are <= 5 is the
         * estimate corrected exactly and a sticky bit derived from the
         * final remainder. */
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    /* The result sign is always passed as 0 on this path. */
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Either operand has maximum exponent with a nonzero fraction: raise
     * invalid and report "not equal". */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the bit patterns match, or when the low words are zero
     * and the high words are zero once the top (sign) bit is shifted out,
     * so zeros of either sign compare equal. */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7036 *----------------------------------------------------------------------------*/ 7037 7038 int float128_le(float128 a, float128 b, float_status *status) 7039 { 7040 flag aSign, bSign; 7041 7042 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7043 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7044 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7045 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7046 ) { 7047 float_raise(float_flag_invalid, status); 7048 return 0; 7049 } 7050 aSign = extractFloat128Sign( a ); 7051 bSign = extractFloat128Sign( b ); 7052 if ( aSign != bSign ) { 7053 return 7054 aSign 7055 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7056 == 0 ); 7057 } 7058 return 7059 aSign ? le128( b.high, b.low, a.high, a.low ) 7060 : le128( a.high, a.low, b.high, b.low ); 7061 7062 } 7063 7064 /*---------------------------------------------------------------------------- 7065 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7066 | the corresponding value `b', and 0 otherwise. The invalid exception is 7067 | raised if either operand is a NaN. The comparison is performed according 7068 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7069 *----------------------------------------------------------------------------*/ 7070 7071 int float128_lt(float128 a, float128 b, float_status *status) 7072 { 7073 flag aSign, bSign; 7074 7075 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7076 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7077 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7078 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7079 ) { 7080 float_raise(float_flag_invalid, status); 7081 return 0; 7082 } 7083 aSign = extractFloat128Sign( a ); 7084 bSign = extractFloat128Sign( b ); 7085 if ( aSign != bSign ) { 7086 return 7087 aSign 7088 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7089 != 0 ); 7090 } 7091 return 7092 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7093 : lt128( a.high, a.low, b.high, b.low ); 7094 7095 } 7096 7097 /*---------------------------------------------------------------------------- 7098 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7099 | be compared, and 0 otherwise. The invalid exception is raised if either 7100 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7101 | Standard for Binary Floating-Point Arithmetic. 7102 *----------------------------------------------------------------------------*/ 7103 7104 int float128_unordered(float128 a, float128 b, float_status *status) 7105 { 7106 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7107 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7108 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7109 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7110 ) { 7111 float_raise(float_flag_invalid, status); 7112 return 1; 7113 } 7114 return 0; 7115 } 7116 7117 /*---------------------------------------------------------------------------- 7118 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7119 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7120 | exception. The comparison is performed according to the IEC/IEEE Standard 7121 | for Binary Floating-Point Arithmetic. 
7122 *----------------------------------------------------------------------------*/ 7123 7124 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7125 { 7126 7127 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7128 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7129 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7130 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7131 ) { 7132 if (float128_is_signaling_nan(a, status) 7133 || float128_is_signaling_nan(b, status)) { 7134 float_raise(float_flag_invalid, status); 7135 } 7136 return 0; 7137 } 7138 return 7139 ( a.low == b.low ) 7140 && ( ( a.high == b.high ) 7141 || ( ( a.low == 0 ) 7142 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7143 ); 7144 7145 } 7146 7147 /*---------------------------------------------------------------------------- 7148 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7149 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7150 | cause an exception. Otherwise, the comparison is performed according to the 7151 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7152 *----------------------------------------------------------------------------*/ 7153 7154 int float128_le_quiet(float128 a, float128 b, float_status *status) 7155 { 7156 flag aSign, bSign; 7157 7158 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7159 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7160 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7161 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7162 ) { 7163 if (float128_is_signaling_nan(a, status) 7164 || float128_is_signaling_nan(b, status)) { 7165 float_raise(float_flag_invalid, status); 7166 } 7167 return 0; 7168 } 7169 aSign = extractFloat128Sign( a ); 7170 bSign = extractFloat128Sign( b ); 7171 if ( aSign != bSign ) { 7172 return 7173 aSign 7174 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7175 == 0 ); 7176 } 7177 return 7178 aSign ? le128( b.high, b.low, a.high, a.low ) 7179 : le128( a.high, a.low, b.high, b.low ); 7180 7181 } 7182 7183 /*---------------------------------------------------------------------------- 7184 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7185 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7186 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7187 | Standard for Binary Floating-Point Arithmetic. 
7188 *----------------------------------------------------------------------------*/ 7189 7190 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7191 { 7192 flag aSign, bSign; 7193 7194 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7195 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7196 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7197 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7198 ) { 7199 if (float128_is_signaling_nan(a, status) 7200 || float128_is_signaling_nan(b, status)) { 7201 float_raise(float_flag_invalid, status); 7202 } 7203 return 0; 7204 } 7205 aSign = extractFloat128Sign( a ); 7206 bSign = extractFloat128Sign( b ); 7207 if ( aSign != bSign ) { 7208 return 7209 aSign 7210 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7211 != 0 ); 7212 } 7213 return 7214 aSign ? lt128( b.high, b.low, a.high, a.low ) 7215 : lt128( a.high, a.low, b.high, b.low ); 7216 7217 } 7218 7219 /*---------------------------------------------------------------------------- 7220 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7221 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7222 | comparison is performed according to the IEC/IEEE Standard for Binary 7223 | Floating-Point Arithmetic. 
7224 *----------------------------------------------------------------------------*/ 7225 7226 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7227 { 7228 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7229 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7230 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7231 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7232 ) { 7233 if (float128_is_signaling_nan(a, status) 7234 || float128_is_signaling_nan(b, status)) { 7235 float_raise(float_flag_invalid, status); 7236 } 7237 return 1; 7238 } 7239 return 0; 7240 } 7241 7242 /* misc functions */ 7243 float32 uint32_to_float32(uint32_t a, float_status *status) 7244 { 7245 return int64_to_float32(a, status); 7246 } 7247 7248 float64 uint32_to_float64(uint32_t a, float_status *status) 7249 { 7250 return int64_to_float64(a, status); 7251 } 7252 7253 uint32_t float32_to_uint32(float32 a, float_status *status) 7254 { 7255 int64_t v; 7256 uint32_t res; 7257 int old_exc_flags = get_float_exception_flags(status); 7258 7259 v = float32_to_int64(a, status); 7260 if (v < 0) { 7261 res = 0; 7262 } else if (v > 0xffffffff) { 7263 res = 0xffffffff; 7264 } else { 7265 return v; 7266 } 7267 set_float_exception_flags(old_exc_flags, status); 7268 float_raise(float_flag_invalid, status); 7269 return res; 7270 } 7271 7272 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7273 { 7274 int64_t v; 7275 uint32_t res; 7276 int old_exc_flags = get_float_exception_flags(status); 7277 7278 v = float32_to_int64_round_to_zero(a, status); 7279 if (v < 0) { 7280 res = 0; 7281 } else if (v > 0xffffffff) { 7282 res = 0xffffffff; 7283 } else { 7284 return v; 7285 } 7286 set_float_exception_flags(old_exc_flags, status); 7287 float_raise(float_flag_invalid, status); 7288 return res; 7289 } 7290 7291 int16_t float32_to_int16(float32 a, float_status *status) 7292 { 7293 int32_t v; 7294 int16_t res; 7295 int old_exc_flags = 
get_float_exception_flags(status); 7296 7297 v = float32_to_int32(a, status); 7298 if (v < -0x8000) { 7299 res = -0x8000; 7300 } else if (v > 0x7fff) { 7301 res = 0x7fff; 7302 } else { 7303 return v; 7304 } 7305 7306 set_float_exception_flags(old_exc_flags, status); 7307 float_raise(float_flag_invalid, status); 7308 return res; 7309 } 7310 7311 uint16_t float32_to_uint16(float32 a, float_status *status) 7312 { 7313 int32_t v; 7314 uint16_t res; 7315 int old_exc_flags = get_float_exception_flags(status); 7316 7317 v = float32_to_int32(a, status); 7318 if (v < 0) { 7319 res = 0; 7320 } else if (v > 0xffff) { 7321 res = 0xffff; 7322 } else { 7323 return v; 7324 } 7325 7326 set_float_exception_flags(old_exc_flags, status); 7327 float_raise(float_flag_invalid, status); 7328 return res; 7329 } 7330 7331 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7332 { 7333 int64_t v; 7334 uint16_t res; 7335 int old_exc_flags = get_float_exception_flags(status); 7336 7337 v = float32_to_int64_round_to_zero(a, status); 7338 if (v < 0) { 7339 res = 0; 7340 } else if (v > 0xffff) { 7341 res = 0xffff; 7342 } else { 7343 return v; 7344 } 7345 set_float_exception_flags(old_exc_flags, status); 7346 float_raise(float_flag_invalid, status); 7347 return res; 7348 } 7349 7350 uint32_t float64_to_uint32(float64 a, float_status *status) 7351 { 7352 uint64_t v; 7353 uint32_t res; 7354 int old_exc_flags = get_float_exception_flags(status); 7355 7356 v = float64_to_uint64(a, status); 7357 if (v > 0xffffffff) { 7358 res = 0xffffffff; 7359 } else { 7360 return v; 7361 } 7362 set_float_exception_flags(old_exc_flags, status); 7363 float_raise(float_flag_invalid, status); 7364 return res; 7365 } 7366 7367 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7368 { 7369 uint64_t v; 7370 uint32_t res; 7371 int old_exc_flags = get_float_exception_flags(status); 7372 7373 v = float64_to_uint64_round_to_zero(a, status); 7374 if (v > 0xffffffff) { 7375 res = 
0xffffffff; 7376 } else { 7377 return v; 7378 } 7379 set_float_exception_flags(old_exc_flags, status); 7380 float_raise(float_flag_invalid, status); 7381 return res; 7382 } 7383 7384 int16_t float64_to_int16(float64 a, float_status *status) 7385 { 7386 int64_t v; 7387 int16_t res; 7388 int old_exc_flags = get_float_exception_flags(status); 7389 7390 v = float64_to_int32(a, status); 7391 if (v < -0x8000) { 7392 res = -0x8000; 7393 } else if (v > 0x7fff) { 7394 res = 0x7fff; 7395 } else { 7396 return v; 7397 } 7398 7399 set_float_exception_flags(old_exc_flags, status); 7400 float_raise(float_flag_invalid, status); 7401 return res; 7402 } 7403 7404 uint16_t float64_to_uint16(float64 a, float_status *status) 7405 { 7406 int64_t v; 7407 uint16_t res; 7408 int old_exc_flags = get_float_exception_flags(status); 7409 7410 v = float64_to_int32(a, status); 7411 if (v < 0) { 7412 res = 0; 7413 } else if (v > 0xffff) { 7414 res = 0xffff; 7415 } else { 7416 return v; 7417 } 7418 7419 set_float_exception_flags(old_exc_flags, status); 7420 float_raise(float_flag_invalid, status); 7421 return res; 7422 } 7423 7424 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7425 { 7426 int64_t v; 7427 uint16_t res; 7428 int old_exc_flags = get_float_exception_flags(status); 7429 7430 v = float64_to_int64_round_to_zero(a, status); 7431 if (v < 0) { 7432 res = 0; 7433 } else if (v > 0xffff) { 7434 res = 0xffff; 7435 } else { 7436 return v; 7437 } 7438 set_float_exception_flags(old_exc_flags, status); 7439 float_raise(float_flag_invalid, status); 7440 return res; 7441 } 7442 7443 /*---------------------------------------------------------------------------- 7444 | Returns the result of converting the double-precision floating-point value 7445 | `a' to the 64-bit unsigned integer format. 
The conversion is 7446 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7447 | Arithmetic---which means in particular that the conversion is rounded 7448 | according to the current rounding mode. If `a' is a NaN, the largest 7449 | positive integer is returned. If the conversion overflows, the 7450 | largest unsigned integer is returned. If 'a' is negative, the value is 7451 | rounded and zero is returned; negative values that do not round to zero 7452 | will raise the inexact exception. 7453 *----------------------------------------------------------------------------*/ 7454 7455 uint64_t float64_to_uint64(float64 a, float_status *status) 7456 { 7457 flag aSign; 7458 int aExp; 7459 int shiftCount; 7460 uint64_t aSig, aSigExtra; 7461 a = float64_squash_input_denormal(a, status); 7462 7463 aSig = extractFloat64Frac(a); 7464 aExp = extractFloat64Exp(a); 7465 aSign = extractFloat64Sign(a); 7466 if (aSign && (aExp > 1022)) { 7467 float_raise(float_flag_invalid, status); 7468 if (float64_is_any_nan(a)) { 7469 return LIT64(0xFFFFFFFFFFFFFFFF); 7470 } else { 7471 return 0; 7472 } 7473 } 7474 if (aExp) { 7475 aSig |= LIT64(0x0010000000000000); 7476 } 7477 shiftCount = 0x433 - aExp; 7478 if (shiftCount <= 0) { 7479 if (0x43E < aExp) { 7480 float_raise(float_flag_invalid, status); 7481 return LIT64(0xFFFFFFFFFFFFFFFF); 7482 } 7483 aSigExtra = 0; 7484 aSig <<= -shiftCount; 7485 } else { 7486 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7487 } 7488 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7489 } 7490 7491 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7492 { 7493 signed char current_rounding_mode = status->float_rounding_mode; 7494 set_float_rounding_mode(float_round_to_zero, status); 7495 int64_t v = float64_to_uint64(a, status); 7496 set_float_rounding_mode(current_rounding_mode, status); 7497 return v; 7498 } 7499 7500 #define COMPARE(s, nan_exp) \ 7501 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7502 int is_quiet, float_status *status) \ 7503 { \ 7504 flag aSign, bSign; \ 7505 uint ## s ## _t av, bv; \ 7506 a = float ## s ## _squash_input_denormal(a, status); \ 7507 b = float ## s ## _squash_input_denormal(b, status); \ 7508 \ 7509 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7510 extractFloat ## s ## Frac( a ) ) || \ 7511 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7512 extractFloat ## s ## Frac( b ) )) { \ 7513 if (!is_quiet || \ 7514 float ## s ## _is_signaling_nan(a, status) || \ 7515 float ## s ## _is_signaling_nan(b, status)) { \ 7516 float_raise(float_flag_invalid, status); \ 7517 } \ 7518 return float_relation_unordered; \ 7519 } \ 7520 aSign = extractFloat ## s ## Sign( a ); \ 7521 bSign = extractFloat ## s ## Sign( b ); \ 7522 av = float ## s ## _val(a); \ 7523 bv = float ## s ## _val(b); \ 7524 if ( aSign != bSign ) { \ 7525 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7526 /* zero case */ \ 7527 return float_relation_equal; \ 7528 } else { \ 7529 return 1 - (2 * aSign); \ 7530 } \ 7531 } else { \ 7532 if (av == bv) { \ 7533 return float_relation_equal; \ 7534 } else { \ 7535 return 1 - 2 * (aSign ^ ( av < bv )); \ 7536 } \ 7537 } \ 7538 } \ 7539 \ 7540 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7541 { \ 7542 return float ## s ## _compare_internal(a, b, 0, status); \ 7543 } \ 7544 \ 7545 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7546 float_status *status) \ 7547 { \ 7548 return float ## s ## _compare_internal(a, b, 1, status); \ 7549 } 7550 7551 COMPARE(32, 0xff) 7552 COMPARE(64, 0x7ff) 7553 7554 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7555 int is_quiet, float_status *status) 7556 { 7557 flag aSign, bSign; 7558 7559 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7560 float_raise(float_flag_invalid, status); 7561 return float_relation_unordered; 7562 } 7563 if (( ( 
extractFloatx80Exp( a ) == 0x7fff ) && 7564 ( extractFloatx80Frac( a )<<1 ) ) || 7565 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7566 ( extractFloatx80Frac( b )<<1 ) )) { 7567 if (!is_quiet || 7568 floatx80_is_signaling_nan(a, status) || 7569 floatx80_is_signaling_nan(b, status)) { 7570 float_raise(float_flag_invalid, status); 7571 } 7572 return float_relation_unordered; 7573 } 7574 aSign = extractFloatx80Sign( a ); 7575 bSign = extractFloatx80Sign( b ); 7576 if ( aSign != bSign ) { 7577 7578 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7579 ( ( a.low | b.low ) == 0 ) ) { 7580 /* zero case */ 7581 return float_relation_equal; 7582 } else { 7583 return 1 - (2 * aSign); 7584 } 7585 } else { 7586 if (a.low == b.low && a.high == b.high) { 7587 return float_relation_equal; 7588 } else { 7589 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7590 } 7591 } 7592 } 7593 7594 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7595 { 7596 return floatx80_compare_internal(a, b, 0, status); 7597 } 7598 7599 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7600 { 7601 return floatx80_compare_internal(a, b, 1, status); 7602 } 7603 7604 static inline int float128_compare_internal(float128 a, float128 b, 7605 int is_quiet, float_status *status) 7606 { 7607 flag aSign, bSign; 7608 7609 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7610 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7611 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7612 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7613 if (!is_quiet || 7614 float128_is_signaling_nan(a, status) || 7615 float128_is_signaling_nan(b, status)) { 7616 float_raise(float_flag_invalid, status); 7617 } 7618 return float_relation_unordered; 7619 } 7620 aSign = extractFloat128Sign( a ); 7621 bSign = extractFloat128Sign( b ); 7622 if ( aSign != bSign ) { 7623 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7624 /* zero case */ 
7625 return float_relation_equal; 7626 } else { 7627 return 1 - (2 * aSign); 7628 } 7629 } else { 7630 if (a.low == b.low && a.high == b.high) { 7631 return float_relation_equal; 7632 } else { 7633 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7634 } 7635 } 7636 } 7637 7638 int float128_compare(float128 a, float128 b, float_status *status) 7639 { 7640 return float128_compare_internal(a, b, 0, status); 7641 } 7642 7643 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7644 { 7645 return float128_compare_internal(a, b, 1, status); 7646 } 7647 7648 /* min() and max() functions. These can't be implemented as 7649 * 'compare and pick one input' because that would mishandle 7650 * NaNs and +0 vs -0. 7651 * 7652 * minnum() and maxnum() functions. These are similar to the min() 7653 * and max() functions but if one of the arguments is a QNaN and 7654 * the other is numerical then the numerical argument is returned. 7655 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7656 * and maxNum() operations. min() and max() are the typical min/max 7657 * semantics provided by many CPUs which predate that specification. 7658 * 7659 * minnummag() and maxnummag() functions correspond to minNumMag() 7660 * and minNumMag() from the IEEE-754 2008. 
7661 */ 7662 #define MINMAX(s) \ 7663 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7664 int ismin, int isieee, \ 7665 int ismag, \ 7666 float_status *status) \ 7667 { \ 7668 flag aSign, bSign; \ 7669 uint ## s ## _t av, bv, aav, abv; \ 7670 a = float ## s ## _squash_input_denormal(a, status); \ 7671 b = float ## s ## _squash_input_denormal(b, status); \ 7672 if (float ## s ## _is_any_nan(a) || \ 7673 float ## s ## _is_any_nan(b)) { \ 7674 if (isieee) { \ 7675 if (float ## s ## _is_quiet_nan(a, status) && \ 7676 !float ## s ##_is_any_nan(b)) { \ 7677 return b; \ 7678 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7679 !float ## s ## _is_any_nan(a)) { \ 7680 return a; \ 7681 } \ 7682 } \ 7683 return propagateFloat ## s ## NaN(a, b, status); \ 7684 } \ 7685 aSign = extractFloat ## s ## Sign(a); \ 7686 bSign = extractFloat ## s ## Sign(b); \ 7687 av = float ## s ## _val(a); \ 7688 bv = float ## s ## _val(b); \ 7689 if (ismag) { \ 7690 aav = float ## s ## _abs(av); \ 7691 abv = float ## s ## _abs(bv); \ 7692 if (aav != abv) { \ 7693 if (ismin) { \ 7694 return (aav < abv) ? a : b; \ 7695 } else { \ 7696 return (aav < abv) ? b : a; \ 7697 } \ 7698 } \ 7699 } \ 7700 if (aSign != bSign) { \ 7701 if (ismin) { \ 7702 return aSign ? a : b; \ 7703 } else { \ 7704 return aSign ? b : a; \ 7705 } \ 7706 } else { \ 7707 if (ismin) { \ 7708 return (aSign ^ (av < bv)) ? a : b; \ 7709 } else { \ 7710 return (aSign ^ (av < bv)) ? 
b : a; \ 7711 } \ 7712 } \ 7713 } \ 7714 \ 7715 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7716 float_status *status) \ 7717 { \ 7718 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7719 } \ 7720 \ 7721 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7722 float_status *status) \ 7723 { \ 7724 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7725 } \ 7726 \ 7727 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7728 float_status *status) \ 7729 { \ 7730 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7731 } \ 7732 \ 7733 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7734 float_status *status) \ 7735 { \ 7736 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7737 } \ 7738 \ 7739 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7740 float_status *status) \ 7741 { \ 7742 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7743 } \ 7744 \ 7745 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7746 float_status *status) \ 7747 { \ 7748 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7749 } 7750 7751 MINMAX(32) 7752 MINMAX(64) 7753 7754 7755 /* Multiply A by 2 raised to the power N. 
*/ 7756 float32 float32_scalbn(float32 a, int n, float_status *status) 7757 { 7758 flag aSign; 7759 int16_t aExp; 7760 uint32_t aSig; 7761 7762 a = float32_squash_input_denormal(a, status); 7763 aSig = extractFloat32Frac( a ); 7764 aExp = extractFloat32Exp( a ); 7765 aSign = extractFloat32Sign( a ); 7766 7767 if ( aExp == 0xFF ) { 7768 if ( aSig ) { 7769 return propagateFloat32NaN(a, a, status); 7770 } 7771 return a; 7772 } 7773 if (aExp != 0) { 7774 aSig |= 0x00800000; 7775 } else if (aSig == 0) { 7776 return a; 7777 } else { 7778 aExp++; 7779 } 7780 7781 if (n > 0x200) { 7782 n = 0x200; 7783 } else if (n < -0x200) { 7784 n = -0x200; 7785 } 7786 7787 aExp += n - 1; 7788 aSig <<= 7; 7789 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7790 } 7791 7792 float64 float64_scalbn(float64 a, int n, float_status *status) 7793 { 7794 flag aSign; 7795 int16_t aExp; 7796 uint64_t aSig; 7797 7798 a = float64_squash_input_denormal(a, status); 7799 aSig = extractFloat64Frac( a ); 7800 aExp = extractFloat64Exp( a ); 7801 aSign = extractFloat64Sign( a ); 7802 7803 if ( aExp == 0x7FF ) { 7804 if ( aSig ) { 7805 return propagateFloat64NaN(a, a, status); 7806 } 7807 return a; 7808 } 7809 if (aExp != 0) { 7810 aSig |= LIT64( 0x0010000000000000 ); 7811 } else if (aSig == 0) { 7812 return a; 7813 } else { 7814 aExp++; 7815 } 7816 7817 if (n > 0x1000) { 7818 n = 0x1000; 7819 } else if (n < -0x1000) { 7820 n = -0x1000; 7821 } 7822 7823 aExp += n - 1; 7824 aSig <<= 10; 7825 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7826 } 7827 7828 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7829 { 7830 flag aSign; 7831 int32_t aExp; 7832 uint64_t aSig; 7833 7834 if (floatx80_invalid_encoding(a)) { 7835 float_raise(float_flag_invalid, status); 7836 return floatx80_default_nan(status); 7837 } 7838 aSig = extractFloatx80Frac( a ); 7839 aExp = extractFloatx80Exp( a ); 7840 aSign = extractFloatx80Sign( a ); 7841 7842 if ( aExp == 0x7FFF ) { 7843 if ( 
aSig<<1 ) { 7844 return propagateFloatx80NaN(a, a, status); 7845 } 7846 return a; 7847 } 7848 7849 if (aExp == 0) { 7850 if (aSig == 0) { 7851 return a; 7852 } 7853 aExp++; 7854 } 7855 7856 if (n > 0x10000) { 7857 n = 0x10000; 7858 } else if (n < -0x10000) { 7859 n = -0x10000; 7860 } 7861 7862 aExp += n; 7863 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7864 aSign, aExp, aSig, 0, status); 7865 } 7866 7867 float128 float128_scalbn(float128 a, int n, float_status *status) 7868 { 7869 flag aSign; 7870 int32_t aExp; 7871 uint64_t aSig0, aSig1; 7872 7873 aSig1 = extractFloat128Frac1( a ); 7874 aSig0 = extractFloat128Frac0( a ); 7875 aExp = extractFloat128Exp( a ); 7876 aSign = extractFloat128Sign( a ); 7877 if ( aExp == 0x7FFF ) { 7878 if ( aSig0 | aSig1 ) { 7879 return propagateFloat128NaN(a, a, status); 7880 } 7881 return a; 7882 } 7883 if (aExp != 0) { 7884 aSig0 |= LIT64( 0x0001000000000000 ); 7885 } else if (aSig0 == 0 && aSig1 == 0) { 7886 return a; 7887 } else { 7888 aExp++; 7889 } 7890 7891 if (n > 0x10000) { 7892 n = 0x10000; 7893 } else if (n < -0x10000) { 7894 n = -0x10000; 7895 } 7896 7897 aExp += n - 1; 7898 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7899 , status); 7900 7901 } 7902