1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int_fast16_t extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 137 | and 7, and returns the properly rounded 32-bit integer corresponding to the 138 | input. If `zSign' is 1, the input is negated before being converted to an 139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 140 | is simply rounded to an integer, with the inexact exception raised if the 141 | input cannot be represented exactly as an integer. However, if the fixed- 142 | point input is too large, the invalid exception is raised and the largest 143 | positive or negative integer is returned. 
144 *----------------------------------------------------------------------------*/ 145 146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 147 { 148 int8_t roundingMode; 149 flag roundNearestEven; 150 int8_t roundIncrement, roundBits; 151 int32_t z; 152 153 roundingMode = status->float_rounding_mode; 154 roundNearestEven = ( roundingMode == float_round_nearest_even ); 155 switch (roundingMode) { 156 case float_round_nearest_even: 157 case float_round_ties_away: 158 roundIncrement = 0x40; 159 break; 160 case float_round_to_zero: 161 roundIncrement = 0; 162 break; 163 case float_round_up: 164 roundIncrement = zSign ? 0 : 0x7f; 165 break; 166 case float_round_down: 167 roundIncrement = zSign ? 0x7f : 0; 168 break; 169 default: 170 abort(); 171 } 172 roundBits = absZ & 0x7F; 173 absZ = ( absZ + roundIncrement )>>7; 174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 175 z = absZ; 176 if ( zSign ) z = - z; 177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 178 float_raise(float_flag_invalid, status); 179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 180 } 181 if (roundBits) { 182 status->float_exception_flags |= float_flag_inexact; 183 } 184 return z; 185 186 } 187 188 /*---------------------------------------------------------------------------- 189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 190 | `absZ1', with binary point between bits 63 and 64 (between the input words), 191 | and returns the properly rounded 64-bit integer corresponding to the input. 192 | If `zSign' is 1, the input is negated before being converted to an integer. 193 | Ordinarily, the fixed-point input is simply rounded to an integer, with 194 | the inexact exception raised if the input cannot be represented exactly as 195 | an integer. However, if the fixed-point input is too large, the invalid 196 | exception is raised and the largest positive or negative integer is 197 | returned. 
198 *----------------------------------------------------------------------------*/ 199 200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 201 float_status *status) 202 { 203 int8_t roundingMode; 204 flag roundNearestEven, increment; 205 int64_t z; 206 207 roundingMode = status->float_rounding_mode; 208 roundNearestEven = ( roundingMode == float_round_nearest_even ); 209 switch (roundingMode) { 210 case float_round_nearest_even: 211 case float_round_ties_away: 212 increment = ((int64_t) absZ1 < 0); 213 break; 214 case float_round_to_zero: 215 increment = 0; 216 break; 217 case float_round_up: 218 increment = !zSign && absZ1; 219 break; 220 case float_round_down: 221 increment = zSign && absZ1; 222 break; 223 default: 224 abort(); 225 } 226 if ( increment ) { 227 ++absZ0; 228 if ( absZ0 == 0 ) goto overflow; 229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 230 } 231 z = absZ0; 232 if ( zSign ) z = - z; 233 if ( z && ( ( z < 0 ) ^ zSign ) ) { 234 overflow: 235 float_raise(float_flag_invalid, status); 236 return 237 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 238 : LIT64( 0x7FFFFFFFFFFFFFFF ); 239 } 240 if (absZ1) { 241 status->float_exception_flags |= float_flag_inexact; 242 } 243 return z; 244 245 } 246 247 /*---------------------------------------------------------------------------- 248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 249 | `absZ1', with binary point between bits 63 and 64 (between the input words), 250 | and returns the properly rounded 64-bit unsigned integer corresponding to the 251 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 252 | with the inexact exception raised if the input cannot be represented exactly 253 | as an integer. However, if the fixed-point input is too large, the invalid 254 | exception is raised and the largest unsigned integer is returned. 
255 *----------------------------------------------------------------------------*/ 256 257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 258 uint64_t absZ1, float_status *status) 259 { 260 int8_t roundingMode; 261 flag roundNearestEven, increment; 262 263 roundingMode = status->float_rounding_mode; 264 roundNearestEven = (roundingMode == float_round_nearest_even); 265 switch (roundingMode) { 266 case float_round_nearest_even: 267 case float_round_ties_away: 268 increment = ((int64_t)absZ1 < 0); 269 break; 270 case float_round_to_zero: 271 increment = 0; 272 break; 273 case float_round_up: 274 increment = !zSign && absZ1; 275 break; 276 case float_round_down: 277 increment = zSign && absZ1; 278 break; 279 default: 280 abort(); 281 } 282 if (increment) { 283 ++absZ0; 284 if (absZ0 == 0) { 285 float_raise(float_flag_invalid, status); 286 return LIT64(0xFFFFFFFFFFFFFFFF); 287 } 288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 289 } 290 291 if (zSign && absZ0) { 292 float_raise(float_flag_invalid, status); 293 return 0; 294 } 295 296 if (absZ1) { 297 status->float_exception_flags |= float_flag_inexact; 298 } 299 return absZ0; 300 } 301 302 /*---------------------------------------------------------------------------- 303 | Returns the fraction bits of the single-precision floating-point value `a'. 304 *----------------------------------------------------------------------------*/ 305 306 static inline uint32_t extractFloat32Frac( float32 a ) 307 { 308 309 return float32_val(a) & 0x007FFFFF; 310 311 } 312 313 /*---------------------------------------------------------------------------- 314 | Returns the exponent bits of the single-precision floating-point value `a'. 
315 *----------------------------------------------------------------------------*/ 316 317 static inline int_fast16_t extractFloat32Exp(float32 a) 318 { 319 320 return ( float32_val(a)>>23 ) & 0xFF; 321 322 } 323 324 /*---------------------------------------------------------------------------- 325 | Returns the sign bit of the single-precision floating-point value `a'. 326 *----------------------------------------------------------------------------*/ 327 328 static inline flag extractFloat32Sign( float32 a ) 329 { 330 331 return float32_val(a)>>31; 332 333 } 334 335 /*---------------------------------------------------------------------------- 336 | If `a' is denormal and we are in flush-to-zero mode then set the 337 | input-denormal exception and return zero. Otherwise just return the value. 338 *----------------------------------------------------------------------------*/ 339 float32 float32_squash_input_denormal(float32 a, float_status *status) 340 { 341 if (status->flush_inputs_to_zero) { 342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 343 float_raise(float_flag_input_denormal, status); 344 return make_float32(float32_val(a) & 0x80000000); 345 } 346 } 347 return a; 348 } 349 350 /*---------------------------------------------------------------------------- 351 | Normalizes the subnormal single-precision floating-point value represented 352 | by the denormalized significand `aSig'. The normalized exponent and 353 | significand are stored at the locations pointed to by `zExpPtr' and 354 | `zSigPtr', respectively. 
355 *----------------------------------------------------------------------------*/ 356 357 static void 358 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr) 359 { 360 int8_t shiftCount; 361 362 shiftCount = countLeadingZeros32( aSig ) - 8; 363 *zSigPtr = aSig<<shiftCount; 364 *zExpPtr = 1 - shiftCount; 365 366 } 367 368 /*---------------------------------------------------------------------------- 369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 370 | single-precision floating-point value, returning the result. After being 371 | shifted into the proper positions, the three fields are simply added 372 | together to form the result. This means that any integer portion of `zSig' 373 | will be added into the exponent. Since a properly normalized significand 374 | will have an integer portion equal to 1, the `zExp' input should be 1 less 375 | than the desired result exponent whenever `zSig' is a complete, normalized 376 | significand. 377 *----------------------------------------------------------------------------*/ 378 379 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig) 380 { 381 382 return make_float32( 383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 384 385 } 386 387 /*---------------------------------------------------------------------------- 388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 389 | and significand `zSig', and returns the proper single-precision floating- 390 | point value corresponding to the abstract input. Ordinarily, the abstract 391 | value is simply rounded and packed into the single-precision format, with 392 | the inexact exception raised if the abstract input cannot be represented 393 | exactly. However, if the abstract value is too large, the overflow and 394 | inexact exceptions are raised and an infinity or maximal finite value is 395 | returned. 
If the abstract value is too small, the input value is rounded to 396 | a subnormal number, and the underflow and inexact exceptions are raised if 397 | the abstract input cannot be represented exactly as a subnormal single- 398 | precision floating-point number. 399 | The input significand `zSig' has its binary point between bits 30 400 | and 29, which is 7 bits to the left of the usual location. This shifted 401 | significand must be normalized or smaller. If `zSig' is not normalized, 402 | `zExp' must be 0; in that case, the result returned is a subnormal number, 403 | and it must not require rounding. In the usual case that `zSig' is 404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 405 | The handling of underflow and overflow follows the IEC/IEEE Standard for 406 | Binary Floating-Point Arithmetic. 407 *----------------------------------------------------------------------------*/ 408 409 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig, 410 float_status *status) 411 { 412 int8_t roundingMode; 413 flag roundNearestEven; 414 int8_t roundIncrement, roundBits; 415 flag isTiny; 416 417 roundingMode = status->float_rounding_mode; 418 roundNearestEven = ( roundingMode == float_round_nearest_even ); 419 switch (roundingMode) { 420 case float_round_nearest_even: 421 case float_round_ties_away: 422 roundIncrement = 0x40; 423 break; 424 case float_round_to_zero: 425 roundIncrement = 0; 426 break; 427 case float_round_up: 428 roundIncrement = zSign ? 0 : 0x7f; 429 break; 430 case float_round_down: 431 roundIncrement = zSign ? 
0x7f : 0; 432 break; 433 default: 434 abort(); 435 break; 436 } 437 roundBits = zSig & 0x7F; 438 if ( 0xFD <= (uint16_t) zExp ) { 439 if ( ( 0xFD < zExp ) 440 || ( ( zExp == 0xFD ) 441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 442 ) { 443 float_raise(float_flag_overflow | float_flag_inexact, status); 444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 445 } 446 if ( zExp < 0 ) { 447 if (status->flush_to_zero) { 448 float_raise(float_flag_output_denormal, status); 449 return packFloat32(zSign, 0, 0); 450 } 451 isTiny = 452 (status->float_detect_tininess 453 == float_tininess_before_rounding) 454 || ( zExp < -1 ) 455 || ( zSig + roundIncrement < 0x80000000 ); 456 shift32RightJamming( zSig, - zExp, &zSig ); 457 zExp = 0; 458 roundBits = zSig & 0x7F; 459 if (isTiny && roundBits) { 460 float_raise(float_flag_underflow, status); 461 } 462 } 463 } 464 if (roundBits) { 465 status->float_exception_flags |= float_flag_inexact; 466 } 467 zSig = ( zSig + roundIncrement )>>7; 468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 469 if ( zSig == 0 ) zExp = 0; 470 return packFloat32( zSign, zExp, zSig ); 471 472 } 473 474 /*---------------------------------------------------------------------------- 475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 476 | and significand `zSig', and returns the proper single-precision floating- 477 | point value corresponding to the abstract input. This routine is just like 478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 480 | floating-point exponent. 
481 *----------------------------------------------------------------------------*/ 482 483 static float32 484 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig, 485 float_status *status) 486 { 487 int8_t shiftCount; 488 489 shiftCount = countLeadingZeros32( zSig ) - 1; 490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 491 status); 492 493 } 494 495 /*---------------------------------------------------------------------------- 496 | Returns the fraction bits of the double-precision floating-point value `a'. 497 *----------------------------------------------------------------------------*/ 498 499 static inline uint64_t extractFloat64Frac( float64 a ) 500 { 501 502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 503 504 } 505 506 /*---------------------------------------------------------------------------- 507 | Returns the exponent bits of the double-precision floating-point value `a'. 508 *----------------------------------------------------------------------------*/ 509 510 static inline int_fast16_t extractFloat64Exp(float64 a) 511 { 512 513 return ( float64_val(a)>>52 ) & 0x7FF; 514 515 } 516 517 /*---------------------------------------------------------------------------- 518 | Returns the sign bit of the double-precision floating-point value `a'. 519 *----------------------------------------------------------------------------*/ 520 521 static inline flag extractFloat64Sign( float64 a ) 522 { 523 524 return float64_val(a)>>63; 525 526 } 527 528 /*---------------------------------------------------------------------------- 529 | If `a' is denormal and we are in flush-to-zero mode then set the 530 | input-denormal exception and return zero. Otherwise just return the value. 
531 *----------------------------------------------------------------------------*/ 532 float64 float64_squash_input_denormal(float64 a, float_status *status) 533 { 534 if (status->flush_inputs_to_zero) { 535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 536 float_raise(float_flag_input_denormal, status); 537 return make_float64(float64_val(a) & (1ULL << 63)); 538 } 539 } 540 return a; 541 } 542 543 /*---------------------------------------------------------------------------- 544 | Normalizes the subnormal double-precision floating-point value represented 545 | by the denormalized significand `aSig'. The normalized exponent and 546 | significand are stored at the locations pointed to by `zExpPtr' and 547 | `zSigPtr', respectively. 548 *----------------------------------------------------------------------------*/ 549 550 static void 551 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr) 552 { 553 int8_t shiftCount; 554 555 shiftCount = countLeadingZeros64( aSig ) - 11; 556 *zSigPtr = aSig<<shiftCount; 557 *zExpPtr = 1 - shiftCount; 558 559 } 560 561 /*---------------------------------------------------------------------------- 562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 563 | double-precision floating-point value, returning the result. After being 564 | shifted into the proper positions, the three fields are simply added 565 | together to form the result. This means that any integer portion of `zSig' 566 | will be added into the exponent. Since a properly normalized significand 567 | will have an integer portion equal to 1, the `zExp' input should be 1 less 568 | than the desired result exponent whenever `zSig' is a complete, normalized 569 | significand. 
570 *----------------------------------------------------------------------------*/ 571 572 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig) 573 { 574 575 return make_float64( 576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 577 578 } 579 580 /*---------------------------------------------------------------------------- 581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 582 | and significand `zSig', and returns the proper double-precision floating- 583 | point value corresponding to the abstract input. Ordinarily, the abstract 584 | value is simply rounded and packed into the double-precision format, with 585 | the inexact exception raised if the abstract input cannot be represented 586 | exactly. However, if the abstract value is too large, the overflow and 587 | inexact exceptions are raised and an infinity or maximal finite value is 588 | returned. If the abstract value is too small, the input value is rounded to 589 | a subnormal number, and the underflow and inexact exceptions are raised if 590 | the abstract input cannot be represented exactly as a subnormal double- 591 | precision floating-point number. 592 | The input significand `zSig' has its binary point between bits 62 593 | and 61, which is 10 bits to the left of the usual location. This shifted 594 | significand must be normalized or smaller. If `zSig' is not normalized, 595 | `zExp' must be 0; in that case, the result returned is a subnormal number, 596 | and it must not require rounding. In the usual case that `zSig' is 597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 598 | The handling of underflow and overflow follows the IEC/IEEE Standard for 599 | Binary Floating-Point Arithmetic. 
600 *----------------------------------------------------------------------------*/ 601 602 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig, 603 float_status *status) 604 { 605 int8_t roundingMode; 606 flag roundNearestEven; 607 int_fast16_t roundIncrement, roundBits; 608 flag isTiny; 609 610 roundingMode = status->float_rounding_mode; 611 roundNearestEven = ( roundingMode == float_round_nearest_even ); 612 switch (roundingMode) { 613 case float_round_nearest_even: 614 case float_round_ties_away: 615 roundIncrement = 0x200; 616 break; 617 case float_round_to_zero: 618 roundIncrement = 0; 619 break; 620 case float_round_up: 621 roundIncrement = zSign ? 0 : 0x3ff; 622 break; 623 case float_round_down: 624 roundIncrement = zSign ? 0x3ff : 0; 625 break; 626 default: 627 abort(); 628 } 629 roundBits = zSig & 0x3FF; 630 if ( 0x7FD <= (uint16_t) zExp ) { 631 if ( ( 0x7FD < zExp ) 632 || ( ( zExp == 0x7FD ) 633 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 634 ) { 635 float_raise(float_flag_overflow | float_flag_inexact, status); 636 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 637 } 638 if ( zExp < 0 ) { 639 if (status->flush_to_zero) { 640 float_raise(float_flag_output_denormal, status); 641 return packFloat64(zSign, 0, 0); 642 } 643 isTiny = 644 (status->float_detect_tininess 645 == float_tininess_before_rounding) 646 || ( zExp < -1 ) 647 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 648 shift64RightJamming( zSig, - zExp, &zSig ); 649 zExp = 0; 650 roundBits = zSig & 0x3FF; 651 if (isTiny && roundBits) { 652 float_raise(float_flag_underflow, status); 653 } 654 } 655 } 656 if (roundBits) { 657 status->float_exception_flags |= float_flag_inexact; 658 } 659 zSig = ( zSig + roundIncrement )>>10; 660 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 661 if ( zSig == 0 ) zExp = 0; 662 return packFloat64( zSign, zExp, zSig ); 663 664 } 665 666 
/*---------------------------------------------------------------------------- 667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 668 | and significand `zSig', and returns the proper double-precision floating- 669 | point value corresponding to the abstract input. This routine is just like 670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 672 | floating-point exponent. 673 *----------------------------------------------------------------------------*/ 674 675 static float64 676 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig, 677 float_status *status) 678 { 679 int8_t shiftCount; 680 681 shiftCount = countLeadingZeros64( zSig ) - 1; 682 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 683 status); 684 685 } 686 687 /*---------------------------------------------------------------------------- 688 | Returns the fraction bits of the extended double-precision floating-point 689 | value `a'. 690 *----------------------------------------------------------------------------*/ 691 692 static inline uint64_t extractFloatx80Frac( floatx80 a ) 693 { 694 695 return a.low; 696 697 } 698 699 /*---------------------------------------------------------------------------- 700 | Returns the exponent bits of the extended double-precision floating-point 701 | value `a'. 702 *----------------------------------------------------------------------------*/ 703 704 static inline int32_t extractFloatx80Exp( floatx80 a ) 705 { 706 707 return a.high & 0x7FFF; 708 709 } 710 711 /*---------------------------------------------------------------------------- 712 | Returns the sign bit of the extended double-precision floating-point value 713 | `a'. 
714 *----------------------------------------------------------------------------*/ 715 716 static inline flag extractFloatx80Sign( floatx80 a ) 717 { 718 719 return a.high>>15; 720 721 } 722 723 /*---------------------------------------------------------------------------- 724 | Normalizes the subnormal extended double-precision floating-point value 725 | represented by the denormalized significand `aSig'. The normalized exponent 726 | and significand are stored at the locations pointed to by `zExpPtr' and 727 | `zSigPtr', respectively. 728 *----------------------------------------------------------------------------*/ 729 730 static void 731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 732 { 733 int8_t shiftCount; 734 735 shiftCount = countLeadingZeros64( aSig ); 736 *zSigPtr = aSig<<shiftCount; 737 *zExpPtr = 1 - shiftCount; 738 739 } 740 741 /*---------------------------------------------------------------------------- 742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 743 | extended double-precision floating-point value, returning the result. 744 *----------------------------------------------------------------------------*/ 745 746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 747 { 748 floatx80 z; 749 750 z.low = zSig; 751 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 752 return z; 753 754 } 755 756 /*---------------------------------------------------------------------------- 757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 758 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 759 | and returns the proper extended double-precision floating-point value 760 | corresponding to the abstract input. Ordinarily, the abstract value is 761 | rounded and packed into the extended double-precision format, with the 762 | inexact exception raised if the abstract input cannot be represented 763 | exactly. 
However, if the abstract value is too large, the overflow and 764 | inexact exceptions are raised and an infinity or maximal finite value is 765 | returned. If the abstract value is too small, the input value is rounded to 766 | a subnormal number, and the underflow and inexact exceptions are raised if 767 | the abstract input cannot be represented exactly as a subnormal extended 768 | double-precision floating-point number. 769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 770 | number of bits as single or double precision, respectively. Otherwise, the 771 | result is rounded to the full precision of the extended double-precision 772 | format. 773 | The input significand must be normalized or smaller. If the input 774 | significand is not normalized, `zExp' must be 0; in that case, the result 775 | returned is a subnormal number, and it must not require rounding. The 776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 777 | Floating-Point Arithmetic. 
778 *----------------------------------------------------------------------------*/ 779 780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 781 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 782 float_status *status) 783 { 784 int8_t roundingMode; 785 flag roundNearestEven, increment, isTiny; 786 int64_t roundIncrement, roundMask, roundBits; 787 788 roundingMode = status->float_rounding_mode; 789 roundNearestEven = ( roundingMode == float_round_nearest_even ); 790 if ( roundingPrecision == 80 ) goto precision80; 791 if ( roundingPrecision == 64 ) { 792 roundIncrement = LIT64( 0x0000000000000400 ); 793 roundMask = LIT64( 0x00000000000007FF ); 794 } 795 else if ( roundingPrecision == 32 ) { 796 roundIncrement = LIT64( 0x0000008000000000 ); 797 roundMask = LIT64( 0x000000FFFFFFFFFF ); 798 } 799 else { 800 goto precision80; 801 } 802 zSig0 |= ( zSig1 != 0 ); 803 switch (roundingMode) { 804 case float_round_nearest_even: 805 case float_round_ties_away: 806 break; 807 case float_round_to_zero: 808 roundIncrement = 0; 809 break; 810 case float_round_up: 811 roundIncrement = zSign ? 0 : roundMask; 812 break; 813 case float_round_down: 814 roundIncrement = zSign ? 
roundMask : 0; 815 break; 816 default: 817 abort(); 818 } 819 roundBits = zSig0 & roundMask; 820 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 821 if ( ( 0x7FFE < zExp ) 822 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 823 ) { 824 goto overflow; 825 } 826 if ( zExp <= 0 ) { 827 if (status->flush_to_zero) { 828 float_raise(float_flag_output_denormal, status); 829 return packFloatx80(zSign, 0, 0); 830 } 831 isTiny = 832 (status->float_detect_tininess 833 == float_tininess_before_rounding) 834 || ( zExp < 0 ) 835 || ( zSig0 <= zSig0 + roundIncrement ); 836 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 837 zExp = 0; 838 roundBits = zSig0 & roundMask; 839 if (isTiny && roundBits) { 840 float_raise(float_flag_underflow, status); 841 } 842 if (roundBits) { 843 status->float_exception_flags |= float_flag_inexact; 844 } 845 zSig0 += roundIncrement; 846 if ( (int64_t) zSig0 < 0 ) zExp = 1; 847 roundIncrement = roundMask + 1; 848 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 849 roundMask |= roundIncrement; 850 } 851 zSig0 &= ~ roundMask; 852 return packFloatx80( zSign, zExp, zSig0 ); 853 } 854 } 855 if (roundBits) { 856 status->float_exception_flags |= float_flag_inexact; 857 } 858 zSig0 += roundIncrement; 859 if ( zSig0 < roundIncrement ) { 860 ++zExp; 861 zSig0 = LIT64( 0x8000000000000000 ); 862 } 863 roundIncrement = roundMask + 1; 864 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 865 roundMask |= roundIncrement; 866 } 867 zSig0 &= ~ roundMask; 868 if ( zSig0 == 0 ) zExp = 0; 869 return packFloatx80( zSign, zExp, zSig0 ); 870 precision80: 871 switch (roundingMode) { 872 case float_round_nearest_even: 873 case float_round_ties_away: 874 increment = ((int64_t)zSig1 < 0); 875 break; 876 case float_round_to_zero: 877 increment = 0; 878 break; 879 case float_round_up: 880 increment = !zSign && zSig1; 881 break; 882 case float_round_down: 883 increment = zSign && zSig1; 884 break; 885 default: 886 abort(); 887 } 888 if ( 
0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 889 if ( ( 0x7FFE < zExp ) 890 || ( ( zExp == 0x7FFE ) 891 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 892 && increment 893 ) 894 ) { 895 roundMask = 0; 896 overflow: 897 float_raise(float_flag_overflow | float_flag_inexact, status); 898 if ( ( roundingMode == float_round_to_zero ) 899 || ( zSign && ( roundingMode == float_round_up ) ) 900 || ( ! zSign && ( roundingMode == float_round_down ) ) 901 ) { 902 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 903 } 904 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 905 } 906 if ( zExp <= 0 ) { 907 isTiny = 908 (status->float_detect_tininess 909 == float_tininess_before_rounding) 910 || ( zExp < 0 ) 911 || ! increment 912 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 913 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 914 zExp = 0; 915 if (isTiny && zSig1) { 916 float_raise(float_flag_underflow, status); 917 } 918 if (zSig1) { 919 status->float_exception_flags |= float_flag_inexact; 920 } 921 switch (roundingMode) { 922 case float_round_nearest_even: 923 case float_round_ties_away: 924 increment = ((int64_t)zSig1 < 0); 925 break; 926 case float_round_to_zero: 927 increment = 0; 928 break; 929 case float_round_up: 930 increment = !zSign && zSig1; 931 break; 932 case float_round_down: 933 increment = zSign && zSig1; 934 break; 935 default: 936 abort(); 937 } 938 if ( increment ) { 939 ++zSig0; 940 zSig0 &= 941 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 942 if ( (int64_t) zSig0 < 0 ) zExp = 1; 943 } 944 return packFloatx80( zSign, zExp, zSig0 ); 945 } 946 } 947 if (zSig1) { 948 status->float_exception_flags |= float_flag_inexact; 949 } 950 if ( increment ) { 951 ++zSig0; 952 if ( zSig0 == 0 ) { 953 ++zExp; 954 zSig0 = LIT64( 0x8000000000000000 ); 955 } 956 else { 957 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 958 } 959 } 960 else { 961 if ( zSig0 == 0 ) zExp = 0; 962 } 963 return packFloatx80( zSign, zExp, 
zSig0 ); 964 965 } 966 967 /*---------------------------------------------------------------------------- 968 | Takes an abstract floating-point value having sign `zSign', exponent 969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 970 | and returns the proper extended double-precision floating-point value 971 | corresponding to the abstract input. This routine is just like 972 | `roundAndPackFloatx80' except that the input significand does not have to be 973 | normalized. 974 *----------------------------------------------------------------------------*/ 975 976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 977 flag zSign, int32_t zExp, 978 uint64_t zSig0, uint64_t zSig1, 979 float_status *status) 980 { 981 int8_t shiftCount; 982 983 if ( zSig0 == 0 ) { 984 zSig0 = zSig1; 985 zSig1 = 0; 986 zExp -= 64; 987 } 988 shiftCount = countLeadingZeros64( zSig0 ); 989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 990 zExp -= shiftCount; 991 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 992 zSig0, zSig1, status); 993 994 } 995 996 /*---------------------------------------------------------------------------- 997 | Returns the least-significant 64 fraction bits of the quadruple-precision 998 | floating-point value `a'. 999 *----------------------------------------------------------------------------*/ 1000 1001 static inline uint64_t extractFloat128Frac1( float128 a ) 1002 { 1003 1004 return a.low; 1005 1006 } 1007 1008 /*---------------------------------------------------------------------------- 1009 | Returns the most-significant 48 fraction bits of the quadruple-precision 1010 | floating-point value `a'. 
1011 *----------------------------------------------------------------------------*/ 1012 1013 static inline uint64_t extractFloat128Frac0( float128 a ) 1014 { 1015 1016 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1017 1018 } 1019 1020 /*---------------------------------------------------------------------------- 1021 | Returns the exponent bits of the quadruple-precision floating-point value 1022 | `a'. 1023 *----------------------------------------------------------------------------*/ 1024 1025 static inline int32_t extractFloat128Exp( float128 a ) 1026 { 1027 1028 return ( a.high>>48 ) & 0x7FFF; 1029 1030 } 1031 1032 /*---------------------------------------------------------------------------- 1033 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1034 *----------------------------------------------------------------------------*/ 1035 1036 static inline flag extractFloat128Sign( float128 a ) 1037 { 1038 1039 return a.high>>63; 1040 1041 } 1042 1043 /*---------------------------------------------------------------------------- 1044 | Normalizes the subnormal quadruple-precision floating-point value 1045 | represented by the denormalized significand formed by the concatenation of 1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1048 | significand are stored at the location pointed to by `zSig0Ptr', and the 1049 | least significant 64 bits of the normalized significand are stored at the 1050 | location pointed to by `zSig1Ptr'. 
1051 *----------------------------------------------------------------------------*/ 1052 1053 static void 1054 normalizeFloat128Subnormal( 1055 uint64_t aSig0, 1056 uint64_t aSig1, 1057 int32_t *zExpPtr, 1058 uint64_t *zSig0Ptr, 1059 uint64_t *zSig1Ptr 1060 ) 1061 { 1062 int8_t shiftCount; 1063 1064 if ( aSig0 == 0 ) { 1065 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1066 if ( shiftCount < 0 ) { 1067 *zSig0Ptr = aSig1>>( - shiftCount ); 1068 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1069 } 1070 else { 1071 *zSig0Ptr = aSig1<<shiftCount; 1072 *zSig1Ptr = 0; 1073 } 1074 *zExpPtr = - shiftCount - 63; 1075 } 1076 else { 1077 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1078 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1079 *zExpPtr = 1 - shiftCount; 1080 } 1081 1082 } 1083 1084 /*---------------------------------------------------------------------------- 1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1087 | floating-point value, returning the result. After being shifted into the 1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1089 | added together to form the most significant 32 bits of the result. This 1090 | means that any integer portion of `zSig0' will be added into the exponent. 1091 | Since a properly normalized significand will have an integer portion equal 1092 | to 1, the `zExp' input should be 1 less than the desired result exponent 1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1094 | significand. 
1095 *----------------------------------------------------------------------------*/ 1096 1097 static inline float128 1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1099 { 1100 float128 z; 1101 1102 z.low = zSig1; 1103 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1104 return z; 1105 1106 } 1107 1108 /*---------------------------------------------------------------------------- 1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1110 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1111 | and `zSig2', and returns the proper quadruple-precision floating-point value 1112 | corresponding to the abstract input. Ordinarily, the abstract value is 1113 | simply rounded and packed into the quadruple-precision format, with the 1114 | inexact exception raised if the abstract input cannot be represented 1115 | exactly. However, if the abstract value is too large, the overflow and 1116 | inexact exceptions are raised and an infinity or maximal finite value is 1117 | returned. If the abstract value is too small, the input value is rounded to 1118 | a subnormal number, and the underflow and inexact exceptions are raised if 1119 | the abstract input cannot be represented exactly as a subnormal quadruple- 1120 | precision floating-point number. 1121 | The input significand must be normalized or smaller. If the input 1122 | significand is not normalized, `zExp' must be 0; in that case, the result 1123 | returned is a subnormal number, and it must not require rounding. In the 1124 | usual case that the input significand is normalized, `zExp' must be 1 less 1125 | than the ``true'' floating-point exponent. The handling of underflow and 1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1127 *----------------------------------------------------------------------------*/ 1128 1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 1130 uint64_t zSig0, uint64_t zSig1, 1131 uint64_t zSig2, float_status *status) 1132 { 1133 int8_t roundingMode; 1134 flag roundNearestEven, increment, isTiny; 1135 1136 roundingMode = status->float_rounding_mode; 1137 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1138 switch (roundingMode) { 1139 case float_round_nearest_even: 1140 case float_round_ties_away: 1141 increment = ((int64_t)zSig2 < 0); 1142 break; 1143 case float_round_to_zero: 1144 increment = 0; 1145 break; 1146 case float_round_up: 1147 increment = !zSign && zSig2; 1148 break; 1149 case float_round_down: 1150 increment = zSign && zSig2; 1151 break; 1152 default: 1153 abort(); 1154 } 1155 if ( 0x7FFD <= (uint32_t) zExp ) { 1156 if ( ( 0x7FFD < zExp ) 1157 || ( ( zExp == 0x7FFD ) 1158 && eq128( 1159 LIT64( 0x0001FFFFFFFFFFFF ), 1160 LIT64( 0xFFFFFFFFFFFFFFFF ), 1161 zSig0, 1162 zSig1 1163 ) 1164 && increment 1165 ) 1166 ) { 1167 float_raise(float_flag_overflow | float_flag_inexact, status); 1168 if ( ( roundingMode == float_round_to_zero ) 1169 || ( zSign && ( roundingMode == float_round_up ) ) 1170 || ( ! zSign && ( roundingMode == float_round_down ) ) 1171 ) { 1172 return 1173 packFloat128( 1174 zSign, 1175 0x7FFE, 1176 LIT64( 0x0000FFFFFFFFFFFF ), 1177 LIT64( 0xFFFFFFFFFFFFFFFF ) 1178 ); 1179 } 1180 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1181 } 1182 if ( zExp < 0 ) { 1183 if (status->flush_to_zero) { 1184 float_raise(float_flag_output_denormal, status); 1185 return packFloat128(zSign, 0, 0, 0); 1186 } 1187 isTiny = 1188 (status->float_detect_tininess 1189 == float_tininess_before_rounding) 1190 || ( zExp < -1 ) 1191 || ! 
increment 1192 || lt128( 1193 zSig0, 1194 zSig1, 1195 LIT64( 0x0001FFFFFFFFFFFF ), 1196 LIT64( 0xFFFFFFFFFFFFFFFF ) 1197 ); 1198 shift128ExtraRightJamming( 1199 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1200 zExp = 0; 1201 if (isTiny && zSig2) { 1202 float_raise(float_flag_underflow, status); 1203 } 1204 switch (roundingMode) { 1205 case float_round_nearest_even: 1206 case float_round_ties_away: 1207 increment = ((int64_t)zSig2 < 0); 1208 break; 1209 case float_round_to_zero: 1210 increment = 0; 1211 break; 1212 case float_round_up: 1213 increment = !zSign && zSig2; 1214 break; 1215 case float_round_down: 1216 increment = zSign && zSig2; 1217 break; 1218 default: 1219 abort(); 1220 } 1221 } 1222 } 1223 if (zSig2) { 1224 status->float_exception_flags |= float_flag_inexact; 1225 } 1226 if ( increment ) { 1227 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1228 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1229 } 1230 else { 1231 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1232 } 1233 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1234 1235 } 1236 1237 /*---------------------------------------------------------------------------- 1238 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1239 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1240 | returns the proper quadruple-precision floating-point value corresponding 1241 | to the abstract input. This routine is just like `roundAndPackFloat128' 1242 | except that the input significand has fewer bits and does not have to be 1243 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1244 | point exponent. 
1245 *----------------------------------------------------------------------------*/ 1246 1247 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1248 uint64_t zSig0, uint64_t zSig1, 1249 float_status *status) 1250 { 1251 int8_t shiftCount; 1252 uint64_t zSig2; 1253 1254 if ( zSig0 == 0 ) { 1255 zSig0 = zSig1; 1256 zSig1 = 0; 1257 zExp -= 64; 1258 } 1259 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1260 if ( 0 <= shiftCount ) { 1261 zSig2 = 0; 1262 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1263 } 1264 else { 1265 shift128ExtraRightJamming( 1266 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1267 } 1268 zExp -= shiftCount; 1269 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1270 1271 } 1272 1273 /*---------------------------------------------------------------------------- 1274 | Returns the result of converting the 32-bit two's complement integer `a' 1275 | to the single-precision floating-point format. The conversion is performed 1276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1277 *----------------------------------------------------------------------------*/ 1278 1279 float32 int32_to_float32(int32_t a, float_status *status) 1280 { 1281 flag zSign; 1282 1283 if ( a == 0 ) return float32_zero; 1284 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1285 zSign = ( a < 0 ); 1286 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1287 } 1288 1289 /*---------------------------------------------------------------------------- 1290 | Returns the result of converting the 32-bit two's complement integer `a' 1291 | to the double-precision floating-point format. The conversion is performed 1292 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1293 *----------------------------------------------------------------------------*/ 1294 1295 float64 int32_to_float64(int32_t a, float_status *status) 1296 { 1297 flag zSign; 1298 uint32_t absA; 1299 int8_t shiftCount; 1300 uint64_t zSig; 1301 1302 if ( a == 0 ) return float64_zero; 1303 zSign = ( a < 0 ); 1304 absA = zSign ? - a : a; 1305 shiftCount = countLeadingZeros32( absA ) + 21; 1306 zSig = absA; 1307 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1308 1309 } 1310 1311 /*---------------------------------------------------------------------------- 1312 | Returns the result of converting the 32-bit two's complement integer `a' 1313 | to the extended double-precision floating-point format. The conversion 1314 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1315 | Arithmetic. 1316 *----------------------------------------------------------------------------*/ 1317 1318 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1319 { 1320 flag zSign; 1321 uint32_t absA; 1322 int8_t shiftCount; 1323 uint64_t zSig; 1324 1325 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1326 zSign = ( a < 0 ); 1327 absA = zSign ? - a : a; 1328 shiftCount = countLeadingZeros32( absA ) + 32; 1329 zSig = absA; 1330 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1331 1332 } 1333 1334 /*---------------------------------------------------------------------------- 1335 | Returns the result of converting the 32-bit two's complement integer `a' to 1336 | the quadruple-precision floating-point format. The conversion is performed 1337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1338 *----------------------------------------------------------------------------*/ 1339 1340 float128 int32_to_float128(int32_t a, float_status *status) 1341 { 1342 flag zSign; 1343 uint32_t absA; 1344 int8_t shiftCount; 1345 uint64_t zSig0; 1346 1347 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1348 zSign = ( a < 0 ); 1349 absA = zSign ? - a : a; 1350 shiftCount = countLeadingZeros32( absA ) + 17; 1351 zSig0 = absA; 1352 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1353 1354 } 1355 1356 /*---------------------------------------------------------------------------- 1357 | Returns the result of converting the 64-bit two's complement integer `a' 1358 | to the single-precision floating-point format. The conversion is performed 1359 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1360 *----------------------------------------------------------------------------*/ 1361 1362 float32 int64_to_float32(int64_t a, float_status *status) 1363 { 1364 flag zSign; 1365 uint64_t absA; 1366 int8_t shiftCount; 1367 1368 if ( a == 0 ) return float32_zero; 1369 zSign = ( a < 0 ); 1370 absA = zSign ? - a : a; 1371 shiftCount = countLeadingZeros64( absA ) - 40; 1372 if ( 0 <= shiftCount ) { 1373 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1374 } 1375 else { 1376 shiftCount += 7; 1377 if ( shiftCount < 0 ) { 1378 shift64RightJamming( absA, - shiftCount, &absA ); 1379 } 1380 else { 1381 absA <<= shiftCount; 1382 } 1383 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1384 } 1385 1386 } 1387 1388 /*---------------------------------------------------------------------------- 1389 | Returns the result of converting the 64-bit two's complement integer `a' 1390 | to the double-precision floating-point format. The conversion is performed 1391 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1392 *----------------------------------------------------------------------------*/ 1393 1394 float64 int64_to_float64(int64_t a, float_status *status) 1395 { 1396 flag zSign; 1397 1398 if ( a == 0 ) return float64_zero; 1399 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1400 return packFloat64( 1, 0x43E, 0 ); 1401 } 1402 zSign = ( a < 0 ); 1403 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1404 } 1405 1406 /*---------------------------------------------------------------------------- 1407 | Returns the result of converting the 64-bit two's complement integer `a' 1408 | to the extended double-precision floating-point format. The conversion 1409 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1410 | Arithmetic. 1411 *----------------------------------------------------------------------------*/ 1412 1413 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1414 { 1415 flag zSign; 1416 uint64_t absA; 1417 int8_t shiftCount; 1418 1419 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1420 zSign = ( a < 0 ); 1421 absA = zSign ? - a : a; 1422 shiftCount = countLeadingZeros64( absA ); 1423 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1424 1425 } 1426 1427 /*---------------------------------------------------------------------------- 1428 | Returns the result of converting the 64-bit two's complement integer `a' to 1429 | the quadruple-precision floating-point format. The conversion is performed 1430 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1431 *----------------------------------------------------------------------------*/ 1432 1433 float128 int64_to_float128(int64_t a, float_status *status) 1434 { 1435 flag zSign; 1436 uint64_t absA; 1437 int8_t shiftCount; 1438 int32_t zExp; 1439 uint64_t zSig0, zSig1; 1440 1441 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1442 zSign = ( a < 0 ); 1443 absA = zSign ? 
- a : a; 1444 shiftCount = countLeadingZeros64( absA ) + 49; 1445 zExp = 0x406E - shiftCount; 1446 if ( 64 <= shiftCount ) { 1447 zSig1 = 0; 1448 zSig0 = absA; 1449 shiftCount -= 64; 1450 } 1451 else { 1452 zSig1 = absA; 1453 zSig0 = 0; 1454 } 1455 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1456 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1457 1458 } 1459 1460 /*---------------------------------------------------------------------------- 1461 | Returns the result of converting the 64-bit unsigned integer `a' 1462 | to the single-precision floating-point format. The conversion is performed 1463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1464 *----------------------------------------------------------------------------*/ 1465 1466 float32 uint64_to_float32(uint64_t a, float_status *status) 1467 { 1468 int shiftcount; 1469 1470 if (a == 0) { 1471 return float32_zero; 1472 } 1473 1474 /* Determine (left) shift needed to put first set bit into bit posn 23 1475 * (since packFloat32() expects the binary point between bits 23 and 22); 1476 * this is the fast case for smallish numbers. 1477 */ 1478 shiftcount = countLeadingZeros64(a) - 40; 1479 if (shiftcount >= 0) { 1480 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1481 } 1482 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1483 * expects the binary point between bits 30 and 29, hence the + 7. 1484 */ 1485 shiftcount += 7; 1486 if (shiftcount < 0) { 1487 shift64RightJamming(a, -shiftcount, &a); 1488 } else { 1489 a <<= shiftcount; 1490 } 1491 1492 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1493 } 1494 1495 /*---------------------------------------------------------------------------- 1496 | Returns the result of converting the 64-bit unsigned integer `a' 1497 | to the double-precision floating-point format. The conversion is performed 1498 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1499 *----------------------------------------------------------------------------*/ 1500 1501 float64 uint64_to_float64(uint64_t a, float_status *status) 1502 { 1503 int exp = 0x43C; 1504 int shiftcount; 1505 1506 if (a == 0) { 1507 return float64_zero; 1508 } 1509 1510 shiftcount = countLeadingZeros64(a) - 1; 1511 if (shiftcount < 0) { 1512 shift64RightJamming(a, -shiftcount, &a); 1513 } else { 1514 a <<= shiftcount; 1515 } 1516 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1517 } 1518 1519 /*---------------------------------------------------------------------------- 1520 | Returns the result of converting the 64-bit unsigned integer `a' 1521 | to the quadruple-precision floating-point format. The conversion is performed 1522 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1523 *----------------------------------------------------------------------------*/ 1524 1525 float128 uint64_to_float128(uint64_t a, float_status *status) 1526 { 1527 if (a == 0) { 1528 return float128_zero; 1529 } 1530 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1531 } 1532 1533 /*---------------------------------------------------------------------------- 1534 | Returns the result of converting the single-precision floating-point value 1535 | `a' to the 32-bit two's complement integer format. The conversion is 1536 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1537 | Arithmetic---which means in particular that the conversion is rounded 1538 | according to the current rounding mode. If `a' is a NaN, the largest 1539 | positive integer is returned. Otherwise, if the conversion overflows, the 1540 | largest integer with the same sign as `a' is returned. 
1541 *----------------------------------------------------------------------------*/ 1542 1543 int32_t float32_to_int32(float32 a, float_status *status) 1544 { 1545 flag aSign; 1546 int_fast16_t aExp, shiftCount; 1547 uint32_t aSig; 1548 uint64_t aSig64; 1549 1550 a = float32_squash_input_denormal(a, status); 1551 aSig = extractFloat32Frac( a ); 1552 aExp = extractFloat32Exp( a ); 1553 aSign = extractFloat32Sign( a ); 1554 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1555 if ( aExp ) aSig |= 0x00800000; 1556 shiftCount = 0xAF - aExp; 1557 aSig64 = aSig; 1558 aSig64 <<= 32; 1559 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1560 return roundAndPackInt32(aSign, aSig64, status); 1561 1562 } 1563 1564 /*---------------------------------------------------------------------------- 1565 | Returns the result of converting the single-precision floating-point value 1566 | `a' to the 32-bit two's complement integer format. The conversion is 1567 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1568 | Arithmetic, except that the conversion is always rounded toward zero. 1569 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1570 | the conversion overflows, the largest integer with the same sign as `a' is 1571 | returned. 1572 *----------------------------------------------------------------------------*/ 1573 1574 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 1575 { 1576 flag aSign; 1577 int_fast16_t aExp, shiftCount; 1578 uint32_t aSig; 1579 int32_t z; 1580 a = float32_squash_input_denormal(a, status); 1581 1582 aSig = extractFloat32Frac( a ); 1583 aExp = extractFloat32Exp( a ); 1584 aSign = extractFloat32Sign( a ); 1585 shiftCount = aExp - 0x9E; 1586 if ( 0 <= shiftCount ) { 1587 if ( float32_val(a) != 0xCF000000 ) { 1588 float_raise(float_flag_invalid, status); 1589 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1590 } 1591 return (int32_t) 0x80000000; 1592 } 1593 else if ( aExp <= 0x7E ) { 1594 if (aExp | aSig) { 1595 status->float_exception_flags |= float_flag_inexact; 1596 } 1597 return 0; 1598 } 1599 aSig = ( aSig | 0x00800000 )<<8; 1600 z = aSig>>( - shiftCount ); 1601 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1602 status->float_exception_flags |= float_flag_inexact; 1603 } 1604 if ( aSign ) z = - z; 1605 return z; 1606 1607 } 1608 1609 /*---------------------------------------------------------------------------- 1610 | Returns the result of converting the single-precision floating-point value 1611 | `a' to the 16-bit two's complement integer format. The conversion is 1612 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1613 | Arithmetic, except that the conversion is always rounded toward zero. 1614 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1615 | the conversion overflows, the largest integer with the same sign as `a' is 1616 | returned. 1617 *----------------------------------------------------------------------------*/ 1618 1619 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1620 { 1621 flag aSign; 1622 int_fast16_t aExp, shiftCount; 1623 uint32_t aSig; 1624 int32_t z; 1625 1626 aSig = extractFloat32Frac( a ); 1627 aExp = extractFloat32Exp( a ); 1628 aSign = extractFloat32Sign( a ); 1629 shiftCount = aExp - 0x8E; 1630 if ( 0 <= shiftCount ) { 1631 if ( float32_val(a) != 0xC7000000 ) { 1632 float_raise(float_flag_invalid, status); 1633 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1634 return 0x7FFF; 1635 } 1636 } 1637 return (int32_t) 0xffff8000; 1638 } 1639 else if ( aExp <= 0x7E ) { 1640 if ( aExp | aSig ) { 1641 status->float_exception_flags |= float_flag_inexact; 1642 } 1643 return 0; 1644 } 1645 shiftCount -= 0x10; 1646 aSig = ( aSig | 0x00800000 )<<8; 1647 z = aSig>>( - shiftCount ); 1648 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1649 status->float_exception_flags |= float_flag_inexact; 1650 } 1651 if ( aSign ) { 1652 z = - z; 1653 } 1654 return z; 1655 1656 } 1657 1658 /*---------------------------------------------------------------------------- 1659 | Returns the result of converting the single-precision floating-point value 1660 | `a' to the 64-bit two's complement integer format. The conversion is 1661 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1662 | Arithmetic---which means in particular that the conversion is rounded 1663 | according to the current rounding mode. If `a' is a NaN, the largest 1664 | positive integer is returned. Otherwise, if the conversion overflows, the 1665 | largest integer with the same sign as `a' is returned. 1666 *----------------------------------------------------------------------------*/ 1667 1668 int64_t float32_to_int64(float32 a, float_status *status) 1669 { 1670 flag aSign; 1671 int_fast16_t aExp, shiftCount; 1672 uint32_t aSig; 1673 uint64_t aSig64, aSigExtra; 1674 a = float32_squash_input_denormal(a, status); 1675 1676 aSig = extractFloat32Frac( a ); 1677 aExp = extractFloat32Exp( a ); 1678 aSign = extractFloat32Sign( a ); 1679 shiftCount = 0xBE - aExp; 1680 if ( shiftCount < 0 ) { 1681 float_raise(float_flag_invalid, status); 1682 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1683 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1684 } 1685 return (int64_t) LIT64( 0x8000000000000000 ); 1686 } 1687 if ( aExp ) aSig |= 0x00800000; 1688 aSig64 = aSig; 1689 aSig64 <<= 40; 1690 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1691 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1692 1693 } 1694 1695 /*---------------------------------------------------------------------------- 1696 | Returns the result of converting the single-precision floating-point value 1697 | `a' to the 64-bit unsigned integer format. The conversion is 1698 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1699 | Arithmetic---which means in particular that the conversion is rounded 1700 | according to the current rounding mode. If `a' is a NaN, the largest 1701 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1702 | largest unsigned integer is returned. If the 'a' is negative, the result 1703 | is rounded and zero is returned; values that do not round to zero will 1704 | raise the inexact exception flag. 
1705 *----------------------------------------------------------------------------*/ 1706 1707 uint64_t float32_to_uint64(float32 a, float_status *status) 1708 { 1709 flag aSign; 1710 int_fast16_t aExp, shiftCount; 1711 uint32_t aSig; 1712 uint64_t aSig64, aSigExtra; 1713 a = float32_squash_input_denormal(a, status); 1714 1715 aSig = extractFloat32Frac(a); 1716 aExp = extractFloat32Exp(a); 1717 aSign = extractFloat32Sign(a); 1718 if ((aSign) && (aExp > 126)) { 1719 float_raise(float_flag_invalid, status); 1720 if (float32_is_any_nan(a)) { 1721 return LIT64(0xFFFFFFFFFFFFFFFF); 1722 } else { 1723 return 0; 1724 } 1725 } 1726 shiftCount = 0xBE - aExp; 1727 if (aExp) { 1728 aSig |= 0x00800000; 1729 } 1730 if (shiftCount < 0) { 1731 float_raise(float_flag_invalid, status); 1732 return LIT64(0xFFFFFFFFFFFFFFFF); 1733 } 1734 1735 aSig64 = aSig; 1736 aSig64 <<= 40; 1737 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1738 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1739 } 1740 1741 /*---------------------------------------------------------------------------- 1742 | Returns the result of converting the single-precision floating-point value 1743 | `a' to the 64-bit unsigned integer format. The conversion is 1744 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1745 | Arithmetic, except that the conversion is always rounded toward zero. If 1746 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1747 | conversion overflows, the largest unsigned integer is returned. If the 1748 | 'a' is negative, the result is rounded and zero is returned; values that do 1749 | not round to zero will raise the inexact flag. 
1750 *----------------------------------------------------------------------------*/ 1751 1752 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 1753 { 1754 signed char current_rounding_mode = status->float_rounding_mode; 1755 set_float_rounding_mode(float_round_to_zero, status); 1756 int64_t v = float32_to_uint64(a, status); 1757 set_float_rounding_mode(current_rounding_mode, status); 1758 return v; 1759 } 1760 1761 /*---------------------------------------------------------------------------- 1762 | Returns the result of converting the single-precision floating-point value 1763 | `a' to the 64-bit two's complement integer format. The conversion is 1764 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1765 | Arithmetic, except that the conversion is always rounded toward zero. If 1766 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1767 | conversion overflows, the largest integer with the same sign as `a' is 1768 | returned. 1769 *----------------------------------------------------------------------------*/ 1770 1771 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 1772 { 1773 flag aSign; 1774 int_fast16_t aExp, shiftCount; 1775 uint32_t aSig; 1776 uint64_t aSig64; 1777 int64_t z; 1778 a = float32_squash_input_denormal(a, status); 1779 1780 aSig = extractFloat32Frac( a ); 1781 aExp = extractFloat32Exp( a ); 1782 aSign = extractFloat32Sign( a ); 1783 shiftCount = aExp - 0xBE; 1784 if ( 0 <= shiftCount ) { 1785 if ( float32_val(a) != 0xDF000000 ) { 1786 float_raise(float_flag_invalid, status); 1787 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1788 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1789 } 1790 } 1791 return (int64_t) LIT64( 0x8000000000000000 ); 1792 } 1793 else if ( aExp <= 0x7E ) { 1794 if (aExp | aSig) { 1795 status->float_exception_flags |= float_flag_inexact; 1796 } 1797 return 0; 1798 } 1799 aSig64 = aSig | 0x00800000; 1800 aSig64 <<= 40; 1801 z = aSig64>>( - shiftCount ); 1802 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1803 status->float_exception_flags |= float_flag_inexact; 1804 } 1805 if ( aSign ) z = - z; 1806 return z; 1807 1808 } 1809 1810 /*---------------------------------------------------------------------------- 1811 | Returns the result of converting the single-precision floating-point value 1812 | `a' to the double-precision floating-point format. The conversion is 1813 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1814 | Arithmetic. 1815 *----------------------------------------------------------------------------*/ 1816 1817 float64 float32_to_float64(float32 a, float_status *status) 1818 { 1819 flag aSign; 1820 int_fast16_t aExp; 1821 uint32_t aSig; 1822 a = float32_squash_input_denormal(a, status); 1823 1824 aSig = extractFloat32Frac( a ); 1825 aExp = extractFloat32Exp( a ); 1826 aSign = extractFloat32Sign( a ); 1827 if ( aExp == 0xFF ) { 1828 if (aSig) { 1829 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1830 } 1831 return packFloat64( aSign, 0x7FF, 0 ); 1832 } 1833 if ( aExp == 0 ) { 1834 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1835 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1836 --aExp; 1837 } 1838 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1839 1840 } 1841 1842 /*---------------------------------------------------------------------------- 1843 | Returns the result of converting the single-precision floating-point value 1844 | `a' to the extended double-precision floating-point format. 
The conversion 1845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1846 | Arithmetic. 1847 *----------------------------------------------------------------------------*/ 1848 1849 floatx80 float32_to_floatx80(float32 a, float_status *status) 1850 { 1851 flag aSign; 1852 int_fast16_t aExp; 1853 uint32_t aSig; 1854 1855 a = float32_squash_input_denormal(a, status); 1856 aSig = extractFloat32Frac( a ); 1857 aExp = extractFloat32Exp( a ); 1858 aSign = extractFloat32Sign( a ); 1859 if ( aExp == 0xFF ) { 1860 if (aSig) { 1861 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1862 } 1863 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1864 } 1865 if ( aExp == 0 ) { 1866 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1867 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1868 } 1869 aSig |= 0x00800000; 1870 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1871 1872 } 1873 1874 /*---------------------------------------------------------------------------- 1875 | Returns the result of converting the single-precision floating-point value 1876 | `a' to the double-precision floating-point format. The conversion is 1877 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1878 | Arithmetic. 
1879 *----------------------------------------------------------------------------*/ 1880 1881 float128 float32_to_float128(float32 a, float_status *status) 1882 { 1883 flag aSign; 1884 int_fast16_t aExp; 1885 uint32_t aSig; 1886 1887 a = float32_squash_input_denormal(a, status); 1888 aSig = extractFloat32Frac( a ); 1889 aExp = extractFloat32Exp( a ); 1890 aSign = extractFloat32Sign( a ); 1891 if ( aExp == 0xFF ) { 1892 if (aSig) { 1893 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1894 } 1895 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1896 } 1897 if ( aExp == 0 ) { 1898 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1899 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1900 --aExp; 1901 } 1902 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1903 1904 } 1905 1906 /*---------------------------------------------------------------------------- 1907 | Rounds the single-precision floating-point value `a' to an integer, and 1908 | returns the result as a single-precision floating-point value. The 1909 | operation is performed according to the IEC/IEEE Standard for Binary 1910 | Floating-Point Arithmetic. 
1911 *----------------------------------------------------------------------------*/ 1912 1913 float32 float32_round_to_int(float32 a, float_status *status) 1914 { 1915 flag aSign; 1916 int_fast16_t aExp; 1917 uint32_t lastBitMask, roundBitsMask; 1918 uint32_t z; 1919 a = float32_squash_input_denormal(a, status); 1920 1921 aExp = extractFloat32Exp( a ); 1922 if ( 0x96 <= aExp ) { 1923 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1924 return propagateFloat32NaN(a, a, status); 1925 } 1926 return a; 1927 } 1928 if ( aExp <= 0x7E ) { 1929 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1930 status->float_exception_flags |= float_flag_inexact; 1931 aSign = extractFloat32Sign( a ); 1932 switch (status->float_rounding_mode) { 1933 case float_round_nearest_even: 1934 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1935 return packFloat32( aSign, 0x7F, 0 ); 1936 } 1937 break; 1938 case float_round_ties_away: 1939 if (aExp == 0x7E) { 1940 return packFloat32(aSign, 0x7F, 0); 1941 } 1942 break; 1943 case float_round_down: 1944 return make_float32(aSign ? 0xBF800000 : 0); 1945 case float_round_up: 1946 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 1947 } 1948 return packFloat32( aSign, 0, 0 ); 1949 } 1950 lastBitMask = 1; 1951 lastBitMask <<= 0x96 - aExp; 1952 roundBitsMask = lastBitMask - 1; 1953 z = float32_val(a); 1954 switch (status->float_rounding_mode) { 1955 case float_round_nearest_even: 1956 z += lastBitMask>>1; 1957 if ((z & roundBitsMask) == 0) { 1958 z &= ~lastBitMask; 1959 } 1960 break; 1961 case float_round_ties_away: 1962 z += lastBitMask >> 1; 1963 break; 1964 case float_round_to_zero: 1965 break; 1966 case float_round_up: 1967 if (!extractFloat32Sign(make_float32(z))) { 1968 z += roundBitsMask; 1969 } 1970 break; 1971 case float_round_down: 1972 if (extractFloat32Sign(make_float32(z))) { 1973 z += roundBitsMask; 1974 } 1975 break; 1976 default: 1977 abort(); 1978 } 1979 z &= ~ roundBitsMask; 1980 if (z != float32_val(a)) { 1981 status->float_exception_flags |= float_flag_inexact; 1982 } 1983 return make_float32(z); 1984 1985 } 1986 1987 /*---------------------------------------------------------------------------- 1988 | Returns the result of adding the absolute values of the single-precision 1989 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 1990 | before being returned. `zSign' is ignored if the result is a NaN. 1991 | The addition is performed according to the IEC/IEEE Standard for Binary 1992 | Floating-Point Arithmetic. 
1993 *----------------------------------------------------------------------------*/ 1994 1995 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, 1996 float_status *status) 1997 { 1998 int_fast16_t aExp, bExp, zExp; 1999 uint32_t aSig, bSig, zSig; 2000 int_fast16_t expDiff; 2001 2002 aSig = extractFloat32Frac( a ); 2003 aExp = extractFloat32Exp( a ); 2004 bSig = extractFloat32Frac( b ); 2005 bExp = extractFloat32Exp( b ); 2006 expDiff = aExp - bExp; 2007 aSig <<= 6; 2008 bSig <<= 6; 2009 if ( 0 < expDiff ) { 2010 if ( aExp == 0xFF ) { 2011 if (aSig) { 2012 return propagateFloat32NaN(a, b, status); 2013 } 2014 return a; 2015 } 2016 if ( bExp == 0 ) { 2017 --expDiff; 2018 } 2019 else { 2020 bSig |= 0x20000000; 2021 } 2022 shift32RightJamming( bSig, expDiff, &bSig ); 2023 zExp = aExp; 2024 } 2025 else if ( expDiff < 0 ) { 2026 if ( bExp == 0xFF ) { 2027 if (bSig) { 2028 return propagateFloat32NaN(a, b, status); 2029 } 2030 return packFloat32( zSign, 0xFF, 0 ); 2031 } 2032 if ( aExp == 0 ) { 2033 ++expDiff; 2034 } 2035 else { 2036 aSig |= 0x20000000; 2037 } 2038 shift32RightJamming( aSig, - expDiff, &aSig ); 2039 zExp = bExp; 2040 } 2041 else { 2042 if ( aExp == 0xFF ) { 2043 if (aSig | bSig) { 2044 return propagateFloat32NaN(a, b, status); 2045 } 2046 return a; 2047 } 2048 if ( aExp == 0 ) { 2049 if (status->flush_to_zero) { 2050 if (aSig | bSig) { 2051 float_raise(float_flag_output_denormal, status); 2052 } 2053 return packFloat32(zSign, 0, 0); 2054 } 2055 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 2056 } 2057 zSig = 0x40000000 + aSig + bSig; 2058 zExp = aExp; 2059 goto roundAndPack; 2060 } 2061 aSig |= 0x20000000; 2062 zSig = ( aSig + bSig )<<1; 2063 --zExp; 2064 if ( (int32_t) zSig < 0 ) { 2065 zSig = aSig + bSig; 2066 ++zExp; 2067 } 2068 roundAndPack: 2069 return roundAndPackFloat32(zSign, zExp, zSig, status); 2070 2071 } 2072 2073 /*---------------------------------------------------------------------------- 2074 | Returns the result of 
subtracting the absolute values of the single- 2075 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2076 | difference is negated before being returned. `zSign' is ignored if the 2077 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2078 | Standard for Binary Floating-Point Arithmetic. 2079 *----------------------------------------------------------------------------*/ 2080 2081 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, 2082 float_status *status) 2083 { 2084 int_fast16_t aExp, bExp, zExp; 2085 uint32_t aSig, bSig, zSig; 2086 int_fast16_t expDiff; 2087 2088 aSig = extractFloat32Frac( a ); 2089 aExp = extractFloat32Exp( a ); 2090 bSig = extractFloat32Frac( b ); 2091 bExp = extractFloat32Exp( b ); 2092 expDiff = aExp - bExp; 2093 aSig <<= 7; 2094 bSig <<= 7; 2095 if ( 0 < expDiff ) goto aExpBigger; 2096 if ( expDiff < 0 ) goto bExpBigger; 2097 if ( aExp == 0xFF ) { 2098 if (aSig | bSig) { 2099 return propagateFloat32NaN(a, b, status); 2100 } 2101 float_raise(float_flag_invalid, status); 2102 return float32_default_nan; 2103 } 2104 if ( aExp == 0 ) { 2105 aExp = 1; 2106 bExp = 1; 2107 } 2108 if ( bSig < aSig ) goto aBigger; 2109 if ( aSig < bSig ) goto bBigger; 2110 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); 2111 bExpBigger: 2112 if ( bExp == 0xFF ) { 2113 if (bSig) { 2114 return propagateFloat32NaN(a, b, status); 2115 } 2116 return packFloat32( zSign ^ 1, 0xFF, 0 ); 2117 } 2118 if ( aExp == 0 ) { 2119 ++expDiff; 2120 } 2121 else { 2122 aSig |= 0x40000000; 2123 } 2124 shift32RightJamming( aSig, - expDiff, &aSig ); 2125 bSig |= 0x40000000; 2126 bBigger: 2127 zSig = bSig - aSig; 2128 zExp = bExp; 2129 zSign ^= 1; 2130 goto normalizeRoundAndPack; 2131 aExpBigger: 2132 if ( aExp == 0xFF ) { 2133 if (aSig) { 2134 return propagateFloat32NaN(a, b, status); 2135 } 2136 return a; 2137 } 2138 if ( bExp == 0 ) { 2139 --expDiff; 2140 } 2141 else { 2142 bSig |= 0x40000000; 2143 } 2144 
shift32RightJamming( bSig, expDiff, &bSig ); 2145 aSig |= 0x40000000; 2146 aBigger: 2147 zSig = aSig - bSig; 2148 zExp = aExp; 2149 normalizeRoundAndPack: 2150 --zExp; 2151 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); 2152 2153 } 2154 2155 /*---------------------------------------------------------------------------- 2156 | Returns the result of adding the single-precision floating-point values `a' 2157 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2158 | Binary Floating-Point Arithmetic. 2159 *----------------------------------------------------------------------------*/ 2160 2161 float32 float32_add(float32 a, float32 b, float_status *status) 2162 { 2163 flag aSign, bSign; 2164 a = float32_squash_input_denormal(a, status); 2165 b = float32_squash_input_denormal(b, status); 2166 2167 aSign = extractFloat32Sign( a ); 2168 bSign = extractFloat32Sign( b ); 2169 if ( aSign == bSign ) { 2170 return addFloat32Sigs(a, b, aSign, status); 2171 } 2172 else { 2173 return subFloat32Sigs(a, b, aSign, status); 2174 } 2175 2176 } 2177 2178 /*---------------------------------------------------------------------------- 2179 | Returns the result of subtracting the single-precision floating-point values 2180 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2181 | for Binary Floating-Point Arithmetic. 
2182 *----------------------------------------------------------------------------*/ 2183 2184 float32 float32_sub(float32 a, float32 b, float_status *status) 2185 { 2186 flag aSign, bSign; 2187 a = float32_squash_input_denormal(a, status); 2188 b = float32_squash_input_denormal(b, status); 2189 2190 aSign = extractFloat32Sign( a ); 2191 bSign = extractFloat32Sign( b ); 2192 if ( aSign == bSign ) { 2193 return subFloat32Sigs(a, b, aSign, status); 2194 } 2195 else { 2196 return addFloat32Sigs(a, b, aSign, status); 2197 } 2198 2199 } 2200 2201 /*---------------------------------------------------------------------------- 2202 | Returns the result of multiplying the single-precision floating-point values 2203 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2204 | for Binary Floating-Point Arithmetic. 2205 *----------------------------------------------------------------------------*/ 2206 2207 float32 float32_mul(float32 a, float32 b, float_status *status) 2208 { 2209 flag aSign, bSign, zSign; 2210 int_fast16_t aExp, bExp, zExp; 2211 uint32_t aSig, bSig; 2212 uint64_t zSig64; 2213 uint32_t zSig; 2214 2215 a = float32_squash_input_denormal(a, status); 2216 b = float32_squash_input_denormal(b, status); 2217 2218 aSig = extractFloat32Frac( a ); 2219 aExp = extractFloat32Exp( a ); 2220 aSign = extractFloat32Sign( a ); 2221 bSig = extractFloat32Frac( b ); 2222 bExp = extractFloat32Exp( b ); 2223 bSign = extractFloat32Sign( b ); 2224 zSign = aSign ^ bSign; 2225 if ( aExp == 0xFF ) { 2226 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2227 return propagateFloat32NaN(a, b, status); 2228 } 2229 if ( ( bExp | bSig ) == 0 ) { 2230 float_raise(float_flag_invalid, status); 2231 return float32_default_nan; 2232 } 2233 return packFloat32( zSign, 0xFF, 0 ); 2234 } 2235 if ( bExp == 0xFF ) { 2236 if (bSig) { 2237 return propagateFloat32NaN(a, b, status); 2238 } 2239 if ( ( aExp | aSig ) == 0 ) { 2240 float_raise(float_flag_invalid, status); 2241 return 
float32_default_nan; 2242 } 2243 return packFloat32( zSign, 0xFF, 0 ); 2244 } 2245 if ( aExp == 0 ) { 2246 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2247 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2248 } 2249 if ( bExp == 0 ) { 2250 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2251 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2252 } 2253 zExp = aExp + bExp - 0x7F; 2254 aSig = ( aSig | 0x00800000 )<<7; 2255 bSig = ( bSig | 0x00800000 )<<8; 2256 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2257 zSig = zSig64; 2258 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2259 zSig <<= 1; 2260 --zExp; 2261 } 2262 return roundAndPackFloat32(zSign, zExp, zSig, status); 2263 2264 } 2265 2266 /*---------------------------------------------------------------------------- 2267 | Returns the result of dividing the single-precision floating-point value `a' 2268 | by the corresponding value `b'. The operation is performed according to the 2269 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2270 *----------------------------------------------------------------------------*/ 2271 2272 float32 float32_div(float32 a, float32 b, float_status *status) 2273 { 2274 flag aSign, bSign, zSign; 2275 int_fast16_t aExp, bExp, zExp; 2276 uint32_t aSig, bSig, zSig; 2277 a = float32_squash_input_denormal(a, status); 2278 b = float32_squash_input_denormal(b, status); 2279 2280 aSig = extractFloat32Frac( a ); 2281 aExp = extractFloat32Exp( a ); 2282 aSign = extractFloat32Sign( a ); 2283 bSig = extractFloat32Frac( b ); 2284 bExp = extractFloat32Exp( b ); 2285 bSign = extractFloat32Sign( b ); 2286 zSign = aSign ^ bSign; 2287 if ( aExp == 0xFF ) { 2288 if (aSig) { 2289 return propagateFloat32NaN(a, b, status); 2290 } 2291 if ( bExp == 0xFF ) { 2292 if (bSig) { 2293 return propagateFloat32NaN(a, b, status); 2294 } 2295 float_raise(float_flag_invalid, status); 2296 return float32_default_nan; 2297 } 2298 return packFloat32( zSign, 0xFF, 0 ); 2299 } 2300 if ( bExp == 0xFF ) { 2301 if (bSig) { 2302 return propagateFloat32NaN(a, b, status); 2303 } 2304 return packFloat32( zSign, 0, 0 ); 2305 } 2306 if ( bExp == 0 ) { 2307 if ( bSig == 0 ) { 2308 if ( ( aExp | aSig ) == 0 ) { 2309 float_raise(float_flag_invalid, status); 2310 return float32_default_nan; 2311 } 2312 float_raise(float_flag_divbyzero, status); 2313 return packFloat32( zSign, 0xFF, 0 ); 2314 } 2315 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2316 } 2317 if ( aExp == 0 ) { 2318 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2319 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2320 } 2321 zExp = aExp - bExp + 0x7D; 2322 aSig = ( aSig | 0x00800000 )<<7; 2323 bSig = ( bSig | 0x00800000 )<<8; 2324 if ( bSig <= ( aSig + aSig ) ) { 2325 aSig >>= 1; 2326 ++zExp; 2327 } 2328 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2329 if ( ( zSig & 0x3F ) == 0 ) { 2330 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2331 } 2332 return roundAndPackFloat32(zSign, zExp, zSig, status); 2333 2334 } 2335 2336 
/*---------------------------------------------------------------------------- 2337 | Returns the remainder of the single-precision floating-point value `a' 2338 | with respect to the corresponding value `b'. The operation is performed 2339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2340 *----------------------------------------------------------------------------*/ 2341 2342 float32 float32_rem(float32 a, float32 b, float_status *status) 2343 { 2344 flag aSign, zSign; 2345 int_fast16_t aExp, bExp, expDiff; 2346 uint32_t aSig, bSig; 2347 uint32_t q; 2348 uint64_t aSig64, bSig64, q64; 2349 uint32_t alternateASig; 2350 int32_t sigMean; 2351 a = float32_squash_input_denormal(a, status); 2352 b = float32_squash_input_denormal(b, status); 2353 2354 aSig = extractFloat32Frac( a ); 2355 aExp = extractFloat32Exp( a ); 2356 aSign = extractFloat32Sign( a ); 2357 bSig = extractFloat32Frac( b ); 2358 bExp = extractFloat32Exp( b ); 2359 if ( aExp == 0xFF ) { 2360 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2361 return propagateFloat32NaN(a, b, status); 2362 } 2363 float_raise(float_flag_invalid, status); 2364 return float32_default_nan; 2365 } 2366 if ( bExp == 0xFF ) { 2367 if (bSig) { 2368 return propagateFloat32NaN(a, b, status); 2369 } 2370 return a; 2371 } 2372 if ( bExp == 0 ) { 2373 if ( bSig == 0 ) { 2374 float_raise(float_flag_invalid, status); 2375 return float32_default_nan; 2376 } 2377 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2378 } 2379 if ( aExp == 0 ) { 2380 if ( aSig == 0 ) return a; 2381 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2382 } 2383 expDiff = aExp - bExp; 2384 aSig |= 0x00800000; 2385 bSig |= 0x00800000; 2386 if ( expDiff < 32 ) { 2387 aSig <<= 8; 2388 bSig <<= 8; 2389 if ( expDiff < 0 ) { 2390 if ( expDiff < -1 ) return a; 2391 aSig >>= 1; 2392 } 2393 q = ( bSig <= aSig ); 2394 if ( q ) aSig -= bSig; 2395 if ( 0 < expDiff ) { 2396 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2397 q >>= 32 - expDiff; 2398 bSig 
>>= 2; 2399 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2400 } 2401 else { 2402 aSig >>= 2; 2403 bSig >>= 2; 2404 } 2405 } 2406 else { 2407 if ( bSig <= aSig ) aSig -= bSig; 2408 aSig64 = ( (uint64_t) aSig )<<40; 2409 bSig64 = ( (uint64_t) bSig )<<40; 2410 expDiff -= 64; 2411 while ( 0 < expDiff ) { 2412 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2413 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2414 aSig64 = - ( ( bSig * q64 )<<38 ); 2415 expDiff -= 62; 2416 } 2417 expDiff += 64; 2418 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2419 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2420 q = q64>>( 64 - expDiff ); 2421 bSig <<= 6; 2422 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2423 } 2424 do { 2425 alternateASig = aSig; 2426 ++q; 2427 aSig -= bSig; 2428 } while ( 0 <= (int32_t) aSig ); 2429 sigMean = aSig + alternateASig; 2430 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2431 aSig = alternateASig; 2432 } 2433 zSign = ( (int32_t) aSig < 0 ); 2434 if ( zSign ) aSig = - aSig; 2435 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 2436 } 2437 2438 /*---------------------------------------------------------------------------- 2439 | Returns the result of multiplying the single-precision floating-point values 2440 | `a' and `b' then adding 'c', with no intermediate rounding step after the 2441 | multiplication. The operation is performed according to the IEC/IEEE 2442 | Standard for Binary Floating-Point Arithmetic 754-2008. 2443 | The flags argument allows the caller to select negation of the 2444 | addend, the intermediate product, or the final result. (The difference 2445 | between this and having the caller do a separate negation is that negating 2446 | externally will flip the sign bit on NaNs.) 
2447 *----------------------------------------------------------------------------*/ 2448 2449 float32 float32_muladd(float32 a, float32 b, float32 c, int flags, 2450 float_status *status) 2451 { 2452 flag aSign, bSign, cSign, zSign; 2453 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff; 2454 uint32_t aSig, bSig, cSig; 2455 flag pInf, pZero, pSign; 2456 uint64_t pSig64, cSig64, zSig64; 2457 uint32_t pSig; 2458 int shiftcount; 2459 flag signflip, infzero; 2460 2461 a = float32_squash_input_denormal(a, status); 2462 b = float32_squash_input_denormal(b, status); 2463 c = float32_squash_input_denormal(c, status); 2464 aSig = extractFloat32Frac(a); 2465 aExp = extractFloat32Exp(a); 2466 aSign = extractFloat32Sign(a); 2467 bSig = extractFloat32Frac(b); 2468 bExp = extractFloat32Exp(b); 2469 bSign = extractFloat32Sign(b); 2470 cSig = extractFloat32Frac(c); 2471 cExp = extractFloat32Exp(c); 2472 cSign = extractFloat32Sign(c); 2473 2474 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || 2475 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); 2476 2477 /* It is implementation-defined whether the cases of (0,inf,qnan) 2478 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 2479 * they return if they do), so we have to hand this information 2480 * off to the target-specific pick-a-NaN routine. 2481 */ 2482 if (((aExp == 0xff) && aSig) || 2483 ((bExp == 0xff) && bSig) || 2484 ((cExp == 0xff) && cSig)) { 2485 return propagateFloat32MulAddNaN(a, b, c, infzero, status); 2486 } 2487 2488 if (infzero) { 2489 float_raise(float_flag_invalid, status); 2490 return float32_default_nan; 2491 } 2492 2493 if (flags & float_muladd_negate_c) { 2494 cSign ^= 1; 2495 } 2496 2497 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 2498 2499 /* Work out the sign and type of the product */ 2500 pSign = aSign ^ bSign; 2501 if (flags & float_muladd_negate_product) { 2502 pSign ^= 1; 2503 } 2504 pInf = (aExp == 0xff) || (bExp == 0xff); 2505 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 2506 2507 if (cExp == 0xff) { 2508 if (pInf && (pSign ^ cSign)) { 2509 /* addition of opposite-signed infinities => InvalidOperation */ 2510 float_raise(float_flag_invalid, status); 2511 return float32_default_nan; 2512 } 2513 /* Otherwise generate an infinity of the same sign */ 2514 return packFloat32(cSign ^ signflip, 0xff, 0); 2515 } 2516 2517 if (pInf) { 2518 return packFloat32(pSign ^ signflip, 0xff, 0); 2519 } 2520 2521 if (pZero) { 2522 if (cExp == 0) { 2523 if (cSig == 0) { 2524 /* Adding two exact zeroes */ 2525 if (pSign == cSign) { 2526 zSign = pSign; 2527 } else if (status->float_rounding_mode == float_round_down) { 2528 zSign = 1; 2529 } else { 2530 zSign = 0; 2531 } 2532 return packFloat32(zSign ^ signflip, 0, 0); 2533 } 2534 /* Exact zero plus a denorm */ 2535 if (status->flush_to_zero) { 2536 float_raise(float_flag_output_denormal, status); 2537 return packFloat32(cSign ^ signflip, 0, 0); 2538 } 2539 } 2540 /* Zero plus something non-zero : just return the something */ 2541 if (flags & float_muladd_halve_result) { 2542 if (cExp == 0) { 2543 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2544 } 2545 /* Subtract one to halve, and one again because roundAndPackFloat32 2546 * wants one less than the true exponent. 2547 */ 2548 cExp -= 2; 2549 cSig = (cSig | 0x00800000) << 7; 2550 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status); 2551 } 2552 return packFloat32(cSign ^ signflip, cExp, cSig); 2553 } 2554 2555 if (aExp == 0) { 2556 normalizeFloat32Subnormal(aSig, &aExp, &aSig); 2557 } 2558 if (bExp == 0) { 2559 normalizeFloat32Subnormal(bSig, &bExp, &bSig); 2560 } 2561 2562 /* Calculate the actual result a * b + c */ 2563 2564 /* Multiply first; this is easy. 
*/ 2565 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f 2566 * because we want the true exponent, not the "one-less-than" 2567 * flavour that roundAndPackFloat32() takes. 2568 */ 2569 pExp = aExp + bExp - 0x7e; 2570 aSig = (aSig | 0x00800000) << 7; 2571 bSig = (bSig | 0x00800000) << 8; 2572 pSig64 = (uint64_t)aSig * bSig; 2573 if ((int64_t)(pSig64 << 1) >= 0) { 2574 pSig64 <<= 1; 2575 pExp--; 2576 } 2577 2578 zSign = pSign ^ signflip; 2579 2580 /* Now pSig64 is the significand of the multiply, with the explicit bit in 2581 * position 62. 2582 */ 2583 if (cExp == 0) { 2584 if (!cSig) { 2585 /* Throw out the special case of c being an exact zero now */ 2586 shift64RightJamming(pSig64, 32, &pSig64); 2587 pSig = pSig64; 2588 if (flags & float_muladd_halve_result) { 2589 pExp--; 2590 } 2591 return roundAndPackFloat32(zSign, pExp - 1, 2592 pSig, status); 2593 } 2594 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2595 } 2596 2597 cSig64 = (uint64_t)cSig << (62 - 23); 2598 cSig64 |= LIT64(0x4000000000000000); 2599 expDiff = pExp - cExp; 2600 2601 if (pSign == cSign) { 2602 /* Addition */ 2603 if (expDiff > 0) { 2604 /* scale c to match p */ 2605 shift64RightJamming(cSig64, expDiff, &cSig64); 2606 zExp = pExp; 2607 } else if (expDiff < 0) { 2608 /* scale p to match c */ 2609 shift64RightJamming(pSig64, -expDiff, &pSig64); 2610 zExp = cExp; 2611 } else { 2612 /* no scaling needed */ 2613 zExp = cExp; 2614 } 2615 /* Add significands and make sure explicit bit ends up in posn 62 */ 2616 zSig64 = pSig64 + cSig64; 2617 if ((int64_t)zSig64 < 0) { 2618 shift64RightJamming(zSig64, 1, &zSig64); 2619 } else { 2620 zExp--; 2621 } 2622 } else { 2623 /* Subtraction */ 2624 if (expDiff > 0) { 2625 shift64RightJamming(cSig64, expDiff, &cSig64); 2626 zSig64 = pSig64 - cSig64; 2627 zExp = pExp; 2628 } else if (expDiff < 0) { 2629 shift64RightJamming(pSig64, -expDiff, &pSig64); 2630 zSig64 = cSig64 - pSig64; 2631 zExp = cExp; 2632 zSign ^= 1; 2633 } else { 2634 zExp = pExp; 2635 
if (cSig64 < pSig64) { 2636 zSig64 = pSig64 - cSig64; 2637 } else if (pSig64 < cSig64) { 2638 zSig64 = cSig64 - pSig64; 2639 zSign ^= 1; 2640 } else { 2641 /* Exact zero */ 2642 zSign = signflip; 2643 if (status->float_rounding_mode == float_round_down) { 2644 zSign ^= 1; 2645 } 2646 return packFloat32(zSign, 0, 0); 2647 } 2648 } 2649 --zExp; 2650 /* Normalize to put the explicit bit back into bit 62. */ 2651 shiftcount = countLeadingZeros64(zSig64) - 1; 2652 zSig64 <<= shiftcount; 2653 zExp -= shiftcount; 2654 } 2655 if (flags & float_muladd_halve_result) { 2656 zExp--; 2657 } 2658 2659 shift64RightJamming(zSig64, 32, &zSig64); 2660 return roundAndPackFloat32(zSign, zExp, zSig64, status); 2661 } 2662 2663 2664 /*---------------------------------------------------------------------------- 2665 | Returns the square root of the single-precision floating-point value `a'. 2666 | The operation is performed according to the IEC/IEEE Standard for Binary 2667 | Floating-Point Arithmetic. 2668 *----------------------------------------------------------------------------*/ 2669 2670 float32 float32_sqrt(float32 a, float_status *status) 2671 { 2672 flag aSign; 2673 int_fast16_t aExp, zExp; 2674 uint32_t aSig, zSig; 2675 uint64_t rem, term; 2676 a = float32_squash_input_denormal(a, status); 2677 2678 aSig = extractFloat32Frac( a ); 2679 aExp = extractFloat32Exp( a ); 2680 aSign = extractFloat32Sign( a ); 2681 if ( aExp == 0xFF ) { 2682 if (aSig) { 2683 return propagateFloat32NaN(a, float32_zero, status); 2684 } 2685 if ( ! 
aSign ) return a; 2686 float_raise(float_flag_invalid, status); 2687 return float32_default_nan; 2688 } 2689 if ( aSign ) { 2690 if ( ( aExp | aSig ) == 0 ) return a; 2691 float_raise(float_flag_invalid, status); 2692 return float32_default_nan; 2693 } 2694 if ( aExp == 0 ) { 2695 if ( aSig == 0 ) return float32_zero; 2696 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2697 } 2698 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2699 aSig = ( aSig | 0x00800000 )<<8; 2700 zSig = estimateSqrt32( aExp, aSig ) + 2; 2701 if ( ( zSig & 0x7F ) <= 5 ) { 2702 if ( zSig < 2 ) { 2703 zSig = 0x7FFFFFFF; 2704 goto roundAndPack; 2705 } 2706 aSig >>= aExp & 1; 2707 term = ( (uint64_t) zSig ) * zSig; 2708 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2709 while ( (int64_t) rem < 0 ) { 2710 --zSig; 2711 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2712 } 2713 zSig |= ( rem != 0 ); 2714 } 2715 shift32RightJamming( zSig, 1, &zSig ); 2716 roundAndPack: 2717 return roundAndPackFloat32(0, zExp, zSig, status); 2718 2719 } 2720 2721 /*---------------------------------------------------------------------------- 2722 | Returns the binary exponential of the single-precision floating-point value 2723 | `a'. The operation is performed according to the IEC/IEEE Standard for 2724 | Binary Floating-Point Arithmetic. 2725 | 2726 | Uses the following identities: 2727 | 2728 | 1. ------------------------------------------------------------------------- 2729 | x x*ln(2) 2730 | 2 = e 2731 | 2732 | 2. ------------------------------------------------------------------------- 2733 | 2 3 4 5 n 2734 | x x x x x x x 2735 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2736 | 1! 2! 3! 4! 5! n! 
2737 *----------------------------------------------------------------------------*/ 2738 2739 static const float64 float32_exp2_coefficients[15] = 2740 { 2741 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2742 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2743 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2744 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2745 const_float64( 0x3f81111111111111ll ), /* 5 */ 2746 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2747 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2748 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2749 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2750 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2751 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2752 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2753 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2754 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2755 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2756 }; 2757 2758 float32 float32_exp2(float32 a, float_status *status) 2759 { 2760 flag aSign; 2761 int_fast16_t aExp; 2762 uint32_t aSig; 2763 float64 r, x, xn; 2764 int i; 2765 a = float32_squash_input_denormal(a, status); 2766 2767 aSig = extractFloat32Frac( a ); 2768 aExp = extractFloat32Exp( a ); 2769 aSign = extractFloat32Sign( a ); 2770 2771 if ( aExp == 0xFF) { 2772 if (aSig) { 2773 return propagateFloat32NaN(a, float32_zero, status); 2774 } 2775 return (aSign) ? 
float32_zero : a; 2776 } 2777 if (aExp == 0) { 2778 if (aSig == 0) return float32_one; 2779 } 2780 2781 float_raise(float_flag_inexact, status); 2782 2783 /* ******************************* */ 2784 /* using float64 for approximation */ 2785 /* ******************************* */ 2786 x = float32_to_float64(a, status); 2787 x = float64_mul(x, float64_ln2, status); 2788 2789 xn = x; 2790 r = float64_one; 2791 for (i = 0 ; i < 15 ; i++) { 2792 float64 f; 2793 2794 f = float64_mul(xn, float32_exp2_coefficients[i], status); 2795 r = float64_add(r, f, status); 2796 2797 xn = float64_mul(xn, x, status); 2798 } 2799 2800 return float64_to_float32(r, status); 2801 } 2802 2803 /*---------------------------------------------------------------------------- 2804 | Returns the binary log of the single-precision floating-point value `a'. 2805 | The operation is performed according to the IEC/IEEE Standard for Binary 2806 | Floating-Point Arithmetic. 2807 *----------------------------------------------------------------------------*/ 2808 float32 float32_log2(float32 a, float_status *status) 2809 { 2810 flag aSign, zSign; 2811 int_fast16_t aExp; 2812 uint32_t aSig, zSig, i; 2813 2814 a = float32_squash_input_denormal(a, status); 2815 aSig = extractFloat32Frac( a ); 2816 aExp = extractFloat32Exp( a ); 2817 aSign = extractFloat32Sign( a ); 2818 2819 if ( aExp == 0 ) { 2820 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2821 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2822 } 2823 if ( aSign ) { 2824 float_raise(float_flag_invalid, status); 2825 return float32_default_nan; 2826 } 2827 if ( aExp == 0xFF ) { 2828 if (aSig) { 2829 return propagateFloat32NaN(a, float32_zero, status); 2830 } 2831 return a; 2832 } 2833 2834 aExp -= 0x7F; 2835 aSig |= 0x00800000; 2836 zSign = aExp < 0; 2837 zSig = aExp << 23; 2838 2839 for (i = 1 << 22; i > 0; i >>= 1) { 2840 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2841 if ( aSig & 0x01000000 ) { 2842 aSig >>= 1; 2843 zSig |= i; 2844 } 2845 } 2846 
2847 if ( zSign ) 2848 zSig = -zSig; 2849 2850 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 2851 } 2852 2853 /*---------------------------------------------------------------------------- 2854 | Returns 1 if the single-precision floating-point value `a' is equal to 2855 | the corresponding value `b', and 0 otherwise. The invalid exception is 2856 | raised if either operand is a NaN. Otherwise, the comparison is performed 2857 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2858 *----------------------------------------------------------------------------*/ 2859 2860 int float32_eq(float32 a, float32 b, float_status *status) 2861 { 2862 uint32_t av, bv; 2863 a = float32_squash_input_denormal(a, status); 2864 b = float32_squash_input_denormal(b, status); 2865 2866 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2867 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2868 ) { 2869 float_raise(float_flag_invalid, status); 2870 return 0; 2871 } 2872 av = float32_val(a); 2873 bv = float32_val(b); 2874 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2875 } 2876 2877 /*---------------------------------------------------------------------------- 2878 | Returns 1 if the single-precision floating-point value `a' is less than 2879 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2880 | exception is raised if either operand is a NaN. The comparison is performed 2881 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2882 *----------------------------------------------------------------------------*/ 2883 2884 int float32_le(float32 a, float32 b, float_status *status) 2885 { 2886 flag aSign, bSign; 2887 uint32_t av, bv; 2888 a = float32_squash_input_denormal(a, status); 2889 b = float32_squash_input_denormal(b, status); 2890 2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2893 ) { 2894 float_raise(float_flag_invalid, status); 2895 return 0; 2896 } 2897 aSign = extractFloat32Sign( a ); 2898 bSign = extractFloat32Sign( b ); 2899 av = float32_val(a); 2900 bv = float32_val(b); 2901 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2902 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2903 2904 } 2905 2906 /*---------------------------------------------------------------------------- 2907 | Returns 1 if the single-precision floating-point value `a' is less than 2908 | the corresponding value `b', and 0 otherwise. The invalid exception is 2909 | raised if either operand is a NaN. The comparison is performed according 2910 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2911 *----------------------------------------------------------------------------*/ 2912 2913 int float32_lt(float32 a, float32 b, float_status *status) 2914 { 2915 flag aSign, bSign; 2916 uint32_t av, bv; 2917 a = float32_squash_input_denormal(a, status); 2918 b = float32_squash_input_denormal(b, status); 2919 2920 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2921 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2922 ) { 2923 float_raise(float_flag_invalid, status); 2924 return 0; 2925 } 2926 aSign = extractFloat32Sign( a ); 2927 bSign = extractFloat32Sign( b ); 2928 av = float32_val(a); 2929 bv = float32_val(b); 2930 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2931 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2932 2933 } 2934 2935 /*---------------------------------------------------------------------------- 2936 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2937 | be compared, and 0 otherwise. The invalid exception is raised if either 2938 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2939 | Standard for Binary Floating-Point Arithmetic. 2940 *----------------------------------------------------------------------------*/ 2941 2942 int float32_unordered(float32 a, float32 b, float_status *status) 2943 { 2944 a = float32_squash_input_denormal(a, status); 2945 b = float32_squash_input_denormal(b, status); 2946 2947 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2948 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2949 ) { 2950 float_raise(float_flag_invalid, status); 2951 return 1; 2952 } 2953 return 0; 2954 } 2955 2956 /*---------------------------------------------------------------------------- 2957 | Returns 1 if the single-precision floating-point value `a' is equal to 2958 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2959 | exception. 
The comparison is performed according to the IEC/IEEE Standard 2960 | for Binary Floating-Point Arithmetic. 2961 *----------------------------------------------------------------------------*/ 2962 2963 int float32_eq_quiet(float32 a, float32 b, float_status *status) 2964 { 2965 a = float32_squash_input_denormal(a, status); 2966 b = float32_squash_input_denormal(b, status); 2967 2968 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2969 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2970 ) { 2971 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2972 float_raise(float_flag_invalid, status); 2973 } 2974 return 0; 2975 } 2976 return ( float32_val(a) == float32_val(b) ) || 2977 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2978 } 2979 2980 /*---------------------------------------------------------------------------- 2981 | Returns 1 if the single-precision floating-point value `a' is less than or 2982 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2983 | cause an exception. Otherwise, the comparison is performed according to the 2984 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2985 *----------------------------------------------------------------------------*/ 2986 2987 int float32_le_quiet(float32 a, float32 b, float_status *status) 2988 { 2989 flag aSign, bSign; 2990 uint32_t av, bv; 2991 a = float32_squash_input_denormal(a, status); 2992 b = float32_squash_input_denormal(b, status); 2993 2994 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2995 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2996 ) { 2997 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2998 float_raise(float_flag_invalid, status); 2999 } 3000 return 0; 3001 } 3002 aSign = extractFloat32Sign( a ); 3003 bSign = extractFloat32Sign( b ); 3004 av = float32_val(a); 3005 bv = float32_val(b); 3006 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3007 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3008 3009 } 3010 3011 /*---------------------------------------------------------------------------- 3012 | Returns 1 if the single-precision floating-point value `a' is less than 3013 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3014 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3015 | Standard for Binary Floating-Point Arithmetic. 
3016 *----------------------------------------------------------------------------*/ 3017 3018 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3019 { 3020 flag aSign, bSign; 3021 uint32_t av, bv; 3022 a = float32_squash_input_denormal(a, status); 3023 b = float32_squash_input_denormal(b, status); 3024 3025 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3026 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3027 ) { 3028 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 3029 float_raise(float_flag_invalid, status); 3030 } 3031 return 0; 3032 } 3033 aSign = extractFloat32Sign( a ); 3034 bSign = extractFloat32Sign( b ); 3035 av = float32_val(a); 3036 bv = float32_val(b); 3037 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3038 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3039 3040 } 3041 3042 /*---------------------------------------------------------------------------- 3043 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3044 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3045 | comparison is performed according to the IEC/IEEE Standard for Binary 3046 | Floating-Point Arithmetic. 
3047 *----------------------------------------------------------------------------*/ 3048 3049 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3050 { 3051 a = float32_squash_input_denormal(a, status); 3052 b = float32_squash_input_denormal(b, status); 3053 3054 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3055 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3056 ) { 3057 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 3058 float_raise(float_flag_invalid, status); 3059 } 3060 return 1; 3061 } 3062 return 0; 3063 } 3064 3065 /*---------------------------------------------------------------------------- 3066 | Returns the result of converting the double-precision floating-point value 3067 | `a' to the 32-bit two's complement integer format. The conversion is 3068 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3069 | Arithmetic---which means in particular that the conversion is rounded 3070 | according to the current rounding mode. If `a' is a NaN, the largest 3071 | positive integer is returned. Otherwise, if the conversion overflows, the 3072 | largest integer with the same sign as `a' is returned. 
3073 *----------------------------------------------------------------------------*/ 3074 3075 int32_t float64_to_int32(float64 a, float_status *status) 3076 { 3077 flag aSign; 3078 int_fast16_t aExp, shiftCount; 3079 uint64_t aSig; 3080 a = float64_squash_input_denormal(a, status); 3081 3082 aSig = extractFloat64Frac( a ); 3083 aExp = extractFloat64Exp( a ); 3084 aSign = extractFloat64Sign( a ); 3085 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3086 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3087 shiftCount = 0x42C - aExp; 3088 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3089 return roundAndPackInt32(aSign, aSig, status); 3090 3091 } 3092 3093 /*---------------------------------------------------------------------------- 3094 | Returns the result of converting the double-precision floating-point value 3095 | `a' to the 32-bit two's complement integer format. The conversion is 3096 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3097 | Arithmetic, except that the conversion is always rounded toward zero. 3098 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3099 | the conversion overflows, the largest integer with the same sign as `a' is 3100 | returned. 
3101 *----------------------------------------------------------------------------*/ 3102 3103 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3104 { 3105 flag aSign; 3106 int_fast16_t aExp, shiftCount; 3107 uint64_t aSig, savedASig; 3108 int32_t z; 3109 a = float64_squash_input_denormal(a, status); 3110 3111 aSig = extractFloat64Frac( a ); 3112 aExp = extractFloat64Exp( a ); 3113 aSign = extractFloat64Sign( a ); 3114 if ( 0x41E < aExp ) { 3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3116 goto invalid; 3117 } 3118 else if ( aExp < 0x3FF ) { 3119 if (aExp || aSig) { 3120 status->float_exception_flags |= float_flag_inexact; 3121 } 3122 return 0; 3123 } 3124 aSig |= LIT64( 0x0010000000000000 ); 3125 shiftCount = 0x433 - aExp; 3126 savedASig = aSig; 3127 aSig >>= shiftCount; 3128 z = aSig; 3129 if ( aSign ) z = - z; 3130 if ( ( z < 0 ) ^ aSign ) { 3131 invalid: 3132 float_raise(float_flag_invalid, status); 3133 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3134 } 3135 if ( ( aSig<<shiftCount ) != savedASig ) { 3136 status->float_exception_flags |= float_flag_inexact; 3137 } 3138 return z; 3139 3140 } 3141 3142 /*---------------------------------------------------------------------------- 3143 | Returns the result of converting the double-precision floating-point value 3144 | `a' to the 16-bit two's complement integer format. The conversion is 3145 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3146 | Arithmetic, except that the conversion is always rounded toward zero. 3147 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3148 | the conversion overflows, the largest integer with the same sign as `a' is 3149 | returned. 
3150 *----------------------------------------------------------------------------*/ 3151 3152 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3153 { 3154 flag aSign; 3155 int_fast16_t aExp, shiftCount; 3156 uint64_t aSig, savedASig; 3157 int32_t z; 3158 3159 aSig = extractFloat64Frac( a ); 3160 aExp = extractFloat64Exp( a ); 3161 aSign = extractFloat64Sign( a ); 3162 if ( 0x40E < aExp ) { 3163 if ( ( aExp == 0x7FF ) && aSig ) { 3164 aSign = 0; 3165 } 3166 goto invalid; 3167 } 3168 else if ( aExp < 0x3FF ) { 3169 if ( aExp || aSig ) { 3170 status->float_exception_flags |= float_flag_inexact; 3171 } 3172 return 0; 3173 } 3174 aSig |= LIT64( 0x0010000000000000 ); 3175 shiftCount = 0x433 - aExp; 3176 savedASig = aSig; 3177 aSig >>= shiftCount; 3178 z = aSig; 3179 if ( aSign ) { 3180 z = - z; 3181 } 3182 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3183 invalid: 3184 float_raise(float_flag_invalid, status); 3185 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3186 } 3187 if ( ( aSig<<shiftCount ) != savedASig ) { 3188 status->float_exception_flags |= float_flag_inexact; 3189 } 3190 return z; 3191 } 3192 3193 /*---------------------------------------------------------------------------- 3194 | Returns the result of converting the double-precision floating-point value 3195 | `a' to the 64-bit two's complement integer format. The conversion is 3196 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3197 | Arithmetic---which means in particular that the conversion is rounded 3198 | according to the current rounding mode. If `a' is a NaN, the largest 3199 | positive integer is returned. Otherwise, if the conversion overflows, the 3200 | largest integer with the same sign as `a' is returned. 
3201 *----------------------------------------------------------------------------*/ 3202 3203 int64_t float64_to_int64(float64 a, float_status *status) 3204 { 3205 flag aSign; 3206 int_fast16_t aExp, shiftCount; 3207 uint64_t aSig, aSigExtra; 3208 a = float64_squash_input_denormal(a, status); 3209 3210 aSig = extractFloat64Frac( a ); 3211 aExp = extractFloat64Exp( a ); 3212 aSign = extractFloat64Sign( a ); 3213 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3214 shiftCount = 0x433 - aExp; 3215 if ( shiftCount <= 0 ) { 3216 if ( 0x43E < aExp ) { 3217 float_raise(float_flag_invalid, status); 3218 if ( ! aSign 3219 || ( ( aExp == 0x7FF ) 3220 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3221 ) { 3222 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3223 } 3224 return (int64_t) LIT64( 0x8000000000000000 ); 3225 } 3226 aSigExtra = 0; 3227 aSig <<= - shiftCount; 3228 } 3229 else { 3230 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3231 } 3232 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3233 3234 } 3235 3236 /*---------------------------------------------------------------------------- 3237 | Returns the result of converting the double-precision floating-point value 3238 | `a' to the 64-bit two's complement integer format. The conversion is 3239 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3240 | Arithmetic, except that the conversion is always rounded toward zero. 3241 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3242 | the conversion overflows, the largest integer with the same sign as `a' is 3243 | returned. 
3244 *----------------------------------------------------------------------------*/ 3245 3246 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3247 { 3248 flag aSign; 3249 int_fast16_t aExp, shiftCount; 3250 uint64_t aSig; 3251 int64_t z; 3252 a = float64_squash_input_denormal(a, status); 3253 3254 aSig = extractFloat64Frac( a ); 3255 aExp = extractFloat64Exp( a ); 3256 aSign = extractFloat64Sign( a ); 3257 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3258 shiftCount = aExp - 0x433; 3259 if ( 0 <= shiftCount ) { 3260 if ( 0x43E <= aExp ) { 3261 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3262 float_raise(float_flag_invalid, status); 3263 if ( ! aSign 3264 || ( ( aExp == 0x7FF ) 3265 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3266 ) { 3267 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3268 } 3269 } 3270 return (int64_t) LIT64( 0x8000000000000000 ); 3271 } 3272 z = aSig<<shiftCount; 3273 } 3274 else { 3275 if ( aExp < 0x3FE ) { 3276 if (aExp | aSig) { 3277 status->float_exception_flags |= float_flag_inexact; 3278 } 3279 return 0; 3280 } 3281 z = aSig>>( - shiftCount ); 3282 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3283 status->float_exception_flags |= float_flag_inexact; 3284 } 3285 } 3286 if ( aSign ) z = - z; 3287 return z; 3288 3289 } 3290 3291 /*---------------------------------------------------------------------------- 3292 | Returns the result of converting the double-precision floating-point value 3293 | `a' to the single-precision floating-point format. The conversion is 3294 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3295 | Arithmetic. 
3296 *----------------------------------------------------------------------------*/ 3297 3298 float32 float64_to_float32(float64 a, float_status *status) 3299 { 3300 flag aSign; 3301 int_fast16_t aExp; 3302 uint64_t aSig; 3303 uint32_t zSig; 3304 a = float64_squash_input_denormal(a, status); 3305 3306 aSig = extractFloat64Frac( a ); 3307 aExp = extractFloat64Exp( a ); 3308 aSign = extractFloat64Sign( a ); 3309 if ( aExp == 0x7FF ) { 3310 if (aSig) { 3311 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3312 } 3313 return packFloat32( aSign, 0xFF, 0 ); 3314 } 3315 shift64RightJamming( aSig, 22, &aSig ); 3316 zSig = aSig; 3317 if ( aExp || zSig ) { 3318 zSig |= 0x40000000; 3319 aExp -= 0x381; 3320 } 3321 return roundAndPackFloat32(aSign, aExp, zSig, status); 3322 3323 } 3324 3325 3326 /*---------------------------------------------------------------------------- 3327 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3328 | half-precision floating-point value, returning the result. After being 3329 | shifted into the proper positions, the three fields are simply added 3330 | together to form the result. This means that any integer portion of `zSig' 3331 | will be added into the exponent. Since a properly normalized significand 3332 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3333 | than the desired result exponent whenever `zSig' is a complete, normalized 3334 | significand. 
3335 *----------------------------------------------------------------------------*/ 3336 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig) 3337 { 3338 return make_float16( 3339 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3340 } 3341 3342 /*---------------------------------------------------------------------------- 3343 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3344 | and significand `zSig', and returns the proper half-precision floating- 3345 | point value corresponding to the abstract input. Ordinarily, the abstract 3346 | value is simply rounded and packed into the half-precision format, with 3347 | the inexact exception raised if the abstract input cannot be represented 3348 | exactly. However, if the abstract value is too large, the overflow and 3349 | inexact exceptions are raised and an infinity or maximal finite value is 3350 | returned. If the abstract value is too small, the input value is rounded to 3351 | a subnormal number, and the underflow and inexact exceptions are raised if 3352 | the abstract input cannot be represented exactly as a subnormal half- 3353 | precision floating-point number. 3354 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3355 | ARM-style "alternative representation", which omits the NaN and Inf 3356 | encodings in order to raise the maximum representable exponent by one. 3357 | The input significand `zSig' has its binary point between bits 22 3358 | and 23, which is 13 bits to the left of the usual location. This shifted 3359 | significand must be normalized or smaller. If `zSig' is not normalized, 3360 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3361 | and it must not require rounding. In the usual case that `zSig' is 3362 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3363 | Note the slightly odd position of the binary point in zSig compared with the 3364 | other roundAndPackFloat functions. This should probably be fixed if we 3365 | need to implement more float16 routines than just conversion. 3366 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3367 | Binary Floating-Point Arithmetic. 3368 *----------------------------------------------------------------------------*/ 3369 3370 static float16 roundAndPackFloat16(flag zSign, int_fast16_t zExp, 3371 uint32_t zSig, flag ieee, 3372 float_status *status) 3373 { 3374 int maxexp = ieee ? 29 : 30; 3375 uint32_t mask; 3376 uint32_t increment; 3377 bool rounding_bumps_exp; 3378 bool is_tiny = false; 3379 3380 /* Calculate the mask of bits of the mantissa which are not 3381 * representable in half-precision and will be lost. 3382 */ 3383 if (zExp < 1) { 3384 /* Will be denormal in halfprec */ 3385 mask = 0x00ffffff; 3386 if (zExp >= -11) { 3387 mask >>= 11 + zExp; 3388 } 3389 } else { 3390 /* Normal number in halfprec */ 3391 mask = 0x00001fff; 3392 } 3393 3394 switch (status->float_rounding_mode) { 3395 case float_round_nearest_even: 3396 increment = (mask + 1) >> 1; 3397 if ((zSig & mask) == increment) { 3398 increment = zSig & (increment << 1); 3399 } 3400 break; 3401 case float_round_ties_away: 3402 increment = (mask + 1) >> 1; 3403 break; 3404 case float_round_up: 3405 increment = zSign ? 0 : mask; 3406 break; 3407 case float_round_down: 3408 increment = zSign ? 
mask : 0; 3409 break; 3410 default: /* round_to_zero */ 3411 increment = 0; 3412 break; 3413 } 3414 3415 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3416 3417 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3418 if (ieee) { 3419 float_raise(float_flag_overflow | float_flag_inexact, status); 3420 return packFloat16(zSign, 0x1f, 0); 3421 } else { 3422 float_raise(float_flag_invalid, status); 3423 return packFloat16(zSign, 0x1f, 0x3ff); 3424 } 3425 } 3426 3427 if (zExp < 0) { 3428 /* Note that flush-to-zero does not affect half-precision results */ 3429 is_tiny = 3430 (status->float_detect_tininess == float_tininess_before_rounding) 3431 || (zExp < -1) 3432 || (!rounding_bumps_exp); 3433 } 3434 if (zSig & mask) { 3435 float_raise(float_flag_inexact, status); 3436 if (is_tiny) { 3437 float_raise(float_flag_underflow, status); 3438 } 3439 } 3440 3441 zSig += increment; 3442 if (rounding_bumps_exp) { 3443 zSig >>= 1; 3444 zExp++; 3445 } 3446 3447 if (zExp < -10) { 3448 return packFloat16(zSign, 0, 0); 3449 } 3450 if (zExp < 0) { 3451 zSig >>= -zExp; 3452 zExp = 0; 3453 } 3454 return packFloat16(zSign, zExp, zSig >> 13); 3455 } 3456 3457 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, 3458 uint32_t *zSigPtr) 3459 { 3460 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3461 *zSigPtr = aSig << shiftCount; 3462 *zExpPtr = 1 - shiftCount; 3463 } 3464 3465 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3466 The latter gains extra exponent range by omitting the NaN/Inf encodings. 
*/ 3467 3468 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3469 { 3470 flag aSign; 3471 int_fast16_t aExp; 3472 uint32_t aSig; 3473 3474 aSign = extractFloat16Sign(a); 3475 aExp = extractFloat16Exp(a); 3476 aSig = extractFloat16Frac(a); 3477 3478 if (aExp == 0x1f && ieee) { 3479 if (aSig) { 3480 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3481 } 3482 return packFloat32(aSign, 0xff, 0); 3483 } 3484 if (aExp == 0) { 3485 if (aSig == 0) { 3486 return packFloat32(aSign, 0, 0); 3487 } 3488 3489 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3490 aExp--; 3491 } 3492 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3493 } 3494 3495 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3496 { 3497 flag aSign; 3498 int_fast16_t aExp; 3499 uint32_t aSig; 3500 3501 a = float32_squash_input_denormal(a, status); 3502 3503 aSig = extractFloat32Frac( a ); 3504 aExp = extractFloat32Exp( a ); 3505 aSign = extractFloat32Sign( a ); 3506 if ( aExp == 0xFF ) { 3507 if (aSig) { 3508 /* Input is a NaN */ 3509 if (!ieee) { 3510 float_raise(float_flag_invalid, status); 3511 return packFloat16(aSign, 0, 0); 3512 } 3513 return commonNaNToFloat16( 3514 float32ToCommonNaN(a, status), status); 3515 } 3516 /* Infinity */ 3517 if (!ieee) { 3518 float_raise(float_flag_invalid, status); 3519 return packFloat16(aSign, 0x1f, 0x3ff); 3520 } 3521 return packFloat16(aSign, 0x1f, 0); 3522 } 3523 if (aExp == 0 && aSig == 0) { 3524 return packFloat16(aSign, 0, 0); 3525 } 3526 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3527 * even if the input is denormal; however this is harmless because 3528 * the largest possible single-precision denormal is still smaller 3529 * than the smallest representable half-precision denormal, and so we 3530 * will end up ignoring aSig and returning via the "always return zero" 3531 * codepath. 
3532 */ 3533 aSig |= 0x00800000; 3534 aExp -= 0x71; 3535 3536 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3537 } 3538 3539 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3540 { 3541 flag aSign; 3542 int_fast16_t aExp; 3543 uint32_t aSig; 3544 3545 aSign = extractFloat16Sign(a); 3546 aExp = extractFloat16Exp(a); 3547 aSig = extractFloat16Frac(a); 3548 3549 if (aExp == 0x1f && ieee) { 3550 if (aSig) { 3551 return commonNaNToFloat64( 3552 float16ToCommonNaN(a, status), status); 3553 } 3554 return packFloat64(aSign, 0x7ff, 0); 3555 } 3556 if (aExp == 0) { 3557 if (aSig == 0) { 3558 return packFloat64(aSign, 0, 0); 3559 } 3560 3561 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3562 aExp--; 3563 } 3564 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3565 } 3566 3567 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3568 { 3569 flag aSign; 3570 int_fast16_t aExp; 3571 uint64_t aSig; 3572 uint32_t zSig; 3573 3574 a = float64_squash_input_denormal(a, status); 3575 3576 aSig = extractFloat64Frac(a); 3577 aExp = extractFloat64Exp(a); 3578 aSign = extractFloat64Sign(a); 3579 if (aExp == 0x7FF) { 3580 if (aSig) { 3581 /* Input is a NaN */ 3582 if (!ieee) { 3583 float_raise(float_flag_invalid, status); 3584 return packFloat16(aSign, 0, 0); 3585 } 3586 return commonNaNToFloat16( 3587 float64ToCommonNaN(a, status), status); 3588 } 3589 /* Infinity */ 3590 if (!ieee) { 3591 float_raise(float_flag_invalid, status); 3592 return packFloat16(aSign, 0x1f, 0x3ff); 3593 } 3594 return packFloat16(aSign, 0x1f, 0); 3595 } 3596 shift64RightJamming(aSig, 29, &aSig); 3597 zSig = aSig; 3598 if (aExp == 0 && zSig == 0) { 3599 return packFloat16(aSign, 0, 0); 3600 } 3601 /* Decimal point between bits 22 and 23. 
Note that we add the 1 bit 3602 * even if the input is denormal; however this is harmless because 3603 * the largest possible single-precision denormal is still smaller 3604 * than the smallest representable half-precision denormal, and so we 3605 * will end up ignoring aSig and returning via the "always return zero" 3606 * codepath. 3607 */ 3608 zSig |= 0x00800000; 3609 aExp -= 0x3F1; 3610 3611 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3612 } 3613 3614 /*---------------------------------------------------------------------------- 3615 | Returns the result of converting the double-precision floating-point value 3616 | `a' to the extended double-precision floating-point format. The conversion 3617 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3618 | Arithmetic. 3619 *----------------------------------------------------------------------------*/ 3620 3621 floatx80 float64_to_floatx80(float64 a, float_status *status) 3622 { 3623 flag aSign; 3624 int_fast16_t aExp; 3625 uint64_t aSig; 3626 3627 a = float64_squash_input_denormal(a, status); 3628 aSig = extractFloat64Frac( a ); 3629 aExp = extractFloat64Exp( a ); 3630 aSign = extractFloat64Sign( a ); 3631 if ( aExp == 0x7FF ) { 3632 if (aSig) { 3633 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3634 } 3635 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3636 } 3637 if ( aExp == 0 ) { 3638 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3639 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3640 } 3641 return 3642 packFloatx80( 3643 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3644 3645 } 3646 3647 /*---------------------------------------------------------------------------- 3648 | Returns the result of converting the double-precision floating-point value 3649 | `a' to the quadruple-precision floating-point format. 
The conversion is 3650 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3651 | Arithmetic. 3652 *----------------------------------------------------------------------------*/ 3653 3654 float128 float64_to_float128(float64 a, float_status *status) 3655 { 3656 flag aSign; 3657 int_fast16_t aExp; 3658 uint64_t aSig, zSig0, zSig1; 3659 3660 a = float64_squash_input_denormal(a, status); 3661 aSig = extractFloat64Frac( a ); 3662 aExp = extractFloat64Exp( a ); 3663 aSign = extractFloat64Sign( a ); 3664 if ( aExp == 0x7FF ) { 3665 if (aSig) { 3666 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3667 } 3668 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3669 } 3670 if ( aExp == 0 ) { 3671 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3672 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3673 --aExp; 3674 } 3675 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3676 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3677 3678 } 3679 3680 /*---------------------------------------------------------------------------- 3681 | Rounds the double-precision floating-point value `a' to an integer, and 3682 | returns the result as a double-precision floating-point value. The 3683 | operation is performed according to the IEC/IEEE Standard for Binary 3684 | Floating-Point Arithmetic. 
3685 *----------------------------------------------------------------------------*/ 3686 3687 float64 float64_round_to_int(float64 a, float_status *status) 3688 { 3689 flag aSign; 3690 int_fast16_t aExp; 3691 uint64_t lastBitMask, roundBitsMask; 3692 uint64_t z; 3693 a = float64_squash_input_denormal(a, status); 3694 3695 aExp = extractFloat64Exp( a ); 3696 if ( 0x433 <= aExp ) { 3697 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3698 return propagateFloat64NaN(a, a, status); 3699 } 3700 return a; 3701 } 3702 if ( aExp < 0x3FF ) { 3703 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3704 status->float_exception_flags |= float_flag_inexact; 3705 aSign = extractFloat64Sign( a ); 3706 switch (status->float_rounding_mode) { 3707 case float_round_nearest_even: 3708 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3709 return packFloat64( aSign, 0x3FF, 0 ); 3710 } 3711 break; 3712 case float_round_ties_away: 3713 if (aExp == 0x3FE) { 3714 return packFloat64(aSign, 0x3ff, 0); 3715 } 3716 break; 3717 case float_round_down: 3718 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3719 case float_round_up: 3720 return make_float64( 3721 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3722 } 3723 return packFloat64( aSign, 0, 0 ); 3724 } 3725 lastBitMask = 1; 3726 lastBitMask <<= 0x433 - aExp; 3727 roundBitsMask = lastBitMask - 1; 3728 z = float64_val(a); 3729 switch (status->float_rounding_mode) { 3730 case float_round_nearest_even: 3731 z += lastBitMask >> 1; 3732 if ((z & roundBitsMask) == 0) { 3733 z &= ~lastBitMask; 3734 } 3735 break; 3736 case float_round_ties_away: 3737 z += lastBitMask >> 1; 3738 break; 3739 case float_round_to_zero: 3740 break; 3741 case float_round_up: 3742 if (!extractFloat64Sign(make_float64(z))) { 3743 z += roundBitsMask; 3744 } 3745 break; 3746 case float_round_down: 3747 if (extractFloat64Sign(make_float64(z))) { 3748 z += roundBitsMask; 3749 } 3750 break; 3751 default: 3752 abort(); 3753 } 3754 z &= ~ roundBitsMask; 3755 if (z != float64_val(a)) { 3756 status->float_exception_flags |= float_flag_inexact; 3757 } 3758 return make_float64(z); 3759 3760 } 3761 3762 float64 float64_trunc_to_int(float64 a, float_status *status) 3763 { 3764 int oldmode; 3765 float64 res; 3766 oldmode = status->float_rounding_mode; 3767 status->float_rounding_mode = float_round_to_zero; 3768 res = float64_round_to_int(a, status); 3769 status->float_rounding_mode = oldmode; 3770 return res; 3771 } 3772 3773 /*---------------------------------------------------------------------------- 3774 | Returns the result of adding the absolute values of the double-precision 3775 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3776 | before being returned. `zSign' is ignored if the result is a NaN. 3777 | The addition is performed according to the IEC/IEEE Standard for Binary 3778 | Floating-Point Arithmetic. 
3779 *----------------------------------------------------------------------------*/ 3780 3781 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3782 float_status *status) 3783 { 3784 int_fast16_t aExp, bExp, zExp; 3785 uint64_t aSig, bSig, zSig; 3786 int_fast16_t expDiff; 3787 3788 aSig = extractFloat64Frac( a ); 3789 aExp = extractFloat64Exp( a ); 3790 bSig = extractFloat64Frac( b ); 3791 bExp = extractFloat64Exp( b ); 3792 expDiff = aExp - bExp; 3793 aSig <<= 9; 3794 bSig <<= 9; 3795 if ( 0 < expDiff ) { 3796 if ( aExp == 0x7FF ) { 3797 if (aSig) { 3798 return propagateFloat64NaN(a, b, status); 3799 } 3800 return a; 3801 } 3802 if ( bExp == 0 ) { 3803 --expDiff; 3804 } 3805 else { 3806 bSig |= LIT64( 0x2000000000000000 ); 3807 } 3808 shift64RightJamming( bSig, expDiff, &bSig ); 3809 zExp = aExp; 3810 } 3811 else if ( expDiff < 0 ) { 3812 if ( bExp == 0x7FF ) { 3813 if (bSig) { 3814 return propagateFloat64NaN(a, b, status); 3815 } 3816 return packFloat64( zSign, 0x7FF, 0 ); 3817 } 3818 if ( aExp == 0 ) { 3819 ++expDiff; 3820 } 3821 else { 3822 aSig |= LIT64( 0x2000000000000000 ); 3823 } 3824 shift64RightJamming( aSig, - expDiff, &aSig ); 3825 zExp = bExp; 3826 } 3827 else { 3828 if ( aExp == 0x7FF ) { 3829 if (aSig | bSig) { 3830 return propagateFloat64NaN(a, b, status); 3831 } 3832 return a; 3833 } 3834 if ( aExp == 0 ) { 3835 if (status->flush_to_zero) { 3836 if (aSig | bSig) { 3837 float_raise(float_flag_output_denormal, status); 3838 } 3839 return packFloat64(zSign, 0, 0); 3840 } 3841 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3842 } 3843 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3844 zExp = aExp; 3845 goto roundAndPack; 3846 } 3847 aSig |= LIT64( 0x2000000000000000 ); 3848 zSig = ( aSig + bSig )<<1; 3849 --zExp; 3850 if ( (int64_t) zSig < 0 ) { 3851 zSig = aSig + bSig; 3852 ++zExp; 3853 } 3854 roundAndPack: 3855 return roundAndPackFloat64(zSign, zExp, zSig, status); 3856 3857 } 3858 3859 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Control flow is threaded through labels: the operand with the larger
| exponent (or, for equal exponents, the larger significand) becomes the
| minuend, and `zSign' is flipped whenever `b' is the minuend.
*----------------------------------------------------------------------------*/

static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
                              float_status *status)
{
    int_fast16_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    int_fast16_t expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Make room below the fraction for the bits shifted out during
     * alignment. */
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FF ) {
        if (aSig | bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Both fractions are zero with an all-ones exponent: raise
         * invalid and return the default NaN. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        /* Zero exponent field on both sides: use exponent 1 for the
         * result computation below. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* `b' has an all-ones exponent and zero fraction: the result
         * carries the opposite of `zSign'. */
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );
    }
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the double-precision floating-point values `a'
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Operands with equal signs are handed to addFloat64Sigs(); operands with
| different signs are handed to subFloat64Sigs().  In both cases the sign of
| `a' is passed as the result sign.
*----------------------------------------------------------------------------*/

float64 float64_add(float64 a, float64 b, float_status *status)
{
    flag aSign, bSign;
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    aSign = extractFloat64Sign( a );
    bSign = extractFloat64Sign( b );
    if ( aSign == bSign ) {
        return addFloat64Sigs(a, b, aSign, status);
    }
    else {
        return subFloat64Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the double-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
3968 *----------------------------------------------------------------------------*/ 3969 3970 float64 float64_sub(float64 a, float64 b, float_status *status) 3971 { 3972 flag aSign, bSign; 3973 a = float64_squash_input_denormal(a, status); 3974 b = float64_squash_input_denormal(b, status); 3975 3976 aSign = extractFloat64Sign( a ); 3977 bSign = extractFloat64Sign( b ); 3978 if ( aSign == bSign ) { 3979 return subFloat64Sigs(a, b, aSign, status); 3980 } 3981 else { 3982 return addFloat64Sigs(a, b, aSign, status); 3983 } 3984 3985 } 3986 3987 /*---------------------------------------------------------------------------- 3988 | Returns the result of multiplying the double-precision floating-point values 3989 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3990 | for Binary Floating-Point Arithmetic. 3991 *----------------------------------------------------------------------------*/ 3992 3993 float64 float64_mul(float64 a, float64 b, float_status *status) 3994 { 3995 flag aSign, bSign, zSign; 3996 int_fast16_t aExp, bExp, zExp; 3997 uint64_t aSig, bSig, zSig0, zSig1; 3998 3999 a = float64_squash_input_denormal(a, status); 4000 b = float64_squash_input_denormal(b, status); 4001 4002 aSig = extractFloat64Frac( a ); 4003 aExp = extractFloat64Exp( a ); 4004 aSign = extractFloat64Sign( a ); 4005 bSig = extractFloat64Frac( b ); 4006 bExp = extractFloat64Exp( b ); 4007 bSign = extractFloat64Sign( b ); 4008 zSign = aSign ^ bSign; 4009 if ( aExp == 0x7FF ) { 4010 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4011 return propagateFloat64NaN(a, b, status); 4012 } 4013 if ( ( bExp | bSig ) == 0 ) { 4014 float_raise(float_flag_invalid, status); 4015 return float64_default_nan; 4016 } 4017 return packFloat64( zSign, 0x7FF, 0 ); 4018 } 4019 if ( bExp == 0x7FF ) { 4020 if (bSig) { 4021 return propagateFloat64NaN(a, b, status); 4022 } 4023 if ( ( aExp | aSig ) == 0 ) { 4024 float_raise(float_flag_invalid, status); 4025 return float64_default_nan; 4026 } 
4027 return packFloat64( zSign, 0x7FF, 0 ); 4028 } 4029 if ( aExp == 0 ) { 4030 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4031 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4032 } 4033 if ( bExp == 0 ) { 4034 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4035 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4036 } 4037 zExp = aExp + bExp - 0x3FF; 4038 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4039 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4040 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4041 zSig0 |= ( zSig1 != 0 ); 4042 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4043 zSig0 <<= 1; 4044 --zExp; 4045 } 4046 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4047 4048 } 4049 4050 /*---------------------------------------------------------------------------- 4051 | Returns the result of dividing the double-precision floating-point value `a' 4052 | by the corresponding value `b'. The operation is performed according to 4053 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float64 float64_div(float64 a, float64 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int_fast16_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    uint64_t rem0, rem1;
    uint64_t term0, term1;
    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    zSign = aSign ^ bSign;
    /* Dividend has an all-ones exponent. */
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        if ( bExp == 0x7FF ) {
            if (bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            /* Both operands have all-ones exponent and zero fraction:
             * invalid operation. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan;
        }
        return packFloat64( zSign, 0x7FF, 0 );
    }
    /* Divisor has an all-ones exponent: NaN propagates, otherwise the
     * quotient is a signed zero. */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return packFloat64( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Zero divisor: 0/0 raises invalid, anything else raises
             * divbyzero and yields exponent 0x7FF with zero fraction. */
            if ( ( aExp | aSig ) == 0 ) {
                float_raise(float_flag_invalid, status);
                return float64_default_nan;
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat64( zSign, 0x7FF, 0 );
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FD;
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    /* Halve the dividend when it is not smaller than half the divisor,
     * compensating in the exponent. */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    zSig = estimateDiv128To64( aSig, 0, bSig );
    /* Only when the low 9 bits of the estimate are <= 2 is the exact
     * remainder computed: the estimate is stepped down until the
     * remainder is non-negative, and a non-zero remainder is recorded
     * as a sticky bit. */
    if ( ( zSig & 0x1FF ) <= 2 ) {
        mul64To128( bSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
        }
        zSig |= ( rem1 != 0 );
    }
    return roundAndPackFloat64(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| The invalid exception is raised (and the default NaN returned) when `a'
| has an all-ones exponent with zero fraction or when `b' is zero.  When `b'
| has an all-ones exponent with zero fraction, or `a' is zero, `a' is
| returned unchanged.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int_fast16_t aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan;
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan;
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* `a' is more than one binade below `b': it is already the
         * remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce the exponent gap 62 bits at a time.  Each estimate is
     * lowered by 2 (clamped at 0) before use. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step for the remaining expDiff bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Subtract bSig until the running value goes negative;
     * alternateASig holds the last non-negative value. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Choose between the two candidates: the sign of their sum decides,
     * and an exact tie is resolved by the parity of q. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the double-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result.  (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    c = float64_squash_input_denormal(c, status);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the multiplication is zero times infinity, in either
     * operand order.
     */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
    }

    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan;
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is applied to the sign of every non-NaN result below. */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat64
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x0010000000000000ULL) << 10;
            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* From here on neither a nor b is zero, infinite or NaN, so a zero
     * exponent field means a denormal that must be normalized.
     */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1, status);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            zExp--;
        }
        /* Collapse to 64 bits; the low half is jammed into a sticky bit. */
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig1, status);
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c has the larger exponent, so the result takes the
             * opposite sign.
             */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            /* Equal exponents: subtract the smaller significand from
             * the larger.
             */
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                /* Keep the discarded low half as a sticky bit. */
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            /* The high half is zero: normalize from the low half. */
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig0, status);
    }
}

/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| A negative non-zero input (including one with an all-ones exponent and zero
| fraction) raises the invalid exception and yields the default NaN.  A
| negative zero is returned unchanged.
*----------------------------------------------------------------------------*/

float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int_fast16_t aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan;
    }
    if ( aSign ) {
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent, then re-bias. */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* First a 32-bit estimate, then a refinement via a 128-by-64
     * division estimate. */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    aSig <<= 9 - ( aExp & 1 );
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    /* Only when the low 9 bits of the estimate are <= 5 is the exact
     * remainder computed: the estimate is stepped down until the
     * remainder is non-negative, and a non-zero remainder is recorded
     * as a sticky bit. */
    if ( ( zSig & 0x1FF ) <= 5 ) {
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/ 4522 float64 float64_log2(float64 a, float_status *status) 4523 { 4524 flag aSign, zSign; 4525 int_fast16_t aExp; 4526 uint64_t aSig, aSig0, aSig1, zSig, i; 4527 a = float64_squash_input_denormal(a, status); 4528 4529 aSig = extractFloat64Frac( a ); 4530 aExp = extractFloat64Exp( a ); 4531 aSign = extractFloat64Sign( a ); 4532 4533 if ( aExp == 0 ) { 4534 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4535 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4536 } 4537 if ( aSign ) { 4538 float_raise(float_flag_invalid, status); 4539 return float64_default_nan; 4540 } 4541 if ( aExp == 0x7FF ) { 4542 if (aSig) { 4543 return propagateFloat64NaN(a, float64_zero, status); 4544 } 4545 return a; 4546 } 4547 4548 aExp -= 0x3FF; 4549 aSig |= LIT64( 0x0010000000000000 ); 4550 zSign = aExp < 0; 4551 zSig = (uint64_t)aExp << 52; 4552 for (i = 1LL << 51; i > 0; i >>= 1) { 4553 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4554 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4555 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4556 aSig >>= 1; 4557 zSig |= i; 4558 } 4559 } 4560 4561 if ( zSign ) 4562 zSig = -zSig; 4563 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4564 } 4565 4566 /*---------------------------------------------------------------------------- 4567 | Returns 1 if the double-precision floating-point value `a' is equal to the 4568 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4569 | if either operand is a NaN. Otherwise, the comparison is performed 4570 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4571 *----------------------------------------------------------------------------*/ 4572 4573 int float64_eq(float64 a, float64 b, float_status *status) 4574 { 4575 uint64_t av, bv; 4576 a = float64_squash_input_denormal(a, status); 4577 b = float64_squash_input_denormal(b, status); 4578 4579 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4580 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4581 ) { 4582 float_raise(float_flag_invalid, status); 4583 return 0; 4584 } 4585 av = float64_val(a); 4586 bv = float64_val(b); 4587 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4588 4589 } 4590 4591 /*---------------------------------------------------------------------------- 4592 | Returns 1 if the double-precision floating-point value `a' is less than or 4593 | equal to the corresponding value `b', and 0 otherwise. The invalid 4594 | exception is raised if either operand is a NaN. The comparison is performed 4595 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4596 *----------------------------------------------------------------------------*/ 4597 4598 int float64_le(float64 a, float64 b, float_status *status) 4599 { 4600 flag aSign, bSign; 4601 uint64_t av, bv; 4602 a = float64_squash_input_denormal(a, status); 4603 b = float64_squash_input_denormal(b, status); 4604 4605 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4606 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4607 ) { 4608 float_raise(float_flag_invalid, status); 4609 return 0; 4610 } 4611 aSign = extractFloat64Sign( a ); 4612 bSign = extractFloat64Sign( b ); 4613 av = float64_val(a); 4614 bv = float64_val(b); 4615 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4616 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4617 4618 } 4619 4620 /*---------------------------------------------------------------------------- 4621 | Returns 1 if the double-precision floating-point value `a' is less than 4622 | the corresponding value `b', and 0 otherwise. The invalid exception is 4623 | raised if either operand is a NaN. The comparison is performed according 4624 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4625 *----------------------------------------------------------------------------*/ 4626 4627 int float64_lt(float64 a, float64 b, float_status *status) 4628 { 4629 flag aSign, bSign; 4630 uint64_t av, bv; 4631 4632 a = float64_squash_input_denormal(a, status); 4633 b = float64_squash_input_denormal(b, status); 4634 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4635 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4636 ) { 4637 float_raise(float_flag_invalid, status); 4638 return 0; 4639 } 4640 aSign = extractFloat64Sign( a ); 4641 bSign = extractFloat64Sign( b ); 4642 av = float64_val(a); 4643 bv = float64_val(b); 4644 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4645 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4646 4647 } 4648 4649 /*---------------------------------------------------------------------------- 4650 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4651 | be compared, and 0 otherwise. The invalid exception is raised if either 4652 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4653 | Standard for Binary Floating-Point Arithmetic. 4654 *----------------------------------------------------------------------------*/ 4655 4656 int float64_unordered(float64 a, float64 b, float_status *status) 4657 { 4658 a = float64_squash_input_denormal(a, status); 4659 b = float64_squash_input_denormal(b, status); 4660 4661 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4662 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4663 ) { 4664 float_raise(float_flag_invalid, status); 4665 return 1; 4666 } 4667 return 0; 4668 } 4669 4670 /*---------------------------------------------------------------------------- 4671 | Returns 1 if the double-precision floating-point value `a' is equal to the 4672 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4673 | exception.The comparison is performed according to the IEC/IEEE Standard 4674 | for Binary Floating-Point Arithmetic. 4675 *----------------------------------------------------------------------------*/ 4676 4677 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4678 { 4679 uint64_t av, bv; 4680 a = float64_squash_input_denormal(a, status); 4681 b = float64_squash_input_denormal(b, status); 4682 4683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4685 ) { 4686 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4687 float_raise(float_flag_invalid, status); 4688 } 4689 return 0; 4690 } 4691 av = float64_val(a); 4692 bv = float64_val(b); 4693 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4694 4695 } 4696 4697 /*---------------------------------------------------------------------------- 4698 | Returns 1 if the double-precision floating-point value `a' is less than or 4699 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4700 | cause an exception. Otherwise, the comparison is performed according to the 4701 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4702 *----------------------------------------------------------------------------*/ 4703 4704 int float64_le_quiet(float64 a, float64 b, float_status *status) 4705 { 4706 flag aSign, bSign; 4707 uint64_t av, bv; 4708 a = float64_squash_input_denormal(a, status); 4709 b = float64_squash_input_denormal(b, status); 4710 4711 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4712 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4713 ) { 4714 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4715 float_raise(float_flag_invalid, status); 4716 } 4717 return 0; 4718 } 4719 aSign = extractFloat64Sign( a ); 4720 bSign = extractFloat64Sign( b ); 4721 av = float64_val(a); 4722 bv = float64_val(b); 4723 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4724 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4725 4726 } 4727 4728 /*---------------------------------------------------------------------------- 4729 | Returns 1 if the double-precision floating-point value `a' is less than 4730 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4731 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4732 | Standard for Binary Floating-Point Arithmetic. 
4733 *----------------------------------------------------------------------------*/ 4734 4735 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4736 { 4737 flag aSign, bSign; 4738 uint64_t av, bv; 4739 a = float64_squash_input_denormal(a, status); 4740 b = float64_squash_input_denormal(b, status); 4741 4742 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4743 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4744 ) { 4745 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4746 float_raise(float_flag_invalid, status); 4747 } 4748 return 0; 4749 } 4750 aSign = extractFloat64Sign( a ); 4751 bSign = extractFloat64Sign( b ); 4752 av = float64_val(a); 4753 bv = float64_val(b); 4754 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4755 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4756 4757 } 4758 4759 /*---------------------------------------------------------------------------- 4760 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4761 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4762 | comparison is performed according to the IEC/IEEE Standard for Binary 4763 | Floating-Point Arithmetic. 
4764 *----------------------------------------------------------------------------*/ 4765 4766 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4767 { 4768 a = float64_squash_input_denormal(a, status); 4769 b = float64_squash_input_denormal(b, status); 4770 4771 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4772 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4773 ) { 4774 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4775 float_raise(float_flag_invalid, status); 4776 } 4777 return 1; 4778 } 4779 return 0; 4780 } 4781 4782 /*---------------------------------------------------------------------------- 4783 | Returns the result of converting the extended double-precision floating- 4784 | point value `a' to the 32-bit two's complement integer format. The 4785 | conversion is performed according to the IEC/IEEE Standard for Binary 4786 | Floating-Point Arithmetic---which means in particular that the conversion 4787 | is rounded according to the current rounding mode. If `a' is a NaN, the 4788 | largest positive integer is returned. Otherwise, if the conversion 4789 | overflows, the largest integer with the same sign as `a' is returned. 
4790 *----------------------------------------------------------------------------*/ 4791 4792 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4793 { 4794 flag aSign; 4795 int32_t aExp, shiftCount; 4796 uint64_t aSig; 4797 4798 aSig = extractFloatx80Frac( a ); 4799 aExp = extractFloatx80Exp( a ); 4800 aSign = extractFloatx80Sign( a ); 4801 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4802 shiftCount = 0x4037 - aExp; 4803 if ( shiftCount <= 0 ) shiftCount = 1; 4804 shift64RightJamming( aSig, shiftCount, &aSig ); 4805 return roundAndPackInt32(aSign, aSig, status); 4806 4807 } 4808 4809 /*---------------------------------------------------------------------------- 4810 | Returns the result of converting the extended double-precision floating- 4811 | point value `a' to the 32-bit two's complement integer format. The 4812 | conversion is performed according to the IEC/IEEE Standard for Binary 4813 | Floating-Point Arithmetic, except that the conversion is always rounded 4814 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4815 | Otherwise, if the conversion overflows, the largest integer with the same 4816 | sign as `a' is returned. 
4817 *----------------------------------------------------------------------------*/ 4818 4819 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4820 { 4821 flag aSign; 4822 int32_t aExp, shiftCount; 4823 uint64_t aSig, savedASig; 4824 int32_t z; 4825 4826 aSig = extractFloatx80Frac( a ); 4827 aExp = extractFloatx80Exp( a ); 4828 aSign = extractFloatx80Sign( a ); 4829 if ( 0x401E < aExp ) { 4830 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4831 goto invalid; 4832 } 4833 else if ( aExp < 0x3FFF ) { 4834 if (aExp || aSig) { 4835 status->float_exception_flags |= float_flag_inexact; 4836 } 4837 return 0; 4838 } 4839 shiftCount = 0x403E - aExp; 4840 savedASig = aSig; 4841 aSig >>= shiftCount; 4842 z = aSig; 4843 if ( aSign ) z = - z; 4844 if ( ( z < 0 ) ^ aSign ) { 4845 invalid: 4846 float_raise(float_flag_invalid, status); 4847 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4848 } 4849 if ( ( aSig<<shiftCount ) != savedASig ) { 4850 status->float_exception_flags |= float_flag_inexact; 4851 } 4852 return z; 4853 4854 } 4855 4856 /*---------------------------------------------------------------------------- 4857 | Returns the result of converting the extended double-precision floating- 4858 | point value `a' to the 64-bit two's complement integer format. The 4859 | conversion is performed according to the IEC/IEEE Standard for Binary 4860 | Floating-Point Arithmetic---which means in particular that the conversion 4861 | is rounded according to the current rounding mode. If `a' is a NaN, 4862 | the largest positive integer is returned. Otherwise, if the conversion 4863 | overflows, the largest integer with the same sign as `a' is returned. 
4864 *----------------------------------------------------------------------------*/ 4865 4866 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4867 { 4868 flag aSign; 4869 int32_t aExp, shiftCount; 4870 uint64_t aSig, aSigExtra; 4871 4872 aSig = extractFloatx80Frac( a ); 4873 aExp = extractFloatx80Exp( a ); 4874 aSign = extractFloatx80Sign( a ); 4875 shiftCount = 0x403E - aExp; 4876 if ( shiftCount <= 0 ) { 4877 if ( shiftCount ) { 4878 float_raise(float_flag_invalid, status); 4879 if ( ! aSign 4880 || ( ( aExp == 0x7FFF ) 4881 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4882 ) { 4883 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4884 } 4885 return (int64_t) LIT64( 0x8000000000000000 ); 4886 } 4887 aSigExtra = 0; 4888 } 4889 else { 4890 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4891 } 4892 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4893 4894 } 4895 4896 /*---------------------------------------------------------------------------- 4897 | Returns the result of converting the extended double-precision floating- 4898 | point value `a' to the 64-bit two's complement integer format. The 4899 | conversion is performed according to the IEC/IEEE Standard for Binary 4900 | Floating-Point Arithmetic, except that the conversion is always rounded 4901 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4902 | Otherwise, if the conversion overflows, the largest integer with the same 4903 | sign as `a' is returned. 
4904 *----------------------------------------------------------------------------*/ 4905 4906 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4907 { 4908 flag aSign; 4909 int32_t aExp, shiftCount; 4910 uint64_t aSig; 4911 int64_t z; 4912 4913 aSig = extractFloatx80Frac( a ); 4914 aExp = extractFloatx80Exp( a ); 4915 aSign = extractFloatx80Sign( a ); 4916 shiftCount = aExp - 0x403E; 4917 if ( 0 <= shiftCount ) { 4918 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4919 if ( ( a.high != 0xC03E ) || aSig ) { 4920 float_raise(float_flag_invalid, status); 4921 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4922 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4923 } 4924 } 4925 return (int64_t) LIT64( 0x8000000000000000 ); 4926 } 4927 else if ( aExp < 0x3FFF ) { 4928 if (aExp | aSig) { 4929 status->float_exception_flags |= float_flag_inexact; 4930 } 4931 return 0; 4932 } 4933 z = aSig>>( - shiftCount ); 4934 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4935 status->float_exception_flags |= float_flag_inexact; 4936 } 4937 if ( aSign ) z = - z; 4938 return z; 4939 4940 } 4941 4942 /*---------------------------------------------------------------------------- 4943 | Returns the result of converting the extended double-precision floating- 4944 | point value `a' to the single-precision floating-point format. The 4945 | conversion is performed according to the IEC/IEEE Standard for Binary 4946 | Floating-Point Arithmetic. 
4947 *----------------------------------------------------------------------------*/ 4948 4949 float32 floatx80_to_float32(floatx80 a, float_status *status) 4950 { 4951 flag aSign; 4952 int32_t aExp; 4953 uint64_t aSig; 4954 4955 aSig = extractFloatx80Frac( a ); 4956 aExp = extractFloatx80Exp( a ); 4957 aSign = extractFloatx80Sign( a ); 4958 if ( aExp == 0x7FFF ) { 4959 if ( (uint64_t) ( aSig<<1 ) ) { 4960 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4961 } 4962 return packFloat32( aSign, 0xFF, 0 ); 4963 } 4964 shift64RightJamming( aSig, 33, &aSig ); 4965 if ( aExp || aSig ) aExp -= 0x3F81; 4966 return roundAndPackFloat32(aSign, aExp, aSig, status); 4967 4968 } 4969 4970 /*---------------------------------------------------------------------------- 4971 | Returns the result of converting the extended double-precision floating- 4972 | point value `a' to the double-precision floating-point format. The 4973 | conversion is performed according to the IEC/IEEE Standard for Binary 4974 | Floating-Point Arithmetic. 4975 *----------------------------------------------------------------------------*/ 4976 4977 float64 floatx80_to_float64(floatx80 a, float_status *status) 4978 { 4979 flag aSign; 4980 int32_t aExp; 4981 uint64_t aSig, zSig; 4982 4983 aSig = extractFloatx80Frac( a ); 4984 aExp = extractFloatx80Exp( a ); 4985 aSign = extractFloatx80Sign( a ); 4986 if ( aExp == 0x7FFF ) { 4987 if ( (uint64_t) ( aSig<<1 ) ) { 4988 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4989 } 4990 return packFloat64( aSign, 0x7FF, 0 ); 4991 } 4992 shift64RightJamming( aSig, 1, &zSig ); 4993 if ( aExp || aSig ) aExp -= 0x3C01; 4994 return roundAndPackFloat64(aSign, aExp, zSig, status); 4995 4996 } 4997 4998 /*---------------------------------------------------------------------------- 4999 | Returns the result of converting the extended double-precision floating- 5000 | point value `a' to the quadruple-precision floating-point format. 
The 5001 | conversion is performed according to the IEC/IEEE Standard for Binary 5002 | Floating-Point Arithmetic. 5003 *----------------------------------------------------------------------------*/ 5004 5005 float128 floatx80_to_float128(floatx80 a, float_status *status) 5006 { 5007 flag aSign; 5008 int_fast16_t aExp; 5009 uint64_t aSig, zSig0, zSig1; 5010 5011 aSig = extractFloatx80Frac( a ); 5012 aExp = extractFloatx80Exp( a ); 5013 aSign = extractFloatx80Sign( a ); 5014 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5015 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5016 } 5017 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5018 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5019 5020 } 5021 5022 /*---------------------------------------------------------------------------- 5023 | Rounds the extended double-precision floating-point value `a' to an integer, 5024 | and returns the result as an extended quadruple-precision floating-point 5025 | value. The operation is performed according to the IEC/IEEE Standard for 5026 | Binary Floating-Point Arithmetic. 
5027 *----------------------------------------------------------------------------*/ 5028 5029 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5030 { 5031 flag aSign; 5032 int32_t aExp; 5033 uint64_t lastBitMask, roundBitsMask; 5034 floatx80 z; 5035 5036 aExp = extractFloatx80Exp( a ); 5037 if ( 0x403E <= aExp ) { 5038 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5039 return propagateFloatx80NaN(a, a, status); 5040 } 5041 return a; 5042 } 5043 if ( aExp < 0x3FFF ) { 5044 if ( ( aExp == 0 ) 5045 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5046 return a; 5047 } 5048 status->float_exception_flags |= float_flag_inexact; 5049 aSign = extractFloatx80Sign( a ); 5050 switch (status->float_rounding_mode) { 5051 case float_round_nearest_even: 5052 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5053 ) { 5054 return 5055 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5056 } 5057 break; 5058 case float_round_ties_away: 5059 if (aExp == 0x3FFE) { 5060 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5061 } 5062 break; 5063 case float_round_down: 5064 return 5065 aSign ? 5066 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5067 : packFloatx80( 0, 0, 0 ); 5068 case float_round_up: 5069 return 5070 aSign ? 
packFloatx80( 1, 0, 0 ) 5071 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5072 } 5073 return packFloatx80( aSign, 0, 0 ); 5074 } 5075 lastBitMask = 1; 5076 lastBitMask <<= 0x403E - aExp; 5077 roundBitsMask = lastBitMask - 1; 5078 z = a; 5079 switch (status->float_rounding_mode) { 5080 case float_round_nearest_even: 5081 z.low += lastBitMask>>1; 5082 if ((z.low & roundBitsMask) == 0) { 5083 z.low &= ~lastBitMask; 5084 } 5085 break; 5086 case float_round_ties_away: 5087 z.low += lastBitMask >> 1; 5088 break; 5089 case float_round_to_zero: 5090 break; 5091 case float_round_up: 5092 if (!extractFloatx80Sign(z)) { 5093 z.low += roundBitsMask; 5094 } 5095 break; 5096 case float_round_down: 5097 if (extractFloatx80Sign(z)) { 5098 z.low += roundBitsMask; 5099 } 5100 break; 5101 default: 5102 abort(); 5103 } 5104 z.low &= ~ roundBitsMask; 5105 if ( z.low == 0 ) { 5106 ++z.high; 5107 z.low = LIT64( 0x8000000000000000 ); 5108 } 5109 if (z.low != a.low) { 5110 status->float_exception_flags |= float_flag_inexact; 5111 } 5112 return z; 5113 5114 } 5115 5116 /*---------------------------------------------------------------------------- 5117 | Returns the result of adding the absolute values of the extended double- 5118 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5119 | negated before being returned. `zSign' is ignored if the result is a NaN. 5120 | The addition is performed according to the IEC/IEEE Standard for Binary 5121 | Floating-Point Arithmetic. 
5122 *----------------------------------------------------------------------------*/ 5123 5124 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5125 float_status *status) 5126 { 5127 int32_t aExp, bExp, zExp; 5128 uint64_t aSig, bSig, zSig0, zSig1; 5129 int32_t expDiff; 5130 5131 aSig = extractFloatx80Frac( a ); 5132 aExp = extractFloatx80Exp( a ); 5133 bSig = extractFloatx80Frac( b ); 5134 bExp = extractFloatx80Exp( b ); 5135 expDiff = aExp - bExp; 5136 if ( 0 < expDiff ) { 5137 if ( aExp == 0x7FFF ) { 5138 if ((uint64_t)(aSig << 1)) { 5139 return propagateFloatx80NaN(a, b, status); 5140 } 5141 return a; 5142 } 5143 if ( bExp == 0 ) --expDiff; 5144 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5145 zExp = aExp; 5146 } 5147 else if ( expDiff < 0 ) { 5148 if ( bExp == 0x7FFF ) { 5149 if ((uint64_t)(bSig << 1)) { 5150 return propagateFloatx80NaN(a, b, status); 5151 } 5152 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5153 } 5154 if ( aExp == 0 ) ++expDiff; 5155 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5156 zExp = bExp; 5157 } 5158 else { 5159 if ( aExp == 0x7FFF ) { 5160 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5161 return propagateFloatx80NaN(a, b, status); 5162 } 5163 return a; 5164 } 5165 zSig1 = 0; 5166 zSig0 = aSig + bSig; 5167 if ( aExp == 0 ) { 5168 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5169 goto roundAndPack; 5170 } 5171 zExp = aExp; 5172 goto shiftRight1; 5173 } 5174 zSig0 = aSig + bSig; 5175 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5176 shiftRight1: 5177 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5178 zSig0 |= LIT64( 0x8000000000000000 ); 5179 ++zExp; 5180 roundAndPack: 5181 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5182 zSign, zExp, zSig0, zSig1, status); 5183 } 5184 5185 /*---------------------------------------------------------------------------- 5186 | Returns the result of subtracting the absolute values of the 
extended 5187 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5188 | difference is negated before being returned. `zSign' is ignored if the 5189 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5190 | Standard for Binary Floating-Point Arithmetic. 5191 *----------------------------------------------------------------------------*/ 5192 5193 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5194 float_status *status) 5195 { 5196 int32_t aExp, bExp, zExp; 5197 uint64_t aSig, bSig, zSig0, zSig1; 5198 int32_t expDiff; 5199 floatx80 z; 5200 5201 aSig = extractFloatx80Frac( a ); 5202 aExp = extractFloatx80Exp( a ); 5203 bSig = extractFloatx80Frac( b ); 5204 bExp = extractFloatx80Exp( b ); 5205 expDiff = aExp - bExp; 5206 if ( 0 < expDiff ) goto aExpBigger; 5207 if ( expDiff < 0 ) goto bExpBigger; 5208 if ( aExp == 0x7FFF ) { 5209 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5210 return propagateFloatx80NaN(a, b, status); 5211 } 5212 float_raise(float_flag_invalid, status); 5213 z.low = floatx80_default_nan_low; 5214 z.high = floatx80_default_nan_high; 5215 return z; 5216 } 5217 if ( aExp == 0 ) { 5218 aExp = 1; 5219 bExp = 1; 5220 } 5221 zSig1 = 0; 5222 if ( bSig < aSig ) goto aBigger; 5223 if ( aSig < bSig ) goto bBigger; 5224 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5225 bExpBigger: 5226 if ( bExp == 0x7FFF ) { 5227 if ((uint64_t)(bSig << 1)) { 5228 return propagateFloatx80NaN(a, b, status); 5229 } 5230 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5231 } 5232 if ( aExp == 0 ) ++expDiff; 5233 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5234 bBigger: 5235 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5236 zExp = bExp; 5237 zSign ^= 1; 5238 goto normalizeRoundAndPack; 5239 aExpBigger: 5240 if ( aExp == 0x7FFF ) { 5241 if ((uint64_t)(aSig << 1)) { 5242 return propagateFloatx80NaN(a, b, status); 5243 } 5244 return a; 5245 } 5246 
if ( bExp == 0 ) --expDiff; 5247 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5248 aBigger: 5249 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5250 zExp = aExp; 5251 normalizeRoundAndPack: 5252 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5253 zSign, zExp, zSig0, zSig1, status); 5254 } 5255 5256 /*---------------------------------------------------------------------------- 5257 | Returns the result of adding the extended double-precision floating-point 5258 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5259 | Standard for Binary Floating-Point Arithmetic. 5260 *----------------------------------------------------------------------------*/ 5261 5262 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5263 { 5264 flag aSign, bSign; 5265 5266 aSign = extractFloatx80Sign( a ); 5267 bSign = extractFloatx80Sign( b ); 5268 if ( aSign == bSign ) { 5269 return addFloatx80Sigs(a, b, aSign, status); 5270 } 5271 else { 5272 return subFloatx80Sigs(a, b, aSign, status); 5273 } 5274 5275 } 5276 5277 /*---------------------------------------------------------------------------- 5278 | Returns the result of subtracting the extended double-precision floating- 5279 | point values `a' and `b'. The operation is performed according to the 5280 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5281 *----------------------------------------------------------------------------*/ 5282 5283 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5284 { 5285 flag aSign, bSign; 5286 5287 aSign = extractFloatx80Sign( a ); 5288 bSign = extractFloatx80Sign( b ); 5289 if ( aSign == bSign ) { 5290 return subFloatx80Sigs(a, b, aSign, status); 5291 } 5292 else { 5293 return addFloatx80Sigs(a, b, aSign, status); 5294 } 5295 5296 } 5297 5298 /*---------------------------------------------------------------------------- 5299 | Returns the result of multiplying the extended double-precision floating- 5300 | point values `a' and `b'. The operation is performed according to the 5301 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5302 *----------------------------------------------------------------------------*/ 5303 5304 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5305 { 5306 flag aSign, bSign, zSign; 5307 int32_t aExp, bExp, zExp; 5308 uint64_t aSig, bSig, zSig0, zSig1; 5309 floatx80 z; 5310 5311 aSig = extractFloatx80Frac( a ); 5312 aExp = extractFloatx80Exp( a ); 5313 aSign = extractFloatx80Sign( a ); 5314 bSig = extractFloatx80Frac( b ); 5315 bExp = extractFloatx80Exp( b ); 5316 bSign = extractFloatx80Sign( b ); 5317 zSign = aSign ^ bSign; 5318 if ( aExp == 0x7FFF ) { 5319 if ( (uint64_t) ( aSig<<1 ) 5320 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5321 return propagateFloatx80NaN(a, b, status); 5322 } 5323 if ( ( bExp | bSig ) == 0 ) goto invalid; 5324 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5325 } 5326 if ( bExp == 0x7FFF ) { 5327 if ((uint64_t)(bSig << 1)) { 5328 return propagateFloatx80NaN(a, b, status); 5329 } 5330 if ( ( aExp | aSig ) == 0 ) { 5331 invalid: 5332 float_raise(float_flag_invalid, status); 5333 z.low = floatx80_default_nan_low; 5334 z.high = floatx80_default_nan_high; 5335 return z; 5336 } 5337 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5338 
} 5339 if ( aExp == 0 ) { 5340 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5341 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5342 } 5343 if ( bExp == 0 ) { 5344 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5345 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5346 } 5347 zExp = aExp + bExp - 0x3FFE; 5348 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5349 if ( 0 < (int64_t) zSig0 ) { 5350 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5351 --zExp; 5352 } 5353 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5354 zSign, zExp, zSig0, zSig1, status); 5355 } 5356 5357 /*---------------------------------------------------------------------------- 5358 | Returns the result of dividing the extended double-precision floating-point 5359 | value `a' by the corresponding value `b'. The operation is performed 5360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5361 *----------------------------------------------------------------------------*/ 5362 5363 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5364 { 5365 flag aSign, bSign, zSign; 5366 int32_t aExp, bExp, zExp; 5367 uint64_t aSig, bSig, zSig0, zSig1; 5368 uint64_t rem0, rem1, rem2, term0, term1, term2; 5369 floatx80 z; 5370 5371 aSig = extractFloatx80Frac( a ); 5372 aExp = extractFloatx80Exp( a ); 5373 aSign = extractFloatx80Sign( a ); 5374 bSig = extractFloatx80Frac( b ); 5375 bExp = extractFloatx80Exp( b ); 5376 bSign = extractFloatx80Sign( b ); 5377 zSign = aSign ^ bSign; 5378 if ( aExp == 0x7FFF ) { 5379 if ((uint64_t)(aSig << 1)) { 5380 return propagateFloatx80NaN(a, b, status); 5381 } 5382 if ( bExp == 0x7FFF ) { 5383 if ((uint64_t)(bSig << 1)) { 5384 return propagateFloatx80NaN(a, b, status); 5385 } 5386 goto invalid; 5387 } 5388 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5389 } 5390 if ( bExp == 0x7FFF ) { 5391 if ((uint64_t)(bSig << 1)) { 5392 return propagateFloatx80NaN(a, b, status); 5393 } 5394 
return packFloatx80( zSign, 0, 0 ); 5395 } 5396 if ( bExp == 0 ) { 5397 if ( bSig == 0 ) { 5398 if ( ( aExp | aSig ) == 0 ) { 5399 invalid: 5400 float_raise(float_flag_invalid, status); 5401 z.low = floatx80_default_nan_low; 5402 z.high = floatx80_default_nan_high; 5403 return z; 5404 } 5405 float_raise(float_flag_divbyzero, status); 5406 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5407 } 5408 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5409 } 5410 if ( aExp == 0 ) { 5411 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5412 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5413 } 5414 zExp = aExp - bExp + 0x3FFE; 5415 rem1 = 0; 5416 if ( bSig <= aSig ) { 5417 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5418 ++zExp; 5419 } 5420 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5421 mul64To128( bSig, zSig0, &term0, &term1 ); 5422 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5423 while ( (int64_t) rem0 < 0 ) { 5424 --zSig0; 5425 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5426 } 5427 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5428 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5429 mul64To128( bSig, zSig1, &term1, &term2 ); 5430 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5431 while ( (int64_t) rem1 < 0 ) { 5432 --zSig1; 5433 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5434 } 5435 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5436 } 5437 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5438 zSign, zExp, zSig0, zSig1, status); 5439 } 5440 5441 /*---------------------------------------------------------------------------- 5442 | Returns the remainder of the extended double-precision floating-point value 5443 | `a' with respect to the corresponding value `b'. The operation is performed 5444 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5445 *----------------------------------------------------------------------------*/ 5446 5447 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5448 { 5449 flag aSign, zSign; 5450 int32_t aExp, bExp, expDiff; 5451 uint64_t aSig0, aSig1, bSig; 5452 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5453 floatx80 z; 5454 5455 aSig0 = extractFloatx80Frac( a ); 5456 aExp = extractFloatx80Exp( a ); 5457 aSign = extractFloatx80Sign( a ); 5458 bSig = extractFloatx80Frac( b ); 5459 bExp = extractFloatx80Exp( b ); 5460 if ( aExp == 0x7FFF ) { 5461 if ( (uint64_t) ( aSig0<<1 ) 5462 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5463 return propagateFloatx80NaN(a, b, status); 5464 } 5465 goto invalid; 5466 } 5467 if ( bExp == 0x7FFF ) { 5468 if ((uint64_t)(bSig << 1)) { 5469 return propagateFloatx80NaN(a, b, status); 5470 } 5471 return a; 5472 } 5473 if ( bExp == 0 ) { 5474 if ( bSig == 0 ) { 5475 invalid: 5476 float_raise(float_flag_invalid, status); 5477 z.low = floatx80_default_nan_low; 5478 z.high = floatx80_default_nan_high; 5479 return z; 5480 } 5481 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5482 } 5483 if ( aExp == 0 ) { 5484 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5485 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5486 } 5487 bSig |= LIT64( 0x8000000000000000 ); 5488 zSign = aSign; 5489 expDiff = aExp - bExp; 5490 aSig1 = 0; 5491 if ( expDiff < 0 ) { 5492 if ( expDiff < -1 ) return a; 5493 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5494 expDiff = 0; 5495 } 5496 q = ( bSig <= aSig0 ); 5497 if ( q ) aSig0 -= bSig; 5498 expDiff -= 64; 5499 while ( 0 < expDiff ) { 5500 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5501 q = ( 2 < q ) ? 
q - 2 : 0; 5502 mul64To128( bSig, q, &term0, &term1 ); 5503 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5504 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5505 expDiff -= 62; 5506 } 5507 expDiff += 64; 5508 if ( 0 < expDiff ) { 5509 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5510 q = ( 2 < q ) ? q - 2 : 0; 5511 q >>= 64 - expDiff; 5512 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5513 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5514 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5515 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5516 ++q; 5517 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5518 } 5519 } 5520 else { 5521 term1 = 0; 5522 term0 = bSig; 5523 } 5524 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5525 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5526 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5527 && ( q & 1 ) ) 5528 ) { 5529 aSig0 = alternateASig0; 5530 aSig1 = alternateASig1; 5531 zSign = ! zSign; 5532 } 5533 return 5534 normalizeRoundAndPackFloatx80( 5535 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5536 5537 } 5538 5539 /*---------------------------------------------------------------------------- 5540 | Returns the square root of the extended double-precision floating-point 5541 | value `a'. The operation is performed according to the IEC/IEEE Standard 5542 | for Binary Floating-Point Arithmetic. 
5543 *----------------------------------------------------------------------------*/ 5544 5545 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5546 { 5547 flag aSign; 5548 int32_t aExp, zExp; 5549 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5550 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5551 floatx80 z; 5552 5553 aSig0 = extractFloatx80Frac( a ); 5554 aExp = extractFloatx80Exp( a ); 5555 aSign = extractFloatx80Sign( a ); 5556 if ( aExp == 0x7FFF ) { 5557 if ((uint64_t)(aSig0 << 1)) { 5558 return propagateFloatx80NaN(a, a, status); 5559 } 5560 if ( ! aSign ) return a; 5561 goto invalid; 5562 } 5563 if ( aSign ) { 5564 if ( ( aExp | aSig0 ) == 0 ) return a; 5565 invalid: 5566 float_raise(float_flag_invalid, status); 5567 z.low = floatx80_default_nan_low; 5568 z.high = floatx80_default_nan_high; 5569 return z; 5570 } 5571 if ( aExp == 0 ) { 5572 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5573 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5574 } 5575 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5576 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5577 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5578 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5579 doubleZSig0 = zSig0<<1; 5580 mul64To128( zSig0, zSig0, &term0, &term1 ); 5581 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5582 while ( (int64_t) rem0 < 0 ) { 5583 --zSig0; 5584 doubleZSig0 -= 2; 5585 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5586 } 5587 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5588 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5589 if ( zSig1 == 0 ) zSig1 = 1; 5590 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5591 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5592 mul64To128( zSig1, zSig1, &term2, &term3 ); 5593 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5594 while ( (int64_t) rem1 < 0 ) { 5595 --zSig1; 5596 shortShift128Left( 0, zSig1, 1, &term2, &term3 
); 5597 term3 |= 1; 5598 term2 |= doubleZSig0; 5599 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5600 } 5601 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5602 } 5603 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5604 zSig0 |= doubleZSig0; 5605 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5606 0, zExp, zSig0, zSig1, status); 5607 } 5608 5609 /*---------------------------------------------------------------------------- 5610 | Returns 1 if the extended double-precision floating-point value `a' is equal 5611 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5612 | raised if either operand is a NaN. Otherwise, the comparison is performed 5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5614 *----------------------------------------------------------------------------*/ 5615 5616 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5617 { 5618 5619 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5620 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5621 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5622 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5623 ) { 5624 float_raise(float_flag_invalid, status); 5625 return 0; 5626 } 5627 return 5628 ( a.low == b.low ) 5629 && ( ( a.high == b.high ) 5630 || ( ( a.low == 0 ) 5631 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5632 ); 5633 5634 } 5635 5636 /*---------------------------------------------------------------------------- 5637 | Returns 1 if the extended double-precision floating-point value `a' is 5638 | less than or equal to the corresponding value `b', and 0 otherwise. The 5639 | invalid exception is raised if either operand is a NaN. The comparison is 5640 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5641 | Arithmetic. 
5642 *----------------------------------------------------------------------------*/ 5643 5644 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5645 { 5646 flag aSign, bSign; 5647 5648 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5649 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5650 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5651 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5652 ) { 5653 float_raise(float_flag_invalid, status); 5654 return 0; 5655 } 5656 aSign = extractFloatx80Sign( a ); 5657 bSign = extractFloatx80Sign( b ); 5658 if ( aSign != bSign ) { 5659 return 5660 aSign 5661 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5662 == 0 ); 5663 } 5664 return 5665 aSign ? le128( b.high, b.low, a.high, a.low ) 5666 : le128( a.high, a.low, b.high, b.low ); 5667 5668 } 5669 5670 /*---------------------------------------------------------------------------- 5671 | Returns 1 if the extended double-precision floating-point value `a' is 5672 | less than the corresponding value `b', and 0 otherwise. The invalid 5673 | exception is raised if either operand is a NaN. The comparison is performed 5674 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5675 *----------------------------------------------------------------------------*/ 5676 5677 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5678 { 5679 flag aSign, bSign; 5680 5681 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5682 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5683 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5684 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5685 ) { 5686 float_raise(float_flag_invalid, status); 5687 return 0; 5688 } 5689 aSign = extractFloatx80Sign( a ); 5690 bSign = extractFloatx80Sign( b ); 5691 if ( aSign != bSign ) { 5692 return 5693 aSign 5694 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5695 != 0 ); 5696 } 5697 return 5698 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 5699 : lt128( a.high, a.low, b.high, b.low ); 5700 5701 } 5702 5703 /*---------------------------------------------------------------------------- 5704 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5705 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5706 | either operand is a NaN. The comparison is performed according to the 5707 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5708 *----------------------------------------------------------------------------*/ 5709 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5710 { 5711 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5712 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5713 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5714 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5715 ) { 5716 float_raise(float_flag_invalid, status); 5717 return 1; 5718 } 5719 return 0; 5720 } 5721 5722 /*---------------------------------------------------------------------------- 5723 | Returns 1 if the extended double-precision floating-point value `a' is 5724 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5725 | cause an exception. The comparison is performed according to the IEC/IEEE 5726 | Standard for Binary Floating-Point Arithmetic. 
5727 *----------------------------------------------------------------------------*/ 5728 5729 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5730 { 5731 5732 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5733 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5734 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5735 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5736 ) { 5737 if ( floatx80_is_signaling_nan( a ) 5738 || floatx80_is_signaling_nan( b ) ) { 5739 float_raise(float_flag_invalid, status); 5740 } 5741 return 0; 5742 } 5743 return 5744 ( a.low == b.low ) 5745 && ( ( a.high == b.high ) 5746 || ( ( a.low == 0 ) 5747 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5748 ); 5749 5750 } 5751 5752 /*---------------------------------------------------------------------------- 5753 | Returns 1 if the extended double-precision floating-point value `a' is less 5754 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5755 | do not cause an exception. Otherwise, the comparison is performed according 5756 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5757 *----------------------------------------------------------------------------*/ 5758 5759 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5760 { 5761 flag aSign, bSign; 5762 5763 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5764 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5765 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5766 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5767 ) { 5768 if ( floatx80_is_signaling_nan( a ) 5769 || floatx80_is_signaling_nan( b ) ) { 5770 float_raise(float_flag_invalid, status); 5771 } 5772 return 0; 5773 } 5774 aSign = extractFloatx80Sign( a ); 5775 bSign = extractFloatx80Sign( b ); 5776 if ( aSign != bSign ) { 5777 return 5778 aSign 5779 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5780 == 0 ); 5781 } 5782 return 5783 aSign ? 
le128( b.high, b.low, a.high, a.low ) 5784 : le128( a.high, a.low, b.high, b.low ); 5785 5786 } 5787 5788 /*---------------------------------------------------------------------------- 5789 | Returns 1 if the extended double-precision floating-point value `a' is less 5790 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5791 | an exception. Otherwise, the comparison is performed according to the 5792 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5793 *----------------------------------------------------------------------------*/ 5794 5795 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5796 { 5797 flag aSign, bSign; 5798 5799 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5800 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5801 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5802 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5803 ) { 5804 if ( floatx80_is_signaling_nan( a ) 5805 || floatx80_is_signaling_nan( b ) ) { 5806 float_raise(float_flag_invalid, status); 5807 } 5808 return 0; 5809 } 5810 aSign = extractFloatx80Sign( a ); 5811 bSign = extractFloatx80Sign( b ); 5812 if ( aSign != bSign ) { 5813 return 5814 aSign 5815 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5816 != 0 ); 5817 } 5818 return 5819 aSign ? lt128( b.high, b.low, a.high, a.low ) 5820 : lt128( a.high, a.low, b.high, b.low ); 5821 5822 } 5823 5824 /*---------------------------------------------------------------------------- 5825 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5826 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5827 | The comparison is performed according to the IEC/IEEE Standard for Binary 5828 | Floating-Point Arithmetic. 
5829 *----------------------------------------------------------------------------*/ 5830 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5831 { 5832 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5833 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5834 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5835 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5836 ) { 5837 if ( floatx80_is_signaling_nan( a ) 5838 || floatx80_is_signaling_nan( b ) ) { 5839 float_raise(float_flag_invalid, status); 5840 } 5841 return 1; 5842 } 5843 return 0; 5844 } 5845 5846 /*---------------------------------------------------------------------------- 5847 | Returns the result of converting the quadruple-precision floating-point 5848 | value `a' to the 32-bit two's complement integer format. The conversion 5849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5850 | Arithmetic---which means in particular that the conversion is rounded 5851 | according to the current rounding mode. If `a' is a NaN, the largest 5852 | positive integer is returned. Otherwise, if the conversion overflows, the 5853 | largest integer with the same sign as `a' is returned. 
5854 *----------------------------------------------------------------------------*/ 5855 5856 int32_t float128_to_int32(float128 a, float_status *status) 5857 { 5858 flag aSign; 5859 int32_t aExp, shiftCount; 5860 uint64_t aSig0, aSig1; 5861 5862 aSig1 = extractFloat128Frac1( a ); 5863 aSig0 = extractFloat128Frac0( a ); 5864 aExp = extractFloat128Exp( a ); 5865 aSign = extractFloat128Sign( a ); 5866 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5867 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5868 aSig0 |= ( aSig1 != 0 ); 5869 shiftCount = 0x4028 - aExp; 5870 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5871 return roundAndPackInt32(aSign, aSig0, status); 5872 5873 } 5874 5875 /*---------------------------------------------------------------------------- 5876 | Returns the result of converting the quadruple-precision floating-point 5877 | value `a' to the 32-bit two's complement integer format. The conversion 5878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5879 | Arithmetic, except that the conversion is always rounded toward zero. If 5880 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5881 | conversion overflows, the largest integer with the same sign as `a' is 5882 | returned. 
5883 *----------------------------------------------------------------------------*/ 5884 5885 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5886 { 5887 flag aSign; 5888 int32_t aExp, shiftCount; 5889 uint64_t aSig0, aSig1, savedASig; 5890 int32_t z; 5891 5892 aSig1 = extractFloat128Frac1( a ); 5893 aSig0 = extractFloat128Frac0( a ); 5894 aExp = extractFloat128Exp( a ); 5895 aSign = extractFloat128Sign( a ); 5896 aSig0 |= ( aSig1 != 0 ); 5897 if ( 0x401E < aExp ) { 5898 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5899 goto invalid; 5900 } 5901 else if ( aExp < 0x3FFF ) { 5902 if (aExp || aSig0) { 5903 status->float_exception_flags |= float_flag_inexact; 5904 } 5905 return 0; 5906 } 5907 aSig0 |= LIT64( 0x0001000000000000 ); 5908 shiftCount = 0x402F - aExp; 5909 savedASig = aSig0; 5910 aSig0 >>= shiftCount; 5911 z = aSig0; 5912 if ( aSign ) z = - z; 5913 if ( ( z < 0 ) ^ aSign ) { 5914 invalid: 5915 float_raise(float_flag_invalid, status); 5916 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5917 } 5918 if ( ( aSig0<<shiftCount ) != savedASig ) { 5919 status->float_exception_flags |= float_flag_inexact; 5920 } 5921 return z; 5922 5923 } 5924 5925 /*---------------------------------------------------------------------------- 5926 | Returns the result of converting the quadruple-precision floating-point 5927 | value `a' to the 64-bit two's complement integer format. The conversion 5928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5929 | Arithmetic---which means in particular that the conversion is rounded 5930 | according to the current rounding mode. If `a' is a NaN, the largest 5931 | positive integer is returned. Otherwise, if the conversion overflows, the 5932 | largest integer with the same sign as `a' is returned. 
5933 *----------------------------------------------------------------------------*/ 5934 5935 int64_t float128_to_int64(float128 a, float_status *status) 5936 { 5937 flag aSign; 5938 int32_t aExp, shiftCount; 5939 uint64_t aSig0, aSig1; 5940 5941 aSig1 = extractFloat128Frac1( a ); 5942 aSig0 = extractFloat128Frac0( a ); 5943 aExp = extractFloat128Exp( a ); 5944 aSign = extractFloat128Sign( a ); 5945 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5946 shiftCount = 0x402F - aExp; 5947 if ( shiftCount <= 0 ) { 5948 if ( 0x403E < aExp ) { 5949 float_raise(float_flag_invalid, status); 5950 if ( ! aSign 5951 || ( ( aExp == 0x7FFF ) 5952 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5953 ) 5954 ) { 5955 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5956 } 5957 return (int64_t) LIT64( 0x8000000000000000 ); 5958 } 5959 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5960 } 5961 else { 5962 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5963 } 5964 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5965 5966 } 5967 5968 /*---------------------------------------------------------------------------- 5969 | Returns the result of converting the quadruple-precision floating-point 5970 | value `a' to the 64-bit two's complement integer format. The conversion 5971 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5972 | Arithmetic, except that the conversion is always rounded toward zero. 5973 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5974 | the conversion overflows, the largest integer with the same sign as `a' is 5975 | returned. 
5976 *----------------------------------------------------------------------------*/ 5977 5978 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5979 { 5980 flag aSign; 5981 int32_t aExp, shiftCount; 5982 uint64_t aSig0, aSig1; 5983 int64_t z; 5984 5985 aSig1 = extractFloat128Frac1( a ); 5986 aSig0 = extractFloat128Frac0( a ); 5987 aExp = extractFloat128Exp( a ); 5988 aSign = extractFloat128Sign( a ); 5989 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5990 shiftCount = aExp - 0x402F; 5991 if ( 0 < shiftCount ) { 5992 if ( 0x403E <= aExp ) { 5993 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5994 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5995 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5996 if (aSig1) { 5997 status->float_exception_flags |= float_flag_inexact; 5998 } 5999 } 6000 else { 6001 float_raise(float_flag_invalid, status); 6002 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6003 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6004 } 6005 } 6006 return (int64_t) LIT64( 0x8000000000000000 ); 6007 } 6008 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6009 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6010 status->float_exception_flags |= float_flag_inexact; 6011 } 6012 } 6013 else { 6014 if ( aExp < 0x3FFF ) { 6015 if ( aExp | aSig0 | aSig1 ) { 6016 status->float_exception_flags |= float_flag_inexact; 6017 } 6018 return 0; 6019 } 6020 z = aSig0>>( - shiftCount ); 6021 if ( aSig1 6022 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6023 status->float_exception_flags |= float_flag_inexact; 6024 } 6025 } 6026 if ( aSign ) z = - z; 6027 return z; 6028 6029 } 6030 6031 /*---------------------------------------------------------------------------- 6032 | Returns the result of converting the quadruple-precision floating-point 6033 | value `a' to the single-precision floating-point format. The conversion 6034 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6035 | Arithmetic. 
6036 *----------------------------------------------------------------------------*/ 6037 6038 float32 float128_to_float32(float128 a, float_status *status) 6039 { 6040 flag aSign; 6041 int32_t aExp; 6042 uint64_t aSig0, aSig1; 6043 uint32_t zSig; 6044 6045 aSig1 = extractFloat128Frac1( a ); 6046 aSig0 = extractFloat128Frac0( a ); 6047 aExp = extractFloat128Exp( a ); 6048 aSign = extractFloat128Sign( a ); 6049 if ( aExp == 0x7FFF ) { 6050 if ( aSig0 | aSig1 ) { 6051 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6052 } 6053 return packFloat32( aSign, 0xFF, 0 ); 6054 } 6055 aSig0 |= ( aSig1 != 0 ); 6056 shift64RightJamming( aSig0, 18, &aSig0 ); 6057 zSig = aSig0; 6058 if ( aExp || zSig ) { 6059 zSig |= 0x40000000; 6060 aExp -= 0x3F81; 6061 } 6062 return roundAndPackFloat32(aSign, aExp, zSig, status); 6063 6064 } 6065 6066 /*---------------------------------------------------------------------------- 6067 | Returns the result of converting the quadruple-precision floating-point 6068 | value `a' to the double-precision floating-point format. The conversion 6069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6070 | Arithmetic. 
6071 *----------------------------------------------------------------------------*/ 6072 6073 float64 float128_to_float64(float128 a, float_status *status) 6074 { 6075 flag aSign; 6076 int32_t aExp; 6077 uint64_t aSig0, aSig1; 6078 6079 aSig1 = extractFloat128Frac1( a ); 6080 aSig0 = extractFloat128Frac0( a ); 6081 aExp = extractFloat128Exp( a ); 6082 aSign = extractFloat128Sign( a ); 6083 if ( aExp == 0x7FFF ) { 6084 if ( aSig0 | aSig1 ) { 6085 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6086 } 6087 return packFloat64( aSign, 0x7FF, 0 ); 6088 } 6089 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6090 aSig0 |= ( aSig1 != 0 ); 6091 if ( aExp || aSig0 ) { 6092 aSig0 |= LIT64( 0x4000000000000000 ); 6093 aExp -= 0x3C01; 6094 } 6095 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6096 6097 } 6098 6099 /*---------------------------------------------------------------------------- 6100 | Returns the result of converting the quadruple-precision floating-point 6101 | value `a' to the extended double-precision floating-point format. The 6102 | conversion is performed according to the IEC/IEEE Standard for Binary 6103 | Floating-Point Arithmetic. 
6104 *----------------------------------------------------------------------------*/ 6105 6106 floatx80 float128_to_floatx80(float128 a, float_status *status) 6107 { 6108 flag aSign; 6109 int32_t aExp; 6110 uint64_t aSig0, aSig1; 6111 6112 aSig1 = extractFloat128Frac1( a ); 6113 aSig0 = extractFloat128Frac0( a ); 6114 aExp = extractFloat128Exp( a ); 6115 aSign = extractFloat128Sign( a ); 6116 if ( aExp == 0x7FFF ) { 6117 if ( aSig0 | aSig1 ) { 6118 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6119 } 6120 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6121 } 6122 if ( aExp == 0 ) { 6123 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6124 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6125 } 6126 else { 6127 aSig0 |= LIT64( 0x0001000000000000 ); 6128 } 6129 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6130 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6131 6132 } 6133 6134 /*---------------------------------------------------------------------------- 6135 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6136 | returns the result as a quadruple-precision floating-point value. The 6137 | operation is performed according to the IEC/IEEE Standard for Binary 6138 | Floating-Point Arithmetic. 
6139 *----------------------------------------------------------------------------*/ 6140 6141 float128 float128_round_to_int(float128 a, float_status *status) 6142 { 6143 flag aSign; 6144 int32_t aExp; 6145 uint64_t lastBitMask, roundBitsMask; 6146 float128 z; 6147 6148 aExp = extractFloat128Exp( a ); 6149 if ( 0x402F <= aExp ) { 6150 if ( 0x406F <= aExp ) { 6151 if ( ( aExp == 0x7FFF ) 6152 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6153 ) { 6154 return propagateFloat128NaN(a, a, status); 6155 } 6156 return a; 6157 } 6158 lastBitMask = 1; 6159 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6160 roundBitsMask = lastBitMask - 1; 6161 z = a; 6162 switch (status->float_rounding_mode) { 6163 case float_round_nearest_even: 6164 if ( lastBitMask ) { 6165 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6166 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6167 } 6168 else { 6169 if ( (int64_t) z.low < 0 ) { 6170 ++z.high; 6171 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6172 } 6173 } 6174 break; 6175 case float_round_ties_away: 6176 if (lastBitMask) { 6177 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6178 } else { 6179 if ((int64_t) z.low < 0) { 6180 ++z.high; 6181 } 6182 } 6183 break; 6184 case float_round_to_zero: 6185 break; 6186 case float_round_up: 6187 if (!extractFloat128Sign(z)) { 6188 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6189 } 6190 break; 6191 case float_round_down: 6192 if (extractFloat128Sign(z)) { 6193 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6194 } 6195 break; 6196 default: 6197 abort(); 6198 } 6199 z.low &= ~ roundBitsMask; 6200 } 6201 else { 6202 if ( aExp < 0x3FFF ) { 6203 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6204 status->float_exception_flags |= float_flag_inexact; 6205 aSign = extractFloat128Sign( a ); 6206 switch (status->float_rounding_mode) { 6207 case float_round_nearest_even: 6208 if ( ( aExp == 0x3FFE ) 6209 && ( 
extractFloat128Frac0( a ) 6210 | extractFloat128Frac1( a ) ) 6211 ) { 6212 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6213 } 6214 break; 6215 case float_round_ties_away: 6216 if (aExp == 0x3FFE) { 6217 return packFloat128(aSign, 0x3FFF, 0, 0); 6218 } 6219 break; 6220 case float_round_down: 6221 return 6222 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6223 : packFloat128( 0, 0, 0, 0 ); 6224 case float_round_up: 6225 return 6226 aSign ? packFloat128( 1, 0, 0, 0 ) 6227 : packFloat128( 0, 0x3FFF, 0, 0 ); 6228 } 6229 return packFloat128( aSign, 0, 0, 0 ); 6230 } 6231 lastBitMask = 1; 6232 lastBitMask <<= 0x402F - aExp; 6233 roundBitsMask = lastBitMask - 1; 6234 z.low = 0; 6235 z.high = a.high; 6236 switch (status->float_rounding_mode) { 6237 case float_round_nearest_even: 6238 z.high += lastBitMask>>1; 6239 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6240 z.high &= ~ lastBitMask; 6241 } 6242 break; 6243 case float_round_ties_away: 6244 z.high += lastBitMask>>1; 6245 break; 6246 case float_round_to_zero: 6247 break; 6248 case float_round_up: 6249 if (!extractFloat128Sign(z)) { 6250 z.high |= ( a.low != 0 ); 6251 z.high += roundBitsMask; 6252 } 6253 break; 6254 case float_round_down: 6255 if (extractFloat128Sign(z)) { 6256 z.high |= (a.low != 0); 6257 z.high += roundBitsMask; 6258 } 6259 break; 6260 default: 6261 abort(); 6262 } 6263 z.high &= ~ roundBitsMask; 6264 } 6265 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6266 status->float_exception_flags |= float_flag_inexact; 6267 } 6268 return z; 6269 6270 } 6271 6272 /*---------------------------------------------------------------------------- 6273 | Returns the result of adding the absolute values of the quadruple-precision 6274 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6275 | before being returned. `zSign' is ignored if the result is a NaN. 6276 | The addition is performed according to the IEC/IEEE Standard for Binary 6277 | Floating-Point Arithmetic. 
6278 *----------------------------------------------------------------------------*/ 6279 6280 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6281 float_status *status) 6282 { 6283 int32_t aExp, bExp, zExp; 6284 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6285 int32_t expDiff; 6286 6287 aSig1 = extractFloat128Frac1( a ); 6288 aSig0 = extractFloat128Frac0( a ); 6289 aExp = extractFloat128Exp( a ); 6290 bSig1 = extractFloat128Frac1( b ); 6291 bSig0 = extractFloat128Frac0( b ); 6292 bExp = extractFloat128Exp( b ); 6293 expDiff = aExp - bExp; 6294 if ( 0 < expDiff ) { 6295 if ( aExp == 0x7FFF ) { 6296 if (aSig0 | aSig1) { 6297 return propagateFloat128NaN(a, b, status); 6298 } 6299 return a; 6300 } 6301 if ( bExp == 0 ) { 6302 --expDiff; 6303 } 6304 else { 6305 bSig0 |= LIT64( 0x0001000000000000 ); 6306 } 6307 shift128ExtraRightJamming( 6308 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6309 zExp = aExp; 6310 } 6311 else if ( expDiff < 0 ) { 6312 if ( bExp == 0x7FFF ) { 6313 if (bSig0 | bSig1) { 6314 return propagateFloat128NaN(a, b, status); 6315 } 6316 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6317 } 6318 if ( aExp == 0 ) { 6319 ++expDiff; 6320 } 6321 else { 6322 aSig0 |= LIT64( 0x0001000000000000 ); 6323 } 6324 shift128ExtraRightJamming( 6325 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6326 zExp = bExp; 6327 } 6328 else { 6329 if ( aExp == 0x7FFF ) { 6330 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6331 return propagateFloat128NaN(a, b, status); 6332 } 6333 return a; 6334 } 6335 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6336 if ( aExp == 0 ) { 6337 if (status->flush_to_zero) { 6338 if (zSig0 | zSig1) { 6339 float_raise(float_flag_output_denormal, status); 6340 } 6341 return packFloat128(zSign, 0, 0, 0); 6342 } 6343 return packFloat128( zSign, 0, zSig0, zSig1 ); 6344 } 6345 zSig2 = 0; 6346 zSig0 |= LIT64( 0x0002000000000000 ); 6347 zExp = aExp; 6348 goto shiftRight1; 6349 } 6350 aSig0 |= LIT64( 
0x0001000000000000 ); 6351 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6352 --zExp; 6353 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6354 ++zExp; 6355 shiftRight1: 6356 shift128ExtraRightJamming( 6357 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6358 roundAndPack: 6359 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6360 6361 } 6362 6363 /*---------------------------------------------------------------------------- 6364 | Returns the result of subtracting the absolute values of the quadruple- 6365 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6366 | difference is negated before being returned. `zSign' is ignored if the 6367 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6368 | Standard for Binary Floating-Point Arithmetic. 6369 *----------------------------------------------------------------------------*/ 6370 6371 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6372 float_status *status) 6373 { 6374 int32_t aExp, bExp, zExp; 6375 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6376 int32_t expDiff; 6377 float128 z; 6378 6379 aSig1 = extractFloat128Frac1( a ); 6380 aSig0 = extractFloat128Frac0( a ); 6381 aExp = extractFloat128Exp( a ); 6382 bSig1 = extractFloat128Frac1( b ); 6383 bSig0 = extractFloat128Frac0( b ); 6384 bExp = extractFloat128Exp( b ); 6385 expDiff = aExp - bExp; 6386 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6387 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6388 if ( 0 < expDiff ) goto aExpBigger; 6389 if ( expDiff < 0 ) goto bExpBigger; 6390 if ( aExp == 0x7FFF ) { 6391 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6392 return propagateFloat128NaN(a, b, status); 6393 } 6394 float_raise(float_flag_invalid, status); 6395 z.low = float128_default_nan_low; 6396 z.high = float128_default_nan_high; 6397 return z; 6398 } 6399 if ( aExp == 0 ) { 6400 aExp = 1; 6401 bExp = 1; 6402 } 6403 if ( bSig0 < aSig0 ) goto 
aBigger; 6404 if ( aSig0 < bSig0 ) goto bBigger; 6405 if ( bSig1 < aSig1 ) goto aBigger; 6406 if ( aSig1 < bSig1 ) goto bBigger; 6407 return packFloat128(status->float_rounding_mode == float_round_down, 6408 0, 0, 0); 6409 bExpBigger: 6410 if ( bExp == 0x7FFF ) { 6411 if (bSig0 | bSig1) { 6412 return propagateFloat128NaN(a, b, status); 6413 } 6414 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6415 } 6416 if ( aExp == 0 ) { 6417 ++expDiff; 6418 } 6419 else { 6420 aSig0 |= LIT64( 0x4000000000000000 ); 6421 } 6422 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6423 bSig0 |= LIT64( 0x4000000000000000 ); 6424 bBigger: 6425 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6426 zExp = bExp; 6427 zSign ^= 1; 6428 goto normalizeRoundAndPack; 6429 aExpBigger: 6430 if ( aExp == 0x7FFF ) { 6431 if (aSig0 | aSig1) { 6432 return propagateFloat128NaN(a, b, status); 6433 } 6434 return a; 6435 } 6436 if ( bExp == 0 ) { 6437 --expDiff; 6438 } 6439 else { 6440 bSig0 |= LIT64( 0x4000000000000000 ); 6441 } 6442 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6443 aSig0 |= LIT64( 0x4000000000000000 ); 6444 aBigger: 6445 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6446 zExp = aExp; 6447 normalizeRoundAndPack: 6448 --zExp; 6449 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6450 status); 6451 6452 } 6453 6454 /*---------------------------------------------------------------------------- 6455 | Returns the result of adding the quadruple-precision floating-point values 6456 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6457 | for Binary Floating-Point Arithmetic. 
6458 *----------------------------------------------------------------------------*/ 6459 6460 float128 float128_add(float128 a, float128 b, float_status *status) 6461 { 6462 flag aSign, bSign; 6463 6464 aSign = extractFloat128Sign( a ); 6465 bSign = extractFloat128Sign( b ); 6466 if ( aSign == bSign ) { 6467 return addFloat128Sigs(a, b, aSign, status); 6468 } 6469 else { 6470 return subFloat128Sigs(a, b, aSign, status); 6471 } 6472 6473 } 6474 6475 /*---------------------------------------------------------------------------- 6476 | Returns the result of subtracting the quadruple-precision floating-point 6477 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6478 | Standard for Binary Floating-Point Arithmetic. 6479 *----------------------------------------------------------------------------*/ 6480 6481 float128 float128_sub(float128 a, float128 b, float_status *status) 6482 { 6483 flag aSign, bSign; 6484 6485 aSign = extractFloat128Sign( a ); 6486 bSign = extractFloat128Sign( b ); 6487 if ( aSign == bSign ) { 6488 return subFloat128Sigs(a, b, aSign, status); 6489 } 6490 else { 6491 return addFloat128Sigs(a, b, aSign, status); 6492 } 6493 6494 } 6495 6496 /*---------------------------------------------------------------------------- 6497 | Returns the result of multiplying the quadruple-precision floating-point 6498 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6499 | Standard for Binary Floating-Point Arithmetic. 
6500 *----------------------------------------------------------------------------*/ 6501 6502 float128 float128_mul(float128 a, float128 b, float_status *status) 6503 { 6504 flag aSign, bSign, zSign; 6505 int32_t aExp, bExp, zExp; 6506 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6507 float128 z; 6508 6509 aSig1 = extractFloat128Frac1( a ); 6510 aSig0 = extractFloat128Frac0( a ); 6511 aExp = extractFloat128Exp( a ); 6512 aSign = extractFloat128Sign( a ); 6513 bSig1 = extractFloat128Frac1( b ); 6514 bSig0 = extractFloat128Frac0( b ); 6515 bExp = extractFloat128Exp( b ); 6516 bSign = extractFloat128Sign( b ); 6517 zSign = aSign ^ bSign; 6518 if ( aExp == 0x7FFF ) { 6519 if ( ( aSig0 | aSig1 ) 6520 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6521 return propagateFloat128NaN(a, b, status); 6522 } 6523 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6524 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6525 } 6526 if ( bExp == 0x7FFF ) { 6527 if (bSig0 | bSig1) { 6528 return propagateFloat128NaN(a, b, status); 6529 } 6530 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6531 invalid: 6532 float_raise(float_flag_invalid, status); 6533 z.low = float128_default_nan_low; 6534 z.high = float128_default_nan_high; 6535 return z; 6536 } 6537 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6538 } 6539 if ( aExp == 0 ) { 6540 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6541 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6542 } 6543 if ( bExp == 0 ) { 6544 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6545 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6546 } 6547 zExp = aExp + bExp - 0x4000; 6548 aSig0 |= LIT64( 0x0001000000000000 ); 6549 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6550 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6551 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6552 zSig2 |= ( zSig3 != 0 ); 6553 if ( LIT64( 
0x0002000000000000 ) <= zSig0 ) { 6554 shift128ExtraRightJamming( 6555 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6556 ++zExp; 6557 } 6558 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6559 6560 } 6561 6562 /*---------------------------------------------------------------------------- 6563 | Returns the result of dividing the quadruple-precision floating-point value 6564 | `a' by the corresponding value `b'. The operation is performed according to 6565 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6566 *----------------------------------------------------------------------------*/ 6567 6568 float128 float128_div(float128 a, float128 b, float_status *status) 6569 { 6570 flag aSign, bSign, zSign; 6571 int32_t aExp, bExp, zExp; 6572 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6573 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6574 float128 z; 6575 6576 aSig1 = extractFloat128Frac1( a ); 6577 aSig0 = extractFloat128Frac0( a ); 6578 aExp = extractFloat128Exp( a ); 6579 aSign = extractFloat128Sign( a ); 6580 bSig1 = extractFloat128Frac1( b ); 6581 bSig0 = extractFloat128Frac0( b ); 6582 bExp = extractFloat128Exp( b ); 6583 bSign = extractFloat128Sign( b ); 6584 zSign = aSign ^ bSign; 6585 if ( aExp == 0x7FFF ) { 6586 if (aSig0 | aSig1) { 6587 return propagateFloat128NaN(a, b, status); 6588 } 6589 if ( bExp == 0x7FFF ) { 6590 if (bSig0 | bSig1) { 6591 return propagateFloat128NaN(a, b, status); 6592 } 6593 goto invalid; 6594 } 6595 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6596 } 6597 if ( bExp == 0x7FFF ) { 6598 if (bSig0 | bSig1) { 6599 return propagateFloat128NaN(a, b, status); 6600 } 6601 return packFloat128( zSign, 0, 0, 0 ); 6602 } 6603 if ( bExp == 0 ) { 6604 if ( ( bSig0 | bSig1 ) == 0 ) { 6605 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6606 invalid: 6607 float_raise(float_flag_invalid, status); 6608 z.low = float128_default_nan_low; 6609 z.high = float128_default_nan_high; 6610 return z; 
6611 } 6612 float_raise(float_flag_divbyzero, status); 6613 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6614 } 6615 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6616 } 6617 if ( aExp == 0 ) { 6618 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6619 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6620 } 6621 zExp = aExp - bExp + 0x3FFD; 6622 shortShift128Left( 6623 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6624 shortShift128Left( 6625 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6626 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6627 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6628 ++zExp; 6629 } 6630 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6631 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6632 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6633 while ( (int64_t) rem0 < 0 ) { 6634 --zSig0; 6635 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6636 } 6637 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6638 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6639 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6640 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6641 while ( (int64_t) rem1 < 0 ) { 6642 --zSig1; 6643 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6644 } 6645 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6646 } 6647 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6648 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6649 6650 } 6651 6652 /*---------------------------------------------------------------------------- 6653 | Returns the remainder of the quadruple-precision floating-point value `a' 6654 | with respect to the corresponding value `b'. The operation is performed 6655 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6656 *----------------------------------------------------------------------------*/ 6657 6658 float128 float128_rem(float128 a, float128 b, float_status *status) 6659 { 6660 flag aSign, zSign; 6661 int32_t aExp, bExp, expDiff; 6662 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6663 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6664 int64_t sigMean0; 6665 float128 z; 6666 6667 aSig1 = extractFloat128Frac1( a ); 6668 aSig0 = extractFloat128Frac0( a ); 6669 aExp = extractFloat128Exp( a ); 6670 aSign = extractFloat128Sign( a ); 6671 bSig1 = extractFloat128Frac1( b ); 6672 bSig0 = extractFloat128Frac0( b ); 6673 bExp = extractFloat128Exp( b ); 6674 if ( aExp == 0x7FFF ) { 6675 if ( ( aSig0 | aSig1 ) 6676 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6677 return propagateFloat128NaN(a, b, status); 6678 } 6679 goto invalid; 6680 } 6681 if ( bExp == 0x7FFF ) { 6682 if (bSig0 | bSig1) { 6683 return propagateFloat128NaN(a, b, status); 6684 } 6685 return a; 6686 } 6687 if ( bExp == 0 ) { 6688 if ( ( bSig0 | bSig1 ) == 0 ) { 6689 invalid: 6690 float_raise(float_flag_invalid, status); 6691 z.low = float128_default_nan_low; 6692 z.high = float128_default_nan_high; 6693 return z; 6694 } 6695 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6696 } 6697 if ( aExp == 0 ) { 6698 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6699 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6700 } 6701 expDiff = aExp - bExp; 6702 if ( expDiff < -1 ) return a; 6703 shortShift128Left( 6704 aSig0 | LIT64( 0x0001000000000000 ), 6705 aSig1, 6706 15 - ( expDiff < 0 ), 6707 &aSig0, 6708 &aSig1 6709 ); 6710 shortShift128Left( 6711 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6712 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6713 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6714 expDiff -= 64; 6715 while ( 0 < expDiff ) { 6716 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6717 q = ( 4 < q ) ? 
q - 4 : 0; 6718 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6719 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6720 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6721 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6722 expDiff -= 61; 6723 } 6724 if ( -64 < expDiff ) { 6725 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6726 q = ( 4 < q ) ? q - 4 : 0; 6727 q >>= - expDiff; 6728 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6729 expDiff += 52; 6730 if ( expDiff < 0 ) { 6731 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6732 } 6733 else { 6734 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6735 } 6736 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6737 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6738 } 6739 else { 6740 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6741 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6742 } 6743 do { 6744 alternateASig0 = aSig0; 6745 alternateASig1 = aSig1; 6746 ++q; 6747 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6748 } while ( 0 <= (int64_t) aSig0 ); 6749 add128( 6750 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6751 if ( ( sigMean0 < 0 ) 6752 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6753 aSig0 = alternateASig0; 6754 aSig1 = alternateASig1; 6755 } 6756 zSign = ( (int64_t) aSig0 < 0 ); 6757 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6758 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6759 status); 6760 } 6761 6762 /*---------------------------------------------------------------------------- 6763 | Returns the square root of the quadruple-precision floating-point value `a'. 6764 | The operation is performed according to the IEC/IEEE Standard for Binary 6765 | Floating-Point Arithmetic. 
6766 *----------------------------------------------------------------------------*/ 6767 6768 float128 float128_sqrt(float128 a, float_status *status) 6769 { 6770 flag aSign; 6771 int32_t aExp, zExp; 6772 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6773 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6774 float128 z; 6775 6776 aSig1 = extractFloat128Frac1( a ); 6777 aSig0 = extractFloat128Frac0( a ); 6778 aExp = extractFloat128Exp( a ); 6779 aSign = extractFloat128Sign( a ); 6780 if ( aExp == 0x7FFF ) { 6781 if (aSig0 | aSig1) { 6782 return propagateFloat128NaN(a, a, status); 6783 } 6784 if ( ! aSign ) return a; 6785 goto invalid; 6786 } 6787 if ( aSign ) { 6788 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6789 invalid: 6790 float_raise(float_flag_invalid, status); 6791 z.low = float128_default_nan_low; 6792 z.high = float128_default_nan_high; 6793 return z; 6794 } 6795 if ( aExp == 0 ) { 6796 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6797 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6798 } 6799 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6800 aSig0 |= LIT64( 0x0001000000000000 ); 6801 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6802 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6803 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6804 doubleZSig0 = zSig0<<1; 6805 mul64To128( zSig0, zSig0, &term0, &term1 ); 6806 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6807 while ( (int64_t) rem0 < 0 ) { 6808 --zSig0; 6809 doubleZSig0 -= 2; 6810 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6811 } 6812 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6813 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6814 if ( zSig1 == 0 ) zSig1 = 1; 6815 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6816 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6817 mul64To128( zSig1, zSig1, &term2, &term3 ); 6818 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, 
&rem3 ); 6819 while ( (int64_t) rem1 < 0 ) { 6820 --zSig1; 6821 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6822 term3 |= 1; 6823 term2 |= doubleZSig0; 6824 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6825 } 6826 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6827 } 6828 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6829 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6830 6831 } 6832 6833 /*---------------------------------------------------------------------------- 6834 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6835 | the corresponding value `b', and 0 otherwise. The invalid exception is 6836 | raised if either operand is a NaN. Otherwise, the comparison is performed 6837 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6838 *----------------------------------------------------------------------------*/ 6839 6840 int float128_eq(float128 a, float128 b, float_status *status) 6841 { 6842 6843 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6844 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6845 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6846 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6847 ) { 6848 float_raise(float_flag_invalid, status); 6849 return 0; 6850 } 6851 return 6852 ( a.low == b.low ) 6853 && ( ( a.high == b.high ) 6854 || ( ( a.low == 0 ) 6855 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6856 ); 6857 6858 } 6859 6860 /*---------------------------------------------------------------------------- 6861 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6862 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6863 | exception is raised if either operand is a NaN. The comparison is performed 6864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6865 *----------------------------------------------------------------------------*/ 6866 6867 int float128_le(float128 a, float128 b, float_status *status) 6868 { 6869 flag aSign, bSign; 6870 6871 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6872 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6873 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6874 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6875 ) { 6876 float_raise(float_flag_invalid, status); 6877 return 0; 6878 } 6879 aSign = extractFloat128Sign( a ); 6880 bSign = extractFloat128Sign( b ); 6881 if ( aSign != bSign ) { 6882 return 6883 aSign 6884 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6885 == 0 ); 6886 } 6887 return 6888 aSign ? le128( b.high, b.low, a.high, a.low ) 6889 : le128( a.high, a.low, b.high, b.low ); 6890 6891 } 6892 6893 /*---------------------------------------------------------------------------- 6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6895 | the corresponding value `b', and 0 otherwise. The invalid exception is 6896 | raised if either operand is a NaN. The comparison is performed according 6897 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6898 *----------------------------------------------------------------------------*/ 6899 6900 int float128_lt(float128 a, float128 b, float_status *status) 6901 { 6902 flag aSign, bSign; 6903 6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6906 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6908 ) { 6909 float_raise(float_flag_invalid, status); 6910 return 0; 6911 } 6912 aSign = extractFloat128Sign( a ); 6913 bSign = extractFloat128Sign( b ); 6914 if ( aSign != bSign ) { 6915 return 6916 aSign 6917 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6918 != 0 ); 6919 } 6920 return 6921 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6922 : lt128( a.high, a.low, b.high, b.low ); 6923 6924 } 6925 6926 /*---------------------------------------------------------------------------- 6927 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6928 | be compared, and 0 otherwise. The invalid exception is raised if either 6929 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6930 | Standard for Binary Floating-Point Arithmetic. 6931 *----------------------------------------------------------------------------*/ 6932 6933 int float128_unordered(float128 a, float128 b, float_status *status) 6934 { 6935 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6936 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6937 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6938 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6939 ) { 6940 float_raise(float_flag_invalid, status); 6941 return 1; 6942 } 6943 return 0; 6944 } 6945 6946 /*---------------------------------------------------------------------------- 6947 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6948 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6949 | exception. The comparison is performed according to the IEC/IEEE Standard 6950 | for Binary Floating-Point Arithmetic. 
6951 *----------------------------------------------------------------------------*/ 6952 6953 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6954 { 6955 6956 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6957 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6958 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6959 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6960 ) { 6961 if ( float128_is_signaling_nan( a ) 6962 || float128_is_signaling_nan( b ) ) { 6963 float_raise(float_flag_invalid, status); 6964 } 6965 return 0; 6966 } 6967 return 6968 ( a.low == b.low ) 6969 && ( ( a.high == b.high ) 6970 || ( ( a.low == 0 ) 6971 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6972 ); 6973 6974 } 6975 6976 /*---------------------------------------------------------------------------- 6977 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6978 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6979 | cause an exception. Otherwise, the comparison is performed according to the 6980 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6981 *----------------------------------------------------------------------------*/ 6982 6983 int float128_le_quiet(float128 a, float128 b, float_status *status) 6984 { 6985 flag aSign, bSign; 6986 6987 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6988 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6989 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6990 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6991 ) { 6992 if ( float128_is_signaling_nan( a ) 6993 || float128_is_signaling_nan( b ) ) { 6994 float_raise(float_flag_invalid, status); 6995 } 6996 return 0; 6997 } 6998 aSign = extractFloat128Sign( a ); 6999 bSign = extractFloat128Sign( b ); 7000 if ( aSign != bSign ) { 7001 return 7002 aSign 7003 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7004 == 0 ); 7005 } 7006 return 7007 aSign ? 
le128( b.high, b.low, a.high, a.low ) 7008 : le128( a.high, a.low, b.high, b.low ); 7009 7010 } 7011 7012 /*---------------------------------------------------------------------------- 7013 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7014 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7015 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7016 | Standard for Binary Floating-Point Arithmetic. 7017 *----------------------------------------------------------------------------*/ 7018 7019 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7020 { 7021 flag aSign, bSign; 7022 7023 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7024 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7025 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7026 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7027 ) { 7028 if ( float128_is_signaling_nan( a ) 7029 || float128_is_signaling_nan( b ) ) { 7030 float_raise(float_flag_invalid, status); 7031 } 7032 return 0; 7033 } 7034 aSign = extractFloat128Sign( a ); 7035 bSign = extractFloat128Sign( b ); 7036 if ( aSign != bSign ) { 7037 return 7038 aSign 7039 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7040 != 0 ); 7041 } 7042 return 7043 aSign ? lt128( b.high, b.low, a.high, a.low ) 7044 : lt128( a.high, a.low, b.high, b.low ); 7045 7046 } 7047 7048 /*---------------------------------------------------------------------------- 7049 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7050 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7051 | comparison is performed according to the IEC/IEEE Standard for Binary 7052 | Floating-Point Arithmetic. 
7053 *----------------------------------------------------------------------------*/ 7054 7055 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7056 { 7057 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7058 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7059 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7060 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7061 ) { 7062 if ( float128_is_signaling_nan( a ) 7063 || float128_is_signaling_nan( b ) ) { 7064 float_raise(float_flag_invalid, status); 7065 } 7066 return 1; 7067 } 7068 return 0; 7069 } 7070 7071 /* misc functions */ 7072 float32 uint32_to_float32(uint32_t a, float_status *status) 7073 { 7074 return int64_to_float32(a, status); 7075 } 7076 7077 float64 uint32_to_float64(uint32_t a, float_status *status) 7078 { 7079 return int64_to_float64(a, status); 7080 } 7081 7082 uint32_t float32_to_uint32(float32 a, float_status *status) 7083 { 7084 int64_t v; 7085 uint32_t res; 7086 int old_exc_flags = get_float_exception_flags(status); 7087 7088 v = float32_to_int64(a, status); 7089 if (v < 0) { 7090 res = 0; 7091 } else if (v > 0xffffffff) { 7092 res = 0xffffffff; 7093 } else { 7094 return v; 7095 } 7096 set_float_exception_flags(old_exc_flags, status); 7097 float_raise(float_flag_invalid, status); 7098 return res; 7099 } 7100 7101 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7102 { 7103 int64_t v; 7104 uint32_t res; 7105 int old_exc_flags = get_float_exception_flags(status); 7106 7107 v = float32_to_int64_round_to_zero(a, status); 7108 if (v < 0) { 7109 res = 0; 7110 } else if (v > 0xffffffff) { 7111 res = 0xffffffff; 7112 } else { 7113 return v; 7114 } 7115 set_float_exception_flags(old_exc_flags, status); 7116 float_raise(float_flag_invalid, status); 7117 return res; 7118 } 7119 7120 int_fast16_t float32_to_int16(float32 a, float_status *status) 7121 { 7122 int32_t v; 7123 int_fast16_t res; 7124 int old_exc_flags = 
get_float_exception_flags(status); 7125 7126 v = float32_to_int32(a, status); 7127 if (v < -0x8000) { 7128 res = -0x8000; 7129 } else if (v > 0x7fff) { 7130 res = 0x7fff; 7131 } else { 7132 return v; 7133 } 7134 7135 set_float_exception_flags(old_exc_flags, status); 7136 float_raise(float_flag_invalid, status); 7137 return res; 7138 } 7139 7140 uint_fast16_t float32_to_uint16(float32 a, float_status *status) 7141 { 7142 int32_t v; 7143 uint_fast16_t res; 7144 int old_exc_flags = get_float_exception_flags(status); 7145 7146 v = float32_to_int32(a, status); 7147 if (v < 0) { 7148 res = 0; 7149 } else if (v > 0xffff) { 7150 res = 0xffff; 7151 } else { 7152 return v; 7153 } 7154 7155 set_float_exception_flags(old_exc_flags, status); 7156 float_raise(float_flag_invalid, status); 7157 return res; 7158 } 7159 7160 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7161 { 7162 int64_t v; 7163 uint_fast16_t res; 7164 int old_exc_flags = get_float_exception_flags(status); 7165 7166 v = float32_to_int64_round_to_zero(a, status); 7167 if (v < 0) { 7168 res = 0; 7169 } else if (v > 0xffff) { 7170 res = 0xffff; 7171 } else { 7172 return v; 7173 } 7174 set_float_exception_flags(old_exc_flags, status); 7175 float_raise(float_flag_invalid, status); 7176 return res; 7177 } 7178 7179 uint32_t float64_to_uint32(float64 a, float_status *status) 7180 { 7181 uint64_t v; 7182 uint32_t res; 7183 int old_exc_flags = get_float_exception_flags(status); 7184 7185 v = float64_to_uint64(a, status); 7186 if (v > 0xffffffff) { 7187 res = 0xffffffff; 7188 } else { 7189 return v; 7190 } 7191 set_float_exception_flags(old_exc_flags, status); 7192 float_raise(float_flag_invalid, status); 7193 return res; 7194 } 7195 7196 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7197 { 7198 uint64_t v; 7199 uint32_t res; 7200 int old_exc_flags = get_float_exception_flags(status); 7201 7202 v = float64_to_uint64_round_to_zero(a, status); 7203 if (v > 
0xffffffff) { 7204 res = 0xffffffff; 7205 } else { 7206 return v; 7207 } 7208 set_float_exception_flags(old_exc_flags, status); 7209 float_raise(float_flag_invalid, status); 7210 return res; 7211 } 7212 7213 int_fast16_t float64_to_int16(float64 a, float_status *status) 7214 { 7215 int64_t v; 7216 int_fast16_t res; 7217 int old_exc_flags = get_float_exception_flags(status); 7218 7219 v = float64_to_int32(a, status); 7220 if (v < -0x8000) { 7221 res = -0x8000; 7222 } else if (v > 0x7fff) { 7223 res = 0x7fff; 7224 } else { 7225 return v; 7226 } 7227 7228 set_float_exception_flags(old_exc_flags, status); 7229 float_raise(float_flag_invalid, status); 7230 return res; 7231 } 7232 7233 uint_fast16_t float64_to_uint16(float64 a, float_status *status) 7234 { 7235 int64_t v; 7236 uint_fast16_t res; 7237 int old_exc_flags = get_float_exception_flags(status); 7238 7239 v = float64_to_int32(a, status); 7240 if (v < 0) { 7241 res = 0; 7242 } else if (v > 0xffff) { 7243 res = 0xffff; 7244 } else { 7245 return v; 7246 } 7247 7248 set_float_exception_flags(old_exc_flags, status); 7249 float_raise(float_flag_invalid, status); 7250 return res; 7251 } 7252 7253 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7254 { 7255 int64_t v; 7256 uint_fast16_t res; 7257 int old_exc_flags = get_float_exception_flags(status); 7258 7259 v = float64_to_int64_round_to_zero(a, status); 7260 if (v < 0) { 7261 res = 0; 7262 } else if (v > 0xffff) { 7263 res = 0xffff; 7264 } else { 7265 return v; 7266 } 7267 set_float_exception_flags(old_exc_flags, status); 7268 float_raise(float_flag_invalid, status); 7269 return res; 7270 } 7271 7272 /*---------------------------------------------------------------------------- 7273 | Returns the result of converting the double-precision floating-point value 7274 | `a' to the 64-bit unsigned integer format. 
The conversion is 7275 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7276 | Arithmetic---which means in particular that the conversion is rounded 7277 | according to the current rounding mode. If `a' is a NaN, the largest 7278 | positive integer is returned. If the conversion overflows, the 7279 | largest unsigned integer is returned. If 'a' is negative, the value is 7280 | rounded and zero is returned; negative values that do not round to zero 7281 | will raise the inexact exception. 7282 *----------------------------------------------------------------------------*/ 7283 7284 uint64_t float64_to_uint64(float64 a, float_status *status) 7285 { 7286 flag aSign; 7287 int_fast16_t aExp, shiftCount; 7288 uint64_t aSig, aSigExtra; 7289 a = float64_squash_input_denormal(a, status); 7290 7291 aSig = extractFloat64Frac(a); 7292 aExp = extractFloat64Exp(a); 7293 aSign = extractFloat64Sign(a); 7294 if (aSign && (aExp > 1022)) { 7295 float_raise(float_flag_invalid, status); 7296 if (float64_is_any_nan(a)) { 7297 return LIT64(0xFFFFFFFFFFFFFFFF); 7298 } else { 7299 return 0; 7300 } 7301 } 7302 if (aExp) { 7303 aSig |= LIT64(0x0010000000000000); 7304 } 7305 shiftCount = 0x433 - aExp; 7306 if (shiftCount <= 0) { 7307 if (0x43E < aExp) { 7308 float_raise(float_flag_invalid, status); 7309 return LIT64(0xFFFFFFFFFFFFFFFF); 7310 } 7311 aSigExtra = 0; 7312 aSig <<= -shiftCount; 7313 } else { 7314 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7315 } 7316 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7317 } 7318 7319 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7320 { 7321 signed char current_rounding_mode = status->float_rounding_mode; 7322 set_float_rounding_mode(float_round_to_zero, status); 7323 int64_t v = float64_to_uint64(a, status); 7324 set_float_rounding_mode(current_rounding_mode, status); 7325 return v; 7326 } 7327 7328 #define COMPARE(s, nan_exp) \ 7329 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7330 int is_quiet, float_status *status) \ 7331 { \ 7332 flag aSign, bSign; \ 7333 uint ## s ## _t av, bv; \ 7334 a = float ## s ## _squash_input_denormal(a, status); \ 7335 b = float ## s ## _squash_input_denormal(b, status); \ 7336 \ 7337 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7338 extractFloat ## s ## Frac( a ) ) || \ 7339 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7340 extractFloat ## s ## Frac( b ) )) { \ 7341 if (!is_quiet || \ 7342 float ## s ## _is_signaling_nan( a ) || \ 7343 float ## s ## _is_signaling_nan( b ) ) { \ 7344 float_raise(float_flag_invalid, status); \ 7345 } \ 7346 return float_relation_unordered; \ 7347 } \ 7348 aSign = extractFloat ## s ## Sign( a ); \ 7349 bSign = extractFloat ## s ## Sign( b ); \ 7350 av = float ## s ## _val(a); \ 7351 bv = float ## s ## _val(b); \ 7352 if ( aSign != bSign ) { \ 7353 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7354 /* zero case */ \ 7355 return float_relation_equal; \ 7356 } else { \ 7357 return 1 - (2 * aSign); \ 7358 } \ 7359 } else { \ 7360 if (av == bv) { \ 7361 return float_relation_equal; \ 7362 } else { \ 7363 return 1 - 2 * (aSign ^ ( av < bv )); \ 7364 } \ 7365 } \ 7366 } \ 7367 \ 7368 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7369 { \ 7370 return float ## s ## _compare_internal(a, b, 0, status); \ 7371 } \ 7372 \ 7373 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7374 float_status *status) \ 7375 { \ 7376 return float ## s ## _compare_internal(a, b, 1, status); \ 7377 } 7378 7379 COMPARE(32, 0xff) 7380 COMPARE(64, 0x7ff) 7381 7382 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7383 int is_quiet, float_status *status) 7384 { 7385 flag aSign, bSign; 7386 7387 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7388 ( extractFloatx80Frac( a )<<1 ) ) || 7389 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7390 ( extractFloatx80Frac( b )<<1 ) )) { 7391 if 
(!is_quiet || 7392 floatx80_is_signaling_nan( a ) || 7393 floatx80_is_signaling_nan( b ) ) { 7394 float_raise(float_flag_invalid, status); 7395 } 7396 return float_relation_unordered; 7397 } 7398 aSign = extractFloatx80Sign( a ); 7399 bSign = extractFloatx80Sign( b ); 7400 if ( aSign != bSign ) { 7401 7402 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7403 ( ( a.low | b.low ) == 0 ) ) { 7404 /* zero case */ 7405 return float_relation_equal; 7406 } else { 7407 return 1 - (2 * aSign); 7408 } 7409 } else { 7410 if (a.low == b.low && a.high == b.high) { 7411 return float_relation_equal; 7412 } else { 7413 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7414 } 7415 } 7416 } 7417 7418 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7419 { 7420 return floatx80_compare_internal(a, b, 0, status); 7421 } 7422 7423 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7424 { 7425 return floatx80_compare_internal(a, b, 1, status); 7426 } 7427 7428 static inline int float128_compare_internal(float128 a, float128 b, 7429 int is_quiet, float_status *status) 7430 { 7431 flag aSign, bSign; 7432 7433 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7434 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7435 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7436 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7437 if (!is_quiet || 7438 float128_is_signaling_nan( a ) || 7439 float128_is_signaling_nan( b ) ) { 7440 float_raise(float_flag_invalid, status); 7441 } 7442 return float_relation_unordered; 7443 } 7444 aSign = extractFloat128Sign( a ); 7445 bSign = extractFloat128Sign( b ); 7446 if ( aSign != bSign ) { 7447 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7448 /* zero case */ 7449 return float_relation_equal; 7450 } else { 7451 return 1 - (2 * aSign); 7452 } 7453 } else { 7454 if (a.low == b.low && a.high == b.high) { 7455 return float_relation_equal; 7456 } else { 7457 
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7458 } 7459 } 7460 } 7461 7462 int float128_compare(float128 a, float128 b, float_status *status) 7463 { 7464 return float128_compare_internal(a, b, 0, status); 7465 } 7466 7467 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7468 { 7469 return float128_compare_internal(a, b, 1, status); 7470 } 7471 7472 /* min() and max() functions. These can't be implemented as 7473 * 'compare and pick one input' because that would mishandle 7474 * NaNs and +0 vs -0. 7475 * 7476 * minnum() and maxnum() functions. These are similar to the min() 7477 * and max() functions but if one of the arguments is a QNaN and 7478 * the other is numerical then the numerical argument is returned. 7479 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7480 * and maxNum() operations. min() and max() are the typical min/max 7481 * semantics provided by many CPUs which predate that specification. 7482 * 7483 * minnummag() and maxnummag() functions correspond to minNumMag() 7484 * and minNumMag() from the IEEE-754 2008. 
7485 */ 7486 #define MINMAX(s) \ 7487 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7488 int ismin, int isieee, \ 7489 int ismag, \ 7490 float_status *status) \ 7491 { \ 7492 flag aSign, bSign; \ 7493 uint ## s ## _t av, bv, aav, abv; \ 7494 a = float ## s ## _squash_input_denormal(a, status); \ 7495 b = float ## s ## _squash_input_denormal(b, status); \ 7496 if (float ## s ## _is_any_nan(a) || \ 7497 float ## s ## _is_any_nan(b)) { \ 7498 if (isieee) { \ 7499 if (float ## s ## _is_quiet_nan(a) && \ 7500 !float ## s ##_is_any_nan(b)) { \ 7501 return b; \ 7502 } else if (float ## s ## _is_quiet_nan(b) && \ 7503 !float ## s ## _is_any_nan(a)) { \ 7504 return a; \ 7505 } \ 7506 } \ 7507 return propagateFloat ## s ## NaN(a, b, status); \ 7508 } \ 7509 aSign = extractFloat ## s ## Sign(a); \ 7510 bSign = extractFloat ## s ## Sign(b); \ 7511 av = float ## s ## _val(a); \ 7512 bv = float ## s ## _val(b); \ 7513 if (ismag) { \ 7514 aav = float ## s ## _abs(av); \ 7515 abv = float ## s ## _abs(bv); \ 7516 if (aav != abv) { \ 7517 if (ismin) { \ 7518 return (aav < abv) ? a : b; \ 7519 } else { \ 7520 return (aav < abv) ? b : a; \ 7521 } \ 7522 } \ 7523 } \ 7524 if (aSign != bSign) { \ 7525 if (ismin) { \ 7526 return aSign ? a : b; \ 7527 } else { \ 7528 return aSign ? b : a; \ 7529 } \ 7530 } else { \ 7531 if (ismin) { \ 7532 return (aSign ^ (av < bv)) ? a : b; \ 7533 } else { \ 7534 return (aSign ^ (av < bv)) ? 
b : a; \ 7535 } \ 7536 } \ 7537 } \ 7538 \ 7539 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7540 float_status *status) \ 7541 { \ 7542 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7543 } \ 7544 \ 7545 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7546 float_status *status) \ 7547 { \ 7548 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7549 } \ 7550 \ 7551 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7552 float_status *status) \ 7553 { \ 7554 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7555 } \ 7556 \ 7557 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7558 float_status *status) \ 7559 { \ 7560 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7561 } \ 7562 \ 7563 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7564 float_status *status) \ 7565 { \ 7566 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7567 } \ 7568 \ 7569 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7570 float_status *status) \ 7571 { \ 7572 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7573 } 7574 7575 MINMAX(32) 7576 MINMAX(64) 7577 7578 7579 /* Multiply A by 2 raised to the power N. 
*/ 7580 float32 float32_scalbn(float32 a, int n, float_status *status) 7581 { 7582 flag aSign; 7583 int16_t aExp; 7584 uint32_t aSig; 7585 7586 a = float32_squash_input_denormal(a, status); 7587 aSig = extractFloat32Frac( a ); 7588 aExp = extractFloat32Exp( a ); 7589 aSign = extractFloat32Sign( a ); 7590 7591 if ( aExp == 0xFF ) { 7592 if ( aSig ) { 7593 return propagateFloat32NaN(a, a, status); 7594 } 7595 return a; 7596 } 7597 if (aExp != 0) { 7598 aSig |= 0x00800000; 7599 } else if (aSig == 0) { 7600 return a; 7601 } else { 7602 aExp++; 7603 } 7604 7605 if (n > 0x200) { 7606 n = 0x200; 7607 } else if (n < -0x200) { 7608 n = -0x200; 7609 } 7610 7611 aExp += n - 1; 7612 aSig <<= 7; 7613 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7614 } 7615 7616 float64 float64_scalbn(float64 a, int n, float_status *status) 7617 { 7618 flag aSign; 7619 int16_t aExp; 7620 uint64_t aSig; 7621 7622 a = float64_squash_input_denormal(a, status); 7623 aSig = extractFloat64Frac( a ); 7624 aExp = extractFloat64Exp( a ); 7625 aSign = extractFloat64Sign( a ); 7626 7627 if ( aExp == 0x7FF ) { 7628 if ( aSig ) { 7629 return propagateFloat64NaN(a, a, status); 7630 } 7631 return a; 7632 } 7633 if (aExp != 0) { 7634 aSig |= LIT64( 0x0010000000000000 ); 7635 } else if (aSig == 0) { 7636 return a; 7637 } else { 7638 aExp++; 7639 } 7640 7641 if (n > 0x1000) { 7642 n = 0x1000; 7643 } else if (n < -0x1000) { 7644 n = -0x1000; 7645 } 7646 7647 aExp += n - 1; 7648 aSig <<= 10; 7649 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7650 } 7651 7652 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7653 { 7654 flag aSign; 7655 int32_t aExp; 7656 uint64_t aSig; 7657 7658 aSig = extractFloatx80Frac( a ); 7659 aExp = extractFloatx80Exp( a ); 7660 aSign = extractFloatx80Sign( a ); 7661 7662 if ( aExp == 0x7FFF ) { 7663 if ( aSig<<1 ) { 7664 return propagateFloatx80NaN(a, a, status); 7665 } 7666 return a; 7667 } 7668 7669 if (aExp == 0) { 7670 if (aSig == 0) 
{ 7671 return a; 7672 } 7673 aExp++; 7674 } 7675 7676 if (n > 0x10000) { 7677 n = 0x10000; 7678 } else if (n < -0x10000) { 7679 n = -0x10000; 7680 } 7681 7682 aExp += n; 7683 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7684 aSign, aExp, aSig, 0, status); 7685 } 7686 7687 float128 float128_scalbn(float128 a, int n, float_status *status) 7688 { 7689 flag aSign; 7690 int32_t aExp; 7691 uint64_t aSig0, aSig1; 7692 7693 aSig1 = extractFloat128Frac1( a ); 7694 aSig0 = extractFloat128Frac0( a ); 7695 aExp = extractFloat128Exp( a ); 7696 aSign = extractFloat128Sign( a ); 7697 if ( aExp == 0x7FFF ) { 7698 if ( aSig0 | aSig1 ) { 7699 return propagateFloat128NaN(a, a, status); 7700 } 7701 return a; 7702 } 7703 if (aExp != 0) { 7704 aSig0 |= LIT64( 0x0001000000000000 ); 7705 } else if (aSig0 == 0 && aSig1 == 0) { 7706 return a; 7707 } else { 7708 aExp++; 7709 } 7710 7711 if (n > 0x10000) { 7712 n = 0x10000; 7713 } else if (n < -0x10000) { 7714 n = -0x10000; 7715 } 7716 7717 aExp += n - 1; 7718 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7719 , status); 7720 7721 } 7722