1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "config.h" 86 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 #include <stdlib.h> 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "softfloat-macros.h" 98 99 /*---------------------------------------------------------------------------- 100 | Functions and definitions to determine: (1) whether tininess for underflow 101 | is detected before or after rounding by default, (2) what (if anything) 102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 104 | are propagated from function inputs to output. These details are target- 105 | specific. 
106 *----------------------------------------------------------------------------*/ 107 #include "softfloat-specialize.h" 108 109 /*---------------------------------------------------------------------------- 110 | Returns the fraction bits of the half-precision floating-point value `a'. 111 *----------------------------------------------------------------------------*/ 112 113 static inline uint32_t extractFloat16Frac(float16 a) 114 { 115 return float16_val(a) & 0x3ff; 116 } 117 118 /*---------------------------------------------------------------------------- 119 | Returns the exponent bits of the half-precision floating-point value `a'. 120 *----------------------------------------------------------------------------*/ 121 122 static inline int_fast16_t extractFloat16Exp(float16 a) 123 { 124 return (float16_val(a) >> 10) & 0x1f; 125 } 126 127 /*---------------------------------------------------------------------------- 128 | Returns the sign bit of the single-precision floating-point value `a'. 129 *----------------------------------------------------------------------------*/ 130 131 static inline flag extractFloat16Sign(float16 a) 132 { 133 return float16_val(a)>>15; 134 } 135 136 /*---------------------------------------------------------------------------- 137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 138 | and 7, and returns the properly rounded 32-bit integer corresponding to the 139 | input. If `zSign' is 1, the input is negated before being converted to an 140 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 141 | is simply rounded to an integer, with the inexact exception raised if the 142 | input cannot be represented exactly as an integer. However, if the fixed- 143 | point input is too large, the invalid exception is raised and the largest 144 | positive or negative integer is returned. 
145 *----------------------------------------------------------------------------*/ 146 147 static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 148 { 149 int8 roundingMode; 150 flag roundNearestEven; 151 int8 roundIncrement, roundBits; 152 int32_t z; 153 154 roundingMode = status->float_rounding_mode; 155 roundNearestEven = ( roundingMode == float_round_nearest_even ); 156 switch (roundingMode) { 157 case float_round_nearest_even: 158 case float_round_ties_away: 159 roundIncrement = 0x40; 160 break; 161 case float_round_to_zero: 162 roundIncrement = 0; 163 break; 164 case float_round_up: 165 roundIncrement = zSign ? 0 : 0x7f; 166 break; 167 case float_round_down: 168 roundIncrement = zSign ? 0x7f : 0; 169 break; 170 default: 171 abort(); 172 } 173 roundBits = absZ & 0x7F; 174 absZ = ( absZ + roundIncrement )>>7; 175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 176 z = absZ; 177 if ( zSign ) z = - z; 178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 179 float_raise(float_flag_invalid, status); 180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 181 } 182 if (roundBits) { 183 status->float_exception_flags |= float_flag_inexact; 184 } 185 return z; 186 187 } 188 189 /*---------------------------------------------------------------------------- 190 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 191 | `absZ1', with binary point between bits 63 and 64 (between the input words), 192 | and returns the properly rounded 64-bit integer corresponding to the input. 193 | If `zSign' is 1, the input is negated before being converted to an integer. 194 | Ordinarily, the fixed-point input is simply rounded to an integer, with 195 | the inexact exception raised if the input cannot be represented exactly as 196 | an integer. However, if the fixed-point input is too large, the invalid 197 | exception is raised and the largest positive or negative integer is 198 | returned. 
199 *----------------------------------------------------------------------------*/ 200 201 static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 202 float_status *status) 203 { 204 int8 roundingMode; 205 flag roundNearestEven, increment; 206 int64_t z; 207 208 roundingMode = status->float_rounding_mode; 209 roundNearestEven = ( roundingMode == float_round_nearest_even ); 210 switch (roundingMode) { 211 case float_round_nearest_even: 212 case float_round_ties_away: 213 increment = ((int64_t) absZ1 < 0); 214 break; 215 case float_round_to_zero: 216 increment = 0; 217 break; 218 case float_round_up: 219 increment = !zSign && absZ1; 220 break; 221 case float_round_down: 222 increment = zSign && absZ1; 223 break; 224 default: 225 abort(); 226 } 227 if ( increment ) { 228 ++absZ0; 229 if ( absZ0 == 0 ) goto overflow; 230 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 231 } 232 z = absZ0; 233 if ( zSign ) z = - z; 234 if ( z && ( ( z < 0 ) ^ zSign ) ) { 235 overflow: 236 float_raise(float_flag_invalid, status); 237 return 238 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 239 : LIT64( 0x7FFFFFFFFFFFFFFF ); 240 } 241 if (absZ1) { 242 status->float_exception_flags |= float_flag_inexact; 243 } 244 return z; 245 246 } 247 248 /*---------------------------------------------------------------------------- 249 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 250 | `absZ1', with binary point between bits 63 and 64 (between the input words), 251 | and returns the properly rounded 64-bit unsigned integer corresponding to the 252 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 253 | with the inexact exception raised if the input cannot be represented exactly 254 | as an integer. However, if the fixed-point input is too large, the invalid 255 | exception is raised and the largest unsigned integer is returned. 
256 *----------------------------------------------------------------------------*/ 257 258 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0, 259 uint64_t absZ1, float_status *status) 260 { 261 int8 roundingMode; 262 flag roundNearestEven, increment; 263 264 roundingMode = status->float_rounding_mode; 265 roundNearestEven = (roundingMode == float_round_nearest_even); 266 switch (roundingMode) { 267 case float_round_nearest_even: 268 case float_round_ties_away: 269 increment = ((int64_t)absZ1 < 0); 270 break; 271 case float_round_to_zero: 272 increment = 0; 273 break; 274 case float_round_up: 275 increment = !zSign && absZ1; 276 break; 277 case float_round_down: 278 increment = zSign && absZ1; 279 break; 280 default: 281 abort(); 282 } 283 if (increment) { 284 ++absZ0; 285 if (absZ0 == 0) { 286 float_raise(float_flag_invalid, status); 287 return LIT64(0xFFFFFFFFFFFFFFFF); 288 } 289 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 290 } 291 292 if (zSign && absZ0) { 293 float_raise(float_flag_invalid, status); 294 return 0; 295 } 296 297 if (absZ1) { 298 status->float_exception_flags |= float_flag_inexact; 299 } 300 return absZ0; 301 } 302 303 /*---------------------------------------------------------------------------- 304 | Returns the fraction bits of the single-precision floating-point value `a'. 305 *----------------------------------------------------------------------------*/ 306 307 static inline uint32_t extractFloat32Frac( float32 a ) 308 { 309 310 return float32_val(a) & 0x007FFFFF; 311 312 } 313 314 /*---------------------------------------------------------------------------- 315 | Returns the exponent bits of the single-precision floating-point value `a'. 
316 *----------------------------------------------------------------------------*/ 317 318 static inline int_fast16_t extractFloat32Exp(float32 a) 319 { 320 321 return ( float32_val(a)>>23 ) & 0xFF; 322 323 } 324 325 /*---------------------------------------------------------------------------- 326 | Returns the sign bit of the single-precision floating-point value `a'. 327 *----------------------------------------------------------------------------*/ 328 329 static inline flag extractFloat32Sign( float32 a ) 330 { 331 332 return float32_val(a)>>31; 333 334 } 335 336 /*---------------------------------------------------------------------------- 337 | If `a' is denormal and we are in flush-to-zero mode then set the 338 | input-denormal exception and return zero. Otherwise just return the value. 339 *----------------------------------------------------------------------------*/ 340 float32 float32_squash_input_denormal(float32 a, float_status *status) 341 { 342 if (status->flush_inputs_to_zero) { 343 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 344 float_raise(float_flag_input_denormal, status); 345 return make_float32(float32_val(a) & 0x80000000); 346 } 347 } 348 return a; 349 } 350 351 /*---------------------------------------------------------------------------- 352 | Normalizes the subnormal single-precision floating-point value represented 353 | by the denormalized significand `aSig'. The normalized exponent and 354 | significand are stored at the locations pointed to by `zExpPtr' and 355 | `zSigPtr', respectively. 
356 *----------------------------------------------------------------------------*/ 357 358 static void 359 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr) 360 { 361 int8 shiftCount; 362 363 shiftCount = countLeadingZeros32( aSig ) - 8; 364 *zSigPtr = aSig<<shiftCount; 365 *zExpPtr = 1 - shiftCount; 366 367 } 368 369 /*---------------------------------------------------------------------------- 370 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 371 | single-precision floating-point value, returning the result. After being 372 | shifted into the proper positions, the three fields are simply added 373 | together to form the result. This means that any integer portion of `zSig' 374 | will be added into the exponent. Since a properly normalized significand 375 | will have an integer portion equal to 1, the `zExp' input should be 1 less 376 | than the desired result exponent whenever `zSig' is a complete, normalized 377 | significand. 378 *----------------------------------------------------------------------------*/ 379 380 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig) 381 { 382 383 return make_float32( 384 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 385 386 } 387 388 /*---------------------------------------------------------------------------- 389 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 390 | and significand `zSig', and returns the proper single-precision floating- 391 | point value corresponding to the abstract input. Ordinarily, the abstract 392 | value is simply rounded and packed into the single-precision format, with 393 | the inexact exception raised if the abstract input cannot be represented 394 | exactly. However, if the abstract value is too large, the overflow and 395 | inexact exceptions are raised and an infinity or maximal finite value is 396 | returned. 
If the abstract value is too small, the input value is rounded to 397 | a subnormal number, and the underflow and inexact exceptions are raised if 398 | the abstract input cannot be represented exactly as a subnormal single- 399 | precision floating-point number. 400 | The input significand `zSig' has its binary point between bits 30 401 | and 29, which is 7 bits to the left of the usual location. This shifted 402 | significand must be normalized or smaller. If `zSig' is not normalized, 403 | `zExp' must be 0; in that case, the result returned is a subnormal number, 404 | and it must not require rounding. In the usual case that `zSig' is 405 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 406 | The handling of underflow and overflow follows the IEC/IEEE Standard for 407 | Binary Floating-Point Arithmetic. 408 *----------------------------------------------------------------------------*/ 409 410 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig, 411 float_status *status) 412 { 413 int8 roundingMode; 414 flag roundNearestEven; 415 int8 roundIncrement, roundBits; 416 flag isTiny; 417 418 roundingMode = status->float_rounding_mode; 419 roundNearestEven = ( roundingMode == float_round_nearest_even ); 420 switch (roundingMode) { 421 case float_round_nearest_even: 422 case float_round_ties_away: 423 roundIncrement = 0x40; 424 break; 425 case float_round_to_zero: 426 roundIncrement = 0; 427 break; 428 case float_round_up: 429 roundIncrement = zSign ? 0 : 0x7f; 430 break; 431 case float_round_down: 432 roundIncrement = zSign ? 
0x7f : 0; 433 break; 434 default: 435 abort(); 436 break; 437 } 438 roundBits = zSig & 0x7F; 439 if ( 0xFD <= (uint16_t) zExp ) { 440 if ( ( 0xFD < zExp ) 441 || ( ( zExp == 0xFD ) 442 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 443 ) { 444 float_raise(float_flag_overflow | float_flag_inexact, status); 445 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 446 } 447 if ( zExp < 0 ) { 448 if (status->flush_to_zero) { 449 float_raise(float_flag_output_denormal, status); 450 return packFloat32(zSign, 0, 0); 451 } 452 isTiny = 453 (status->float_detect_tininess 454 == float_tininess_before_rounding) 455 || ( zExp < -1 ) 456 || ( zSig + roundIncrement < 0x80000000 ); 457 shift32RightJamming( zSig, - zExp, &zSig ); 458 zExp = 0; 459 roundBits = zSig & 0x7F; 460 if (isTiny && roundBits) { 461 float_raise(float_flag_underflow, status); 462 } 463 } 464 } 465 if (roundBits) { 466 status->float_exception_flags |= float_flag_inexact; 467 } 468 zSig = ( zSig + roundIncrement )>>7; 469 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 470 if ( zSig == 0 ) zExp = 0; 471 return packFloat32( zSign, zExp, zSig ); 472 473 } 474 475 /*---------------------------------------------------------------------------- 476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 477 | and significand `zSig', and returns the proper single-precision floating- 478 | point value corresponding to the abstract input. This routine is just like 479 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 480 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 481 | floating-point exponent. 
482 *----------------------------------------------------------------------------*/ 483 484 static float32 485 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig, 486 float_status *status) 487 { 488 int8 shiftCount; 489 490 shiftCount = countLeadingZeros32( zSig ) - 1; 491 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 492 status); 493 494 } 495 496 /*---------------------------------------------------------------------------- 497 | Returns the fraction bits of the double-precision floating-point value `a'. 498 *----------------------------------------------------------------------------*/ 499 500 static inline uint64_t extractFloat64Frac( float64 a ) 501 { 502 503 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 504 505 } 506 507 /*---------------------------------------------------------------------------- 508 | Returns the exponent bits of the double-precision floating-point value `a'. 509 *----------------------------------------------------------------------------*/ 510 511 static inline int_fast16_t extractFloat64Exp(float64 a) 512 { 513 514 return ( float64_val(a)>>52 ) & 0x7FF; 515 516 } 517 518 /*---------------------------------------------------------------------------- 519 | Returns the sign bit of the double-precision floating-point value `a'. 520 *----------------------------------------------------------------------------*/ 521 522 static inline flag extractFloat64Sign( float64 a ) 523 { 524 525 return float64_val(a)>>63; 526 527 } 528 529 /*---------------------------------------------------------------------------- 530 | If `a' is denormal and we are in flush-to-zero mode then set the 531 | input-denormal exception and return zero. Otherwise just return the value. 
532 *----------------------------------------------------------------------------*/ 533 float64 float64_squash_input_denormal(float64 a, float_status *status) 534 { 535 if (status->flush_inputs_to_zero) { 536 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 537 float_raise(float_flag_input_denormal, status); 538 return make_float64(float64_val(a) & (1ULL << 63)); 539 } 540 } 541 return a; 542 } 543 544 /*---------------------------------------------------------------------------- 545 | Normalizes the subnormal double-precision floating-point value represented 546 | by the denormalized significand `aSig'. The normalized exponent and 547 | significand are stored at the locations pointed to by `zExpPtr' and 548 | `zSigPtr', respectively. 549 *----------------------------------------------------------------------------*/ 550 551 static void 552 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr) 553 { 554 int8 shiftCount; 555 556 shiftCount = countLeadingZeros64( aSig ) - 11; 557 *zSigPtr = aSig<<shiftCount; 558 *zExpPtr = 1 - shiftCount; 559 560 } 561 562 /*---------------------------------------------------------------------------- 563 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 564 | double-precision floating-point value, returning the result. After being 565 | shifted into the proper positions, the three fields are simply added 566 | together to form the result. This means that any integer portion of `zSig' 567 | will be added into the exponent. Since a properly normalized significand 568 | will have an integer portion equal to 1, the `zExp' input should be 1 less 569 | than the desired result exponent whenever `zSig' is a complete, normalized 570 | significand. 
571 *----------------------------------------------------------------------------*/ 572 573 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig) 574 { 575 576 return make_float64( 577 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 578 579 } 580 581 /*---------------------------------------------------------------------------- 582 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 583 | and significand `zSig', and returns the proper double-precision floating- 584 | point value corresponding to the abstract input. Ordinarily, the abstract 585 | value is simply rounded and packed into the double-precision format, with 586 | the inexact exception raised if the abstract input cannot be represented 587 | exactly. However, if the abstract value is too large, the overflow and 588 | inexact exceptions are raised and an infinity or maximal finite value is 589 | returned. If the abstract value is too small, the input value is rounded to 590 | a subnormal number, and the underflow and inexact exceptions are raised if 591 | the abstract input cannot be represented exactly as a subnormal double- 592 | precision floating-point number. 593 | The input significand `zSig' has its binary point between bits 62 594 | and 61, which is 10 bits to the left of the usual location. This shifted 595 | significand must be normalized or smaller. If `zSig' is not normalized, 596 | `zExp' must be 0; in that case, the result returned is a subnormal number, 597 | and it must not require rounding. In the usual case that `zSig' is 598 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 599 | The handling of underflow and overflow follows the IEC/IEEE Standard for 600 | Binary Floating-Point Arithmetic. 
601 *----------------------------------------------------------------------------*/ 602 603 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig, 604 float_status *status) 605 { 606 int8 roundingMode; 607 flag roundNearestEven; 608 int_fast16_t roundIncrement, roundBits; 609 flag isTiny; 610 611 roundingMode = status->float_rounding_mode; 612 roundNearestEven = ( roundingMode == float_round_nearest_even ); 613 switch (roundingMode) { 614 case float_round_nearest_even: 615 case float_round_ties_away: 616 roundIncrement = 0x200; 617 break; 618 case float_round_to_zero: 619 roundIncrement = 0; 620 break; 621 case float_round_up: 622 roundIncrement = zSign ? 0 : 0x3ff; 623 break; 624 case float_round_down: 625 roundIncrement = zSign ? 0x3ff : 0; 626 break; 627 default: 628 abort(); 629 } 630 roundBits = zSig & 0x3FF; 631 if ( 0x7FD <= (uint16_t) zExp ) { 632 if ( ( 0x7FD < zExp ) 633 || ( ( zExp == 0x7FD ) 634 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 635 ) { 636 float_raise(float_flag_overflow | float_flag_inexact, status); 637 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); 638 } 639 if ( zExp < 0 ) { 640 if (status->flush_to_zero) { 641 float_raise(float_flag_output_denormal, status); 642 return packFloat64(zSign, 0, 0); 643 } 644 isTiny = 645 (status->float_detect_tininess 646 == float_tininess_before_rounding) 647 || ( zExp < -1 ) 648 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 649 shift64RightJamming( zSig, - zExp, &zSig ); 650 zExp = 0; 651 roundBits = zSig & 0x3FF; 652 if (isTiny && roundBits) { 653 float_raise(float_flag_underflow, status); 654 } 655 } 656 } 657 if (roundBits) { 658 status->float_exception_flags |= float_flag_inexact; 659 } 660 zSig = ( zSig + roundIncrement )>>10; 661 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 662 if ( zSig == 0 ) zExp = 0; 663 return packFloat64( zSign, zExp, zSig ); 664 665 } 666 667 
/*---------------------------------------------------------------------------- 668 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 669 | and significand `zSig', and returns the proper double-precision floating- 670 | point value corresponding to the abstract input. This routine is just like 671 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 672 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 673 | floating-point exponent. 674 *----------------------------------------------------------------------------*/ 675 676 static float64 677 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig, 678 float_status *status) 679 { 680 int8 shiftCount; 681 682 shiftCount = countLeadingZeros64( zSig ) - 1; 683 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 684 status); 685 686 } 687 688 /*---------------------------------------------------------------------------- 689 | Returns the fraction bits of the extended double-precision floating-point 690 | value `a'. 691 *----------------------------------------------------------------------------*/ 692 693 static inline uint64_t extractFloatx80Frac( floatx80 a ) 694 { 695 696 return a.low; 697 698 } 699 700 /*---------------------------------------------------------------------------- 701 | Returns the exponent bits of the extended double-precision floating-point 702 | value `a'. 703 *----------------------------------------------------------------------------*/ 704 705 static inline int32 extractFloatx80Exp( floatx80 a ) 706 { 707 708 return a.high & 0x7FFF; 709 710 } 711 712 /*---------------------------------------------------------------------------- 713 | Returns the sign bit of the extended double-precision floating-point value 714 | `a'. 
715 *----------------------------------------------------------------------------*/ 716 717 static inline flag extractFloatx80Sign( floatx80 a ) 718 { 719 720 return a.high>>15; 721 722 } 723 724 /*---------------------------------------------------------------------------- 725 | Normalizes the subnormal extended double-precision floating-point value 726 | represented by the denormalized significand `aSig'. The normalized exponent 727 | and significand are stored at the locations pointed to by `zExpPtr' and 728 | `zSigPtr', respectively. 729 *----------------------------------------------------------------------------*/ 730 731 static void 732 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr ) 733 { 734 int8 shiftCount; 735 736 shiftCount = countLeadingZeros64( aSig ); 737 *zSigPtr = aSig<<shiftCount; 738 *zExpPtr = 1 - shiftCount; 739 740 } 741 742 /*---------------------------------------------------------------------------- 743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 744 | extended double-precision floating-point value, returning the result. 745 *----------------------------------------------------------------------------*/ 746 747 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig ) 748 { 749 floatx80 z; 750 751 z.low = zSig; 752 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 753 return z; 754 755 } 756 757 /*---------------------------------------------------------------------------- 758 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 759 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 760 | and returns the proper extended double-precision floating-point value 761 | corresponding to the abstract input. Ordinarily, the abstract value is 762 | rounded and packed into the extended double-precision format, with the 763 | inexact exception raised if the abstract input cannot be represented 764 | exactly. 
However, if the abstract value is too large, the overflow and 765 | inexact exceptions are raised and an infinity or maximal finite value is 766 | returned. If the abstract value is too small, the input value is rounded to 767 | a subnormal number, and the underflow and inexact exceptions are raised if 768 | the abstract input cannot be represented exactly as a subnormal extended 769 | double-precision floating-point number. 770 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 771 | number of bits as single or double precision, respectively. Otherwise, the 772 | result is rounded to the full precision of the extended double-precision 773 | format. 774 | The input significand must be normalized or smaller. If the input 775 | significand is not normalized, `zExp' must be 0; in that case, the result 776 | returned is a subnormal number, and it must not require rounding. The 777 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 778 | Floating-Point Arithmetic. 
779 *----------------------------------------------------------------------------*/ 780 781 static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign, 782 int32 zExp, uint64_t zSig0, uint64_t zSig1, 783 float_status *status) 784 { 785 int8 roundingMode; 786 flag roundNearestEven, increment, isTiny; 787 int64 roundIncrement, roundMask, roundBits; 788 789 roundingMode = status->float_rounding_mode; 790 roundNearestEven = ( roundingMode == float_round_nearest_even ); 791 if ( roundingPrecision == 80 ) goto precision80; 792 if ( roundingPrecision == 64 ) { 793 roundIncrement = LIT64( 0x0000000000000400 ); 794 roundMask = LIT64( 0x00000000000007FF ); 795 } 796 else if ( roundingPrecision == 32 ) { 797 roundIncrement = LIT64( 0x0000008000000000 ); 798 roundMask = LIT64( 0x000000FFFFFFFFFF ); 799 } 800 else { 801 goto precision80; 802 } 803 zSig0 |= ( zSig1 != 0 ); 804 switch (roundingMode) { 805 case float_round_nearest_even: 806 case float_round_ties_away: 807 break; 808 case float_round_to_zero: 809 roundIncrement = 0; 810 break; 811 case float_round_up: 812 roundIncrement = zSign ? 0 : roundMask; 813 break; 814 case float_round_down: 815 roundIncrement = zSign ? 
roundMask : 0; 816 break; 817 default: 818 abort(); 819 } 820 roundBits = zSig0 & roundMask; 821 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 822 if ( ( 0x7FFE < zExp ) 823 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 824 ) { 825 goto overflow; 826 } 827 if ( zExp <= 0 ) { 828 if (status->flush_to_zero) { 829 float_raise(float_flag_output_denormal, status); 830 return packFloatx80(zSign, 0, 0); 831 } 832 isTiny = 833 (status->float_detect_tininess 834 == float_tininess_before_rounding) 835 || ( zExp < 0 ) 836 || ( zSig0 <= zSig0 + roundIncrement ); 837 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 838 zExp = 0; 839 roundBits = zSig0 & roundMask; 840 if (isTiny && roundBits) { 841 float_raise(float_flag_underflow, status); 842 } 843 if (roundBits) { 844 status->float_exception_flags |= float_flag_inexact; 845 } 846 zSig0 += roundIncrement; 847 if ( (int64_t) zSig0 < 0 ) zExp = 1; 848 roundIncrement = roundMask + 1; 849 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 850 roundMask |= roundIncrement; 851 } 852 zSig0 &= ~ roundMask; 853 return packFloatx80( zSign, zExp, zSig0 ); 854 } 855 } 856 if (roundBits) { 857 status->float_exception_flags |= float_flag_inexact; 858 } 859 zSig0 += roundIncrement; 860 if ( zSig0 < roundIncrement ) { 861 ++zExp; 862 zSig0 = LIT64( 0x8000000000000000 ); 863 } 864 roundIncrement = roundMask + 1; 865 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 866 roundMask |= roundIncrement; 867 } 868 zSig0 &= ~ roundMask; 869 if ( zSig0 == 0 ) zExp = 0; 870 return packFloatx80( zSign, zExp, zSig0 ); 871 precision80: 872 switch (roundingMode) { 873 case float_round_nearest_even: 874 case float_round_ties_away: 875 increment = ((int64_t)zSig1 < 0); 876 break; 877 case float_round_to_zero: 878 increment = 0; 879 break; 880 case float_round_up: 881 increment = !zSign && zSig1; 882 break; 883 case float_round_down: 884 increment = zSign && zSig1; 885 break; 886 default: 887 abort(); 888 } 889 if ( 
0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 890 if ( ( 0x7FFE < zExp ) 891 || ( ( zExp == 0x7FFE ) 892 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 893 && increment 894 ) 895 ) { 896 roundMask = 0; 897 overflow: 898 float_raise(float_flag_overflow | float_flag_inexact, status); 899 if ( ( roundingMode == float_round_to_zero ) 900 || ( zSign && ( roundingMode == float_round_up ) ) 901 || ( ! zSign && ( roundingMode == float_round_down ) ) 902 ) { 903 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 904 } 905 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 906 } 907 if ( zExp <= 0 ) { 908 isTiny = 909 (status->float_detect_tininess 910 == float_tininess_before_rounding) 911 || ( zExp < 0 ) 912 || ! increment 913 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 914 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 915 zExp = 0; 916 if (isTiny && zSig1) { 917 float_raise(float_flag_underflow, status); 918 } 919 if (zSig1) { 920 status->float_exception_flags |= float_flag_inexact; 921 } 922 switch (roundingMode) { 923 case float_round_nearest_even: 924 case float_round_ties_away: 925 increment = ((int64_t)zSig1 < 0); 926 break; 927 case float_round_to_zero: 928 increment = 0; 929 break; 930 case float_round_up: 931 increment = !zSign && zSig1; 932 break; 933 case float_round_down: 934 increment = zSign && zSig1; 935 break; 936 default: 937 abort(); 938 } 939 if ( increment ) { 940 ++zSig0; 941 zSig0 &= 942 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 943 if ( (int64_t) zSig0 < 0 ) zExp = 1; 944 } 945 return packFloatx80( zSign, zExp, zSig0 ); 946 } 947 } 948 if (zSig1) { 949 status->float_exception_flags |= float_flag_inexact; 950 } 951 if ( increment ) { 952 ++zSig0; 953 if ( zSig0 == 0 ) { 954 ++zExp; 955 zSig0 = LIT64( 0x8000000000000000 ); 956 } 957 else { 958 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 959 } 960 } 961 else { 962 if ( zSig0 == 0 ) zExp = 0; 963 } 964 return packFloatx80( zSign, zExp, 
zSig0 ); 965 966 } 967 968 /*---------------------------------------------------------------------------- 969 | Takes an abstract floating-point value having sign `zSign', exponent 970 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 971 | and returns the proper extended double-precision floating-point value 972 | corresponding to the abstract input. This routine is just like 973 | `roundAndPackFloatx80' except that the input significand does not have to be 974 | normalized. 975 *----------------------------------------------------------------------------*/ 976 977 static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision, 978 flag zSign, int32 zExp, 979 uint64_t zSig0, uint64_t zSig1, 980 float_status *status) 981 { 982 int8 shiftCount; 983 984 if ( zSig0 == 0 ) { 985 zSig0 = zSig1; 986 zSig1 = 0; 987 zExp -= 64; 988 } 989 shiftCount = countLeadingZeros64( zSig0 ); 990 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 991 zExp -= shiftCount; 992 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 993 zSig0, zSig1, status); 994 995 } 996 997 /*---------------------------------------------------------------------------- 998 | Returns the least-significant 64 fraction bits of the quadruple-precision 999 | floating-point value `a'. 1000 *----------------------------------------------------------------------------*/ 1001 1002 static inline uint64_t extractFloat128Frac1( float128 a ) 1003 { 1004 1005 return a.low; 1006 1007 } 1008 1009 /*---------------------------------------------------------------------------- 1010 | Returns the most-significant 48 fraction bits of the quadruple-precision 1011 | floating-point value `a'. 
1012 *----------------------------------------------------------------------------*/ 1013 1014 static inline uint64_t extractFloat128Frac0( float128 a ) 1015 { 1016 1017 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1018 1019 } 1020 1021 /*---------------------------------------------------------------------------- 1022 | Returns the exponent bits of the quadruple-precision floating-point value 1023 | `a'. 1024 *----------------------------------------------------------------------------*/ 1025 1026 static inline int32 extractFloat128Exp( float128 a ) 1027 { 1028 1029 return ( a.high>>48 ) & 0x7FFF; 1030 1031 } 1032 1033 /*---------------------------------------------------------------------------- 1034 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1035 *----------------------------------------------------------------------------*/ 1036 1037 static inline flag extractFloat128Sign( float128 a ) 1038 { 1039 1040 return a.high>>63; 1041 1042 } 1043 1044 /*---------------------------------------------------------------------------- 1045 | Normalizes the subnormal quadruple-precision floating-point value 1046 | represented by the denormalized significand formed by the concatenation of 1047 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1048 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1049 | significand are stored at the location pointed to by `zSig0Ptr', and the 1050 | least significant 64 bits of the normalized significand are stored at the 1051 | location pointed to by `zSig1Ptr'. 
1052 *----------------------------------------------------------------------------*/ 1053 1054 static void 1055 normalizeFloat128Subnormal( 1056 uint64_t aSig0, 1057 uint64_t aSig1, 1058 int32 *zExpPtr, 1059 uint64_t *zSig0Ptr, 1060 uint64_t *zSig1Ptr 1061 ) 1062 { 1063 int8 shiftCount; 1064 1065 if ( aSig0 == 0 ) { 1066 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1067 if ( shiftCount < 0 ) { 1068 *zSig0Ptr = aSig1>>( - shiftCount ); 1069 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1070 } 1071 else { 1072 *zSig0Ptr = aSig1<<shiftCount; 1073 *zSig1Ptr = 0; 1074 } 1075 *zExpPtr = - shiftCount - 63; 1076 } 1077 else { 1078 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1079 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1080 *zExpPtr = 1 - shiftCount; 1081 } 1082 1083 } 1084 1085 /*---------------------------------------------------------------------------- 1086 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1087 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1088 | floating-point value, returning the result. After being shifted into the 1089 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1090 | added together to form the most significant 32 bits of the result. This 1091 | means that any integer portion of `zSig0' will be added into the exponent. 1092 | Since a properly normalized significand will have an integer portion equal 1093 | to 1, the `zExp' input should be 1 less than the desired result exponent 1094 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1095 | significand. 
1096 *----------------------------------------------------------------------------*/ 1097 1098 static inline float128 1099 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 ) 1100 { 1101 float128 z; 1102 1103 z.low = zSig1; 1104 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1105 return z; 1106 1107 } 1108 1109 /*---------------------------------------------------------------------------- 1110 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1111 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1112 | and `zSig2', and returns the proper quadruple-precision floating-point value 1113 | corresponding to the abstract input. Ordinarily, the abstract value is 1114 | simply rounded and packed into the quadruple-precision format, with the 1115 | inexact exception raised if the abstract input cannot be represented 1116 | exactly. However, if the abstract value is too large, the overflow and 1117 | inexact exceptions are raised and an infinity or maximal finite value is 1118 | returned. If the abstract value is too small, the input value is rounded to 1119 | a subnormal number, and the underflow and inexact exceptions are raised if 1120 | the abstract input cannot be represented exactly as a subnormal quadruple- 1121 | precision floating-point number. 1122 | The input significand must be normalized or smaller. If the input 1123 | significand is not normalized, `zExp' must be 0; in that case, the result 1124 | returned is a subnormal number, and it must not require rounding. In the 1125 | usual case that the input significand is normalized, `zExp' must be 1 less 1126 | than the ``true'' floating-point exponent. The handling of underflow and 1127 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1128 *----------------------------------------------------------------------------*/ 1129 1130 static float128 roundAndPackFloat128(flag zSign, int32 zExp, 1131 uint64_t zSig0, uint64_t zSig1, 1132 uint64_t zSig2, float_status *status) 1133 { 1134 int8 roundingMode; 1135 flag roundNearestEven, increment, isTiny; 1136 1137 roundingMode = status->float_rounding_mode; 1138 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1139 switch (roundingMode) { 1140 case float_round_nearest_even: 1141 case float_round_ties_away: 1142 increment = ((int64_t)zSig2 < 0); 1143 break; 1144 case float_round_to_zero: 1145 increment = 0; 1146 break; 1147 case float_round_up: 1148 increment = !zSign && zSig2; 1149 break; 1150 case float_round_down: 1151 increment = zSign && zSig2; 1152 break; 1153 default: 1154 abort(); 1155 } 1156 if ( 0x7FFD <= (uint32_t) zExp ) { 1157 if ( ( 0x7FFD < zExp ) 1158 || ( ( zExp == 0x7FFD ) 1159 && eq128( 1160 LIT64( 0x0001FFFFFFFFFFFF ), 1161 LIT64( 0xFFFFFFFFFFFFFFFF ), 1162 zSig0, 1163 zSig1 1164 ) 1165 && increment 1166 ) 1167 ) { 1168 float_raise(float_flag_overflow | float_flag_inexact, status); 1169 if ( ( roundingMode == float_round_to_zero ) 1170 || ( zSign && ( roundingMode == float_round_up ) ) 1171 || ( ! zSign && ( roundingMode == float_round_down ) ) 1172 ) { 1173 return 1174 packFloat128( 1175 zSign, 1176 0x7FFE, 1177 LIT64( 0x0000FFFFFFFFFFFF ), 1178 LIT64( 0xFFFFFFFFFFFFFFFF ) 1179 ); 1180 } 1181 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1182 } 1183 if ( zExp < 0 ) { 1184 if (status->flush_to_zero) { 1185 float_raise(float_flag_output_denormal, status); 1186 return packFloat128(zSign, 0, 0, 0); 1187 } 1188 isTiny = 1189 (status->float_detect_tininess 1190 == float_tininess_before_rounding) 1191 || ( zExp < -1 ) 1192 || ! 
increment 1193 || lt128( 1194 zSig0, 1195 zSig1, 1196 LIT64( 0x0001FFFFFFFFFFFF ), 1197 LIT64( 0xFFFFFFFFFFFFFFFF ) 1198 ); 1199 shift128ExtraRightJamming( 1200 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1201 zExp = 0; 1202 if (isTiny && zSig2) { 1203 float_raise(float_flag_underflow, status); 1204 } 1205 switch (roundingMode) { 1206 case float_round_nearest_even: 1207 case float_round_ties_away: 1208 increment = ((int64_t)zSig2 < 0); 1209 break; 1210 case float_round_to_zero: 1211 increment = 0; 1212 break; 1213 case float_round_up: 1214 increment = !zSign && zSig2; 1215 break; 1216 case float_round_down: 1217 increment = zSign && zSig2; 1218 break; 1219 default: 1220 abort(); 1221 } 1222 } 1223 } 1224 if (zSig2) { 1225 status->float_exception_flags |= float_flag_inexact; 1226 } 1227 if ( increment ) { 1228 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1229 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1230 } 1231 else { 1232 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1233 } 1234 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1235 1236 } 1237 1238 /*---------------------------------------------------------------------------- 1239 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1240 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1241 | returns the proper quadruple-precision floating-point value corresponding 1242 | to the abstract input. This routine is just like `roundAndPackFloat128' 1243 | except that the input significand has fewer bits and does not have to be 1244 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1245 | point exponent. 
1246 *----------------------------------------------------------------------------*/ 1247 1248 static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp, 1249 uint64_t zSig0, uint64_t zSig1, 1250 float_status *status) 1251 { 1252 int8 shiftCount; 1253 uint64_t zSig2; 1254 1255 if ( zSig0 == 0 ) { 1256 zSig0 = zSig1; 1257 zSig1 = 0; 1258 zExp -= 64; 1259 } 1260 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1261 if ( 0 <= shiftCount ) { 1262 zSig2 = 0; 1263 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1264 } 1265 else { 1266 shift128ExtraRightJamming( 1267 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1268 } 1269 zExp -= shiftCount; 1270 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1271 1272 } 1273 1274 /*---------------------------------------------------------------------------- 1275 | Returns the result of converting the 32-bit two's complement integer `a' 1276 | to the single-precision floating-point format. The conversion is performed 1277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1278 *----------------------------------------------------------------------------*/ 1279 1280 float32 int32_to_float32(int32_t a, float_status *status) 1281 { 1282 flag zSign; 1283 1284 if ( a == 0 ) return float32_zero; 1285 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1286 zSign = ( a < 0 ); 1287 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1288 } 1289 1290 /*---------------------------------------------------------------------------- 1291 | Returns the result of converting the 32-bit two's complement integer `a' 1292 | to the double-precision floating-point format. The conversion is performed 1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1294 *----------------------------------------------------------------------------*/ 1295 1296 float64 int32_to_float64(int32_t a, float_status *status) 1297 { 1298 flag zSign; 1299 uint32 absA; 1300 int8 shiftCount; 1301 uint64_t zSig; 1302 1303 if ( a == 0 ) return float64_zero; 1304 zSign = ( a < 0 ); 1305 absA = zSign ? - a : a; 1306 shiftCount = countLeadingZeros32( absA ) + 21; 1307 zSig = absA; 1308 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1309 1310 } 1311 1312 /*---------------------------------------------------------------------------- 1313 | Returns the result of converting the 32-bit two's complement integer `a' 1314 | to the extended double-precision floating-point format. The conversion 1315 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1316 | Arithmetic. 1317 *----------------------------------------------------------------------------*/ 1318 1319 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1320 { 1321 flag zSign; 1322 uint32 absA; 1323 int8 shiftCount; 1324 uint64_t zSig; 1325 1326 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1327 zSign = ( a < 0 ); 1328 absA = zSign ? - a : a; 1329 shiftCount = countLeadingZeros32( absA ) + 32; 1330 zSig = absA; 1331 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1332 1333 } 1334 1335 /*---------------------------------------------------------------------------- 1336 | Returns the result of converting the 32-bit two's complement integer `a' to 1337 | the quadruple-precision floating-point format. The conversion is performed 1338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1339 *----------------------------------------------------------------------------*/ 1340 1341 float128 int32_to_float128(int32_t a, float_status *status) 1342 { 1343 flag zSign; 1344 uint32 absA; 1345 int8 shiftCount; 1346 uint64_t zSig0; 1347 1348 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1349 zSign = ( a < 0 ); 1350 absA = zSign ? - a : a; 1351 shiftCount = countLeadingZeros32( absA ) + 17; 1352 zSig0 = absA; 1353 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1354 1355 } 1356 1357 /*---------------------------------------------------------------------------- 1358 | Returns the result of converting the 64-bit two's complement integer `a' 1359 | to the single-precision floating-point format. The conversion is performed 1360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1361 *----------------------------------------------------------------------------*/ 1362 1363 float32 int64_to_float32(int64_t a, float_status *status) 1364 { 1365 flag zSign; 1366 uint64 absA; 1367 int8 shiftCount; 1368 1369 if ( a == 0 ) return float32_zero; 1370 zSign = ( a < 0 ); 1371 absA = zSign ? - a : a; 1372 shiftCount = countLeadingZeros64( absA ) - 40; 1373 if ( 0 <= shiftCount ) { 1374 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1375 } 1376 else { 1377 shiftCount += 7; 1378 if ( shiftCount < 0 ) { 1379 shift64RightJamming( absA, - shiftCount, &absA ); 1380 } 1381 else { 1382 absA <<= shiftCount; 1383 } 1384 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1385 } 1386 1387 } 1388 1389 /*---------------------------------------------------------------------------- 1390 | Returns the result of converting the 64-bit two's complement integer `a' 1391 | to the double-precision floating-point format. The conversion is performed 1392 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1393 *----------------------------------------------------------------------------*/ 1394 1395 float64 int64_to_float64(int64_t a, float_status *status) 1396 { 1397 flag zSign; 1398 1399 if ( a == 0 ) return float64_zero; 1400 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1401 return packFloat64( 1, 0x43E, 0 ); 1402 } 1403 zSign = ( a < 0 ); 1404 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1405 } 1406 1407 /*---------------------------------------------------------------------------- 1408 | Returns the result of converting the 64-bit two's complement integer `a' 1409 | to the extended double-precision floating-point format. The conversion 1410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1411 | Arithmetic. 1412 *----------------------------------------------------------------------------*/ 1413 1414 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1415 { 1416 flag zSign; 1417 uint64 absA; 1418 int8 shiftCount; 1419 1420 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1421 zSign = ( a < 0 ); 1422 absA = zSign ? - a : a; 1423 shiftCount = countLeadingZeros64( absA ); 1424 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1425 1426 } 1427 1428 /*---------------------------------------------------------------------------- 1429 | Returns the result of converting the 64-bit two's complement integer `a' to 1430 | the quadruple-precision floating-point format. The conversion is performed 1431 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1432 *----------------------------------------------------------------------------*/ 1433 1434 float128 int64_to_float128(int64_t a, float_status *status) 1435 { 1436 flag zSign; 1437 uint64 absA; 1438 int8 shiftCount; 1439 int32 zExp; 1440 uint64_t zSig0, zSig1; 1441 1442 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1443 zSign = ( a < 0 ); 1444 absA = zSign ? 
- a : a; 1445 shiftCount = countLeadingZeros64( absA ) + 49; 1446 zExp = 0x406E - shiftCount; 1447 if ( 64 <= shiftCount ) { 1448 zSig1 = 0; 1449 zSig0 = absA; 1450 shiftCount -= 64; 1451 } 1452 else { 1453 zSig1 = absA; 1454 zSig0 = 0; 1455 } 1456 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1457 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1458 1459 } 1460 1461 /*---------------------------------------------------------------------------- 1462 | Returns the result of converting the 64-bit unsigned integer `a' 1463 | to the single-precision floating-point format. The conversion is performed 1464 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1465 *----------------------------------------------------------------------------*/ 1466 1467 float32 uint64_to_float32(uint64_t a, float_status *status) 1468 { 1469 int shiftcount; 1470 1471 if (a == 0) { 1472 return float32_zero; 1473 } 1474 1475 /* Determine (left) shift needed to put first set bit into bit posn 23 1476 * (since packFloat32() expects the binary point between bits 23 and 22); 1477 * this is the fast case for smallish numbers. 1478 */ 1479 shiftcount = countLeadingZeros64(a) - 40; 1480 if (shiftcount >= 0) { 1481 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1482 } 1483 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1484 * expects the binary point between bits 30 and 29, hence the + 7. 1485 */ 1486 shiftcount += 7; 1487 if (shiftcount < 0) { 1488 shift64RightJamming(a, -shiftcount, &a); 1489 } else { 1490 a <<= shiftcount; 1491 } 1492 1493 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1494 } 1495 1496 /*---------------------------------------------------------------------------- 1497 | Returns the result of converting the 64-bit unsigned integer `a' 1498 | to the double-precision floating-point format. The conversion is performed 1499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1500 *----------------------------------------------------------------------------*/ 1501 1502 float64 uint64_to_float64(uint64_t a, float_status *status) 1503 { 1504 int exp = 0x43C; 1505 int shiftcount; 1506 1507 if (a == 0) { 1508 return float64_zero; 1509 } 1510 1511 shiftcount = countLeadingZeros64(a) - 1; 1512 if (shiftcount < 0) { 1513 shift64RightJamming(a, -shiftcount, &a); 1514 } else { 1515 a <<= shiftcount; 1516 } 1517 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1518 } 1519 1520 /*---------------------------------------------------------------------------- 1521 | Returns the result of converting the 64-bit unsigned integer `a' 1522 | to the quadruple-precision floating-point format. The conversion is performed 1523 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1524 *----------------------------------------------------------------------------*/ 1525 1526 float128 uint64_to_float128(uint64_t a, float_status *status) 1527 { 1528 if (a == 0) { 1529 return float128_zero; 1530 } 1531 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1532 } 1533 1534 /*---------------------------------------------------------------------------- 1535 | Returns the result of converting the single-precision floating-point value 1536 | `a' to the 32-bit two's complement integer format. The conversion is 1537 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1538 | Arithmetic---which means in particular that the conversion is rounded 1539 | according to the current rounding mode. If `a' is a NaN, the largest 1540 | positive integer is returned. Otherwise, if the conversion overflows, the 1541 | largest integer with the same sign as `a' is returned. 
1542 *----------------------------------------------------------------------------*/ 1543 1544 int32 float32_to_int32(float32 a, float_status *status) 1545 { 1546 flag aSign; 1547 int_fast16_t aExp, shiftCount; 1548 uint32_t aSig; 1549 uint64_t aSig64; 1550 1551 a = float32_squash_input_denormal(a, status); 1552 aSig = extractFloat32Frac( a ); 1553 aExp = extractFloat32Exp( a ); 1554 aSign = extractFloat32Sign( a ); 1555 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1556 if ( aExp ) aSig |= 0x00800000; 1557 shiftCount = 0xAF - aExp; 1558 aSig64 = aSig; 1559 aSig64 <<= 32; 1560 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1561 return roundAndPackInt32(aSign, aSig64, status); 1562 1563 } 1564 1565 /*---------------------------------------------------------------------------- 1566 | Returns the result of converting the single-precision floating-point value 1567 | `a' to the 32-bit two's complement integer format. The conversion is 1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1569 | Arithmetic, except that the conversion is always rounded toward zero. 1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1571 | the conversion overflows, the largest integer with the same sign as `a' is 1572 | returned. 1573 *----------------------------------------------------------------------------*/ 1574 1575 int32 float32_to_int32_round_to_zero(float32 a, float_status *status) 1576 { 1577 flag aSign; 1578 int_fast16_t aExp, shiftCount; 1579 uint32_t aSig; 1580 int32_t z; 1581 a = float32_squash_input_denormal(a, status); 1582 1583 aSig = extractFloat32Frac( a ); 1584 aExp = extractFloat32Exp( a ); 1585 aSign = extractFloat32Sign( a ); 1586 shiftCount = aExp - 0x9E; 1587 if ( 0 <= shiftCount ) { 1588 if ( float32_val(a) != 0xCF000000 ) { 1589 float_raise(float_flag_invalid, status); 1590 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1591 } 1592 return (int32_t) 0x80000000; 1593 } 1594 else if ( aExp <= 0x7E ) { 1595 if (aExp | aSig) { 1596 status->float_exception_flags |= float_flag_inexact; 1597 } 1598 return 0; 1599 } 1600 aSig = ( aSig | 0x00800000 )<<8; 1601 z = aSig>>( - shiftCount ); 1602 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1603 status->float_exception_flags |= float_flag_inexact; 1604 } 1605 if ( aSign ) z = - z; 1606 return z; 1607 1608 } 1609 1610 /*---------------------------------------------------------------------------- 1611 | Returns the result of converting the single-precision floating-point value 1612 | `a' to the 16-bit two's complement integer format. The conversion is 1613 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1614 | Arithmetic, except that the conversion is always rounded toward zero. 1615 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1616 | the conversion overflows, the largest integer with the same sign as `a' is 1617 | returned. 1618 *----------------------------------------------------------------------------*/ 1619 1620 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1621 { 1622 flag aSign; 1623 int_fast16_t aExp, shiftCount; 1624 uint32_t aSig; 1625 int32 z; 1626 1627 aSig = extractFloat32Frac( a ); 1628 aExp = extractFloat32Exp( a ); 1629 aSign = extractFloat32Sign( a ); 1630 shiftCount = aExp - 0x8E; 1631 if ( 0 <= shiftCount ) { 1632 if ( float32_val(a) != 0xC7000000 ) { 1633 float_raise(float_flag_invalid, status); 1634 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1635 return 0x7FFF; 1636 } 1637 } 1638 return (int32_t) 0xffff8000; 1639 } 1640 else if ( aExp <= 0x7E ) { 1641 if ( aExp | aSig ) { 1642 status->float_exception_flags |= float_flag_inexact; 1643 } 1644 return 0; 1645 } 1646 shiftCount -= 0x10; 1647 aSig = ( aSig | 0x00800000 )<<8; 1648 z = aSig>>( - shiftCount ); 1649 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1650 status->float_exception_flags |= float_flag_inexact; 1651 } 1652 if ( aSign ) { 1653 z = - z; 1654 } 1655 return z; 1656 1657 } 1658 1659 /*---------------------------------------------------------------------------- 1660 | Returns the result of converting the single-precision floating-point value 1661 | `a' to the 64-bit two's complement integer format. The conversion is 1662 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1663 | Arithmetic---which means in particular that the conversion is rounded 1664 | according to the current rounding mode. If `a' is a NaN, the largest 1665 | positive integer is returned. Otherwise, if the conversion overflows, the 1666 | largest integer with the same sign as `a' is returned. 1667 *----------------------------------------------------------------------------*/ 1668 1669 int64 float32_to_int64(float32 a, float_status *status) 1670 { 1671 flag aSign; 1672 int_fast16_t aExp, shiftCount; 1673 uint32_t aSig; 1674 uint64_t aSig64, aSigExtra; 1675 a = float32_squash_input_denormal(a, status); 1676 1677 aSig = extractFloat32Frac( a ); 1678 aExp = extractFloat32Exp( a ); 1679 aSign = extractFloat32Sign( a ); 1680 shiftCount = 0xBE - aExp; 1681 if ( shiftCount < 0 ) { 1682 float_raise(float_flag_invalid, status); 1683 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1684 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1685 } 1686 return (int64_t) LIT64( 0x8000000000000000 ); 1687 } 1688 if ( aExp ) aSig |= 0x00800000; 1689 aSig64 = aSig; 1690 aSig64 <<= 40; 1691 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1692 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1693 1694 } 1695 1696 /*---------------------------------------------------------------------------- 1697 | Returns the result of converting the single-precision floating-point value 1698 | `a' to the 64-bit unsigned integer format. The conversion is 1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1700 | Arithmetic---which means in particular that the conversion is rounded 1701 | according to the current rounding mode. If `a' is a NaN, the largest 1702 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1703 | largest unsigned integer is returned. If the 'a' is negative, the result 1704 | is rounded and zero is returned; values that do not round to zero will 1705 | raise the inexact exception flag. 
1706 *----------------------------------------------------------------------------*/ 1707 1708 uint64 float32_to_uint64(float32 a, float_status *status) 1709 { 1710 flag aSign; 1711 int_fast16_t aExp, shiftCount; 1712 uint32_t aSig; 1713 uint64_t aSig64, aSigExtra; 1714 a = float32_squash_input_denormal(a, status); 1715 1716 aSig = extractFloat32Frac(a); 1717 aExp = extractFloat32Exp(a); 1718 aSign = extractFloat32Sign(a); 1719 if ((aSign) && (aExp > 126)) { 1720 float_raise(float_flag_invalid, status); 1721 if (float32_is_any_nan(a)) { 1722 return LIT64(0xFFFFFFFFFFFFFFFF); 1723 } else { 1724 return 0; 1725 } 1726 } 1727 shiftCount = 0xBE - aExp; 1728 if (aExp) { 1729 aSig |= 0x00800000; 1730 } 1731 if (shiftCount < 0) { 1732 float_raise(float_flag_invalid, status); 1733 return LIT64(0xFFFFFFFFFFFFFFFF); 1734 } 1735 1736 aSig64 = aSig; 1737 aSig64 <<= 40; 1738 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1739 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1740 } 1741 1742 /*---------------------------------------------------------------------------- 1743 | Returns the result of converting the single-precision floating-point value 1744 | `a' to the 64-bit unsigned integer format. The conversion is 1745 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1746 | Arithmetic, except that the conversion is always rounded toward zero. If 1747 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1748 | conversion overflows, the largest unsigned integer is returned. If the 1749 | 'a' is negative, the result is rounded and zero is returned; values that do 1750 | not round to zero will raise the inexact flag. 
1751 *----------------------------------------------------------------------------*/ 1752 1753 uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status) 1754 { 1755 signed char current_rounding_mode = status->float_rounding_mode; 1756 set_float_rounding_mode(float_round_to_zero, status); 1757 int64_t v = float32_to_uint64(a, status); 1758 set_float_rounding_mode(current_rounding_mode, status); 1759 return v; 1760 } 1761 1762 /*---------------------------------------------------------------------------- 1763 | Returns the result of converting the single-precision floating-point value 1764 | `a' to the 64-bit two's complement integer format. The conversion is 1765 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1766 | Arithmetic, except that the conversion is always rounded toward zero. If 1767 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1768 | conversion overflows, the largest integer with the same sign as `a' is 1769 | returned. 1770 *----------------------------------------------------------------------------*/ 1771 1772 int64 float32_to_int64_round_to_zero(float32 a, float_status *status) 1773 { 1774 flag aSign; 1775 int_fast16_t aExp, shiftCount; 1776 uint32_t aSig; 1777 uint64_t aSig64; 1778 int64 z; 1779 a = float32_squash_input_denormal(a, status); 1780 1781 aSig = extractFloat32Frac( a ); 1782 aExp = extractFloat32Exp( a ); 1783 aSign = extractFloat32Sign( a ); 1784 shiftCount = aExp - 0xBE; 1785 if ( 0 <= shiftCount ) { 1786 if ( float32_val(a) != 0xDF000000 ) { 1787 float_raise(float_flag_invalid, status); 1788 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1789 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1790 } 1791 } 1792 return (int64_t) LIT64( 0x8000000000000000 ); 1793 } 1794 else if ( aExp <= 0x7E ) { 1795 if (aExp | aSig) { 1796 status->float_exception_flags |= float_flag_inexact; 1797 } 1798 return 0; 1799 } 1800 aSig64 = aSig | 0x00800000; 1801 aSig64 <<= 40; 1802 z = aSig64>>( - shiftCount ); 1803 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1804 status->float_exception_flags |= float_flag_inexact; 1805 } 1806 if ( aSign ) z = - z; 1807 return z; 1808 1809 } 1810 1811 /*---------------------------------------------------------------------------- 1812 | Returns the result of converting the single-precision floating-point value 1813 | `a' to the double-precision floating-point format. The conversion is 1814 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1815 | Arithmetic. 1816 *----------------------------------------------------------------------------*/ 1817 1818 float64 float32_to_float64(float32 a, float_status *status) 1819 { 1820 flag aSign; 1821 int_fast16_t aExp; 1822 uint32_t aSig; 1823 a = float32_squash_input_denormal(a, status); 1824 1825 aSig = extractFloat32Frac( a ); 1826 aExp = extractFloat32Exp( a ); 1827 aSign = extractFloat32Sign( a ); 1828 if ( aExp == 0xFF ) { 1829 if (aSig) { 1830 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1831 } 1832 return packFloat64( aSign, 0x7FF, 0 ); 1833 } 1834 if ( aExp == 0 ) { 1835 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1836 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1837 --aExp; 1838 } 1839 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1840 1841 } 1842 1843 /*---------------------------------------------------------------------------- 1844 | Returns the result of converting the single-precision floating-point value 1845 | `a' to the extended double-precision floating-point format. 
The conversion 1846 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1847 | Arithmetic. 1848 *----------------------------------------------------------------------------*/ 1849 1850 floatx80 float32_to_floatx80(float32 a, float_status *status) 1851 { 1852 flag aSign; 1853 int_fast16_t aExp; 1854 uint32_t aSig; 1855 1856 a = float32_squash_input_denormal(a, status); 1857 aSig = extractFloat32Frac( a ); 1858 aExp = extractFloat32Exp( a ); 1859 aSign = extractFloat32Sign( a ); 1860 if ( aExp == 0xFF ) { 1861 if (aSig) { 1862 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1863 } 1864 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1865 } 1866 if ( aExp == 0 ) { 1867 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1868 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1869 } 1870 aSig |= 0x00800000; 1871 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1872 1873 } 1874 1875 /*---------------------------------------------------------------------------- 1876 | Returns the result of converting the single-precision floating-point value 1877 | `a' to the double-precision floating-point format. The conversion is 1878 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1879 | Arithmetic. 
1880 *----------------------------------------------------------------------------*/ 1881 1882 float128 float32_to_float128(float32 a, float_status *status) 1883 { 1884 flag aSign; 1885 int_fast16_t aExp; 1886 uint32_t aSig; 1887 1888 a = float32_squash_input_denormal(a, status); 1889 aSig = extractFloat32Frac( a ); 1890 aExp = extractFloat32Exp( a ); 1891 aSign = extractFloat32Sign( a ); 1892 if ( aExp == 0xFF ) { 1893 if (aSig) { 1894 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1895 } 1896 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1897 } 1898 if ( aExp == 0 ) { 1899 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1900 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1901 --aExp; 1902 } 1903 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1904 1905 } 1906 1907 /*---------------------------------------------------------------------------- 1908 | Rounds the single-precision floating-point value `a' to an integer, and 1909 | returns the result as a single-precision floating-point value. The 1910 | operation is performed according to the IEC/IEEE Standard for Binary 1911 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_round_to_int(float32 a, float_status *status)
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t lastBitMask, roundBitsMask;
    uint32_t z;
    a = float32_squash_input_denormal(a, status);

    aExp = extractFloat32Exp( a );
    /* 0x96 = 0x7F + 23: from this exponent up there are no fraction bits
     * below the units place, so only NaNs need any processing.
     */
    if ( 0x96 <= aExp ) {
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
            return propagateFloat32NaN(a, a, status);
        }
        return a;
    }
    /* Biased exponent <= 0x7E: the magnitude is below 1, so the result is
     * +/-0 or +/-1 depending on the rounding mode.
     */
    if ( aExp <= 0x7E ) {
        /* Shifting out the sign bit leaves zero only for +/-0, which is
         * returned unchanged and without raising inexact.
         */
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
        status->float_exception_flags |= float_flag_inexact;
        aSign = extractFloat32Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Strictly above one half rounds to 1; exactly one half
             * (zero fraction) falls through to the even result, 0.
             */
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
                return packFloat32( aSign, 0x7F, 0 );
            }
            break;
        case float_round_ties_away:
            /* One half or more rounds away from zero to 1. */
            if (aExp == 0x7E) {
                return packFloat32(aSign, 0x7F, 0);
            }
            break;
        case float_round_down:
            /* 0xBF800000 is -1.0; positive inputs give +0. */
            return make_float32(aSign ? 0xBF800000 : 0);
        case float_round_up:
            /* 0x80000000 is -0; 0x3F800000 is +1.0. */
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
        }
        /* Remaining cases (including round-to-zero): signed zero. */
        return packFloat32( aSign, 0, 0 );
    }
    /* 1 <= |a| < 2^23: round directly on the raw bit pattern.
     * lastBitMask selects the units bit, roundBitsMask the fraction bits
     * below it.
     */
    lastBitMask = 1;
    lastBitMask <<= 0x96 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = float32_val(a);
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        /* Add one half; on an exact tie the fraction bits become zero and
         * the units bit is cleared to pick the even neighbour.
         */
        z += lastBitMask>>1;
        if ((z & roundBitsMask) == 0) {
            z &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        /* Only positive values grow in magnitude when rounding up. */
        if (!extractFloat32Sign(make_float32(z))) {
            z += roundBitsMask;
        }
        break;
    case float_round_down:
        /* Only negative values grow in magnitude when rounding down. */
        if (extractFloat32Sign(make_float32(z))) {
            z += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    /* Discard the fraction bits; any change to the bit pattern means the
     * input was not already an integer.
     */
    z &= ~ roundBitsMask;
    if (z != float32_val(a)) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return make_float32(z);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the single-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
                              float_status *status)
{
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int_fast16_t expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    /* Work with the fractions shifted left by 6; the implicit leading one
     * then corresponds to 0x20000000 (0x00800000 << 6).
     */
    aSig <<= 6;
    bSig <<= 6;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0xFF ) {
            if (aSig) {
                return propagateFloat32NaN(a, b, status);
            }
            /* Infinity plus a finite value is that infinity. */
            return a;
        }
        if ( bExp == 0 ) {
            /* Exponent field 0 has no implicit one; shift one place less. */
            --expDiff;
        }
        else {
            bSig |= 0x20000000;
        }
        shift32RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0xFF ) {
            if (bSig) {
                return propagateFloat32NaN(a, b, status);
            }
            /* `b' is infinite: the result is infinity with sign zSign. */
            return packFloat32( zSign, 0xFF, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig |= 0x20000000;
        }
        shift32RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0xFF ) {
            if (aSig | bSig) {
                return propagateFloat32NaN(a, b, status);
            }
            return a;
        }
        if ( aExp == 0 ) {
            /* Both exponent fields are 0 (zero or subnormal operands). */
            if (status->flush_to_zero) {
                if (aSig | bSig) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat32(zSign, 0, 0);
            }
            /* The fractions are added directly and packed with exponent
             * field 0.
             */
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
        }
        /* 0x40000000 accounts for both implicit leading ones. */
        zSig = 0x40000000 + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    /* Unequal exponents: OR in the one implicit bit still missing, then
     * try the sum shifted left by one with a decremented exponent.  If
     * that sets the top bit, fall back to the unshifted sum.
     */
    aSig |= 0x20000000;
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (int32_t) zSig < 0 ) {
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat32(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of
| subtracting the absolute values of the single-precision floating-point
| values `a' and `b'.  If `zSign' is 1, the difference is negated before
| being returned.  `zSign' is ignored if the result is a NaN.  The
| subtraction is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
                              float_status *status)
{
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int_fast16_t expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    /* Work with the fractions shifted left by 7; the implicit leading one
     * then corresponds to 0x40000000 (0x00800000 << 7).
     */
    aSig <<= 7;
    bSig <<= 7;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0xFF ) {
        if (aSig | bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinity minus infinity is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan;
    }
    if ( aExp == 0 ) {
        /* Exponent field 0 is given exponent value 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: the difference is zero, negative only when
     * rounding down.
     */
    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloat32( zSign ^ 1, 0xFF, 0 );
    }
    if ( aExp == 0 ) {
        /* Exponent field 0 has no implicit one; shift one place less. */
        ++expDiff;
    }
    else {
        aSig |= 0x40000000;
    }
    shift32RightJamming( aSig, - expDiff, &aSig );
    bSig |= 0x40000000;
 bBigger:
    /* |b| > |a|: subtract the other way round and flip the result sign. */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinity minus a finite value is that infinity. */
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= 0x40000000;
    }
    shift32RightJamming( bSig, expDiff, &bSig );
    aSig |= 0x40000000;
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the single-precision floating-point values `a'
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_add(float32 a, float32 b, float_status *status)
{
    flag aSign, bSign;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSign = extractFloat32Sign( a );
    bSign = extractFloat32Sign( b );
    /* Equal signs add the magnitudes; opposite signs subtract them.  The
     * sign of `a' is passed as the nominal result sign in both cases.
     */
    if ( aSign == bSign ) {
        return addFloat32Sigs(a, b, aSign, status);
    }
    else {
        return subFloat32Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the single-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_sub(float32 a, float32 b, float_status *status)
{
    flag aSign, bSign;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSign = extractFloat32Sign( a );
    bSign = extractFloat32Sign( b );
    /* Equal signs subtract the magnitudes; opposite signs add them.  The
     * sign of `a' is passed as the nominal result sign in both cases.
     */
    if ( aSign == bSign ) {
        return subFloat32Sigs(a, b, aSign, status);
    }
    else {
        return addFloat32Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_mul(float32 a, float32 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig;
    uint64_t zSig64;
    uint32_t zSig;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    bSign = extractFloat32Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0xFF ) {
        /* `a' is Inf or NaN. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinity times zero is an invalid operation. */
        if ( ( bExp | bSig ) == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( bExp == 0xFF ) {
        /* `b' is Inf or NaN and `a' is finite. */
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Zero times infinity is an invalid operation. */
        if ( ( aExp | aSig ) == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( aExp == 0 ) {
        /* A zero factor gives a signed zero; subnormals are normalized. */
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x7F;
    /* With the leading ones made explicit, the significands sit at bit 30
     * (aSig) and bit 31 (bSig), so the top half of the 64-bit product
     * holds the result significand.
     */
    aSig = ( aSig | 0x00800000 )<<7;
    bSig = ( bSig | 0x00800000 )<<8;
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
    zSig = zSig64;
    /* If bit 30 is clear, shift left once and lower the exponent to
     * renormalize.
     */
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
        zSig <<= 1;
        --zExp;
    }
    return roundAndPackFloat32(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the single-precision floating-point value `a'
| by the corresponding value `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_div(float32 a, float32 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    bSign = extractFloat32Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0xFF ) {
        /* `a' is Inf or NaN. */
        if (aSig) {
            return propagateFloat32NaN(a, b, status);
        }
        if ( bExp == 0xFF ) {
            if (bSig) {
                return propagateFloat32NaN(a, b, status);
            }
            /* Infinity divided by infinity is an invalid operation. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Finite divided by infinity is a signed zero. */
        return packFloat32( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Zero divided by zero is invalid; any other finite value
             * divided by zero raises divide-by-zero and gives infinity.
             */
            if ( ( aExp | aSig ) == 0 ) {
                float_raise(float_flag_invalid, status);
                return float32_default_nan;
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat32( zSign, 0xFF, 0 );
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x7D;
    aSig = ( aSig | 0x00800000 )<<7;
    bSig = ( bSig | 0x00800000 )<<8;
    /* If the dividend significand is not smaller than the divisor's,
     * halve it and raise the exponent so the quotient fits in 32 bits.
     */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
    /* Only when the low six quotient bits are all zero does the remainder
     * need checking; a non-zero remainder sets the lowest bit as a sticky
     * marker.
     */
    if ( ( zSig & 0x3F ) == 0 ) {
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
    }
    return roundAndPackFloat32(zSign, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int_fast16_t aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Remainder of an infinite dividend is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan;
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Finite `a' with infinite `b': the result is `a' itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is an invalid operation. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan;
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: the quotient is computed with a single
         * 64-by-32-bit division.
         */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* Exponent gap below -1: `a' is returned unchanged. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce 62 exponent steps per iteration
         * using an estimated 64-bit quotient.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* NOTE(review): the estimate is lowered by 2, presumably so
             * it never exceeds the true quotient -- confirm against
             * estimateDiv128To64.
             */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig until the remainder turns negative, keeping
     * the last non-negative value in alternateASig.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Of the two candidates keep the one smaller in magnitude; on a tie
     * the parity of q decides.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);
    c = float32_squash_input_denormal(c, status);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* True when the product is zero times infinity, in either order. */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
    }

    /* No NaN operand: zero times infinity is an invalid operation. */
    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan;
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every result packed below. */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float32_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    /* Infinite product with finite addend: infinity, product sign. */
    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat32
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x00800000) << 7;
            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    /* Both factors are finite and non-zero here; normalize subnormals. */
    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    /* If bit 62 is clear, shift left once and lower the exponent. */
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig, status);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Bring c into the same layout: fraction shifted up so that the
     * explicit leading one is at bit 62.
     */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c dominates: subtract the other way and flip the sign. */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    if (flags & float_muladd_halve_result) {
        zExp--;
    }

    /* Reduce to the 32-bit significand taken by roundAndPackFloat32(). */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64, status);
}


/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_sqrt(float32 a, float_status *status)
{
    flag aSign;
    int_fast16_t aExp, zExp;
    uint32_t aSig, zSig;
    uint64_t rem, term;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* +Inf is its own square root; -Inf is an invalid operand. */
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan;
    }
    if ( aSign ) {
        /* -0 is returned unchanged; any other negative value is invalid. */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float32_zero;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent and rebias, using the one-less-than
     * exponent passed to roundAndPackFloat32() elsewhere in this file.
     */
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* Only when the low seven bits of the estimate are small is the exact
     * remainder computed to correct it.
     * NOTE(review): presumably the estimate's error bound makes other
     * cases safe to round as-is -- confirm against estimateSqrt32.
     */
    if ( ( zSig & 0x7F ) <= 5 ) {
        if ( zSig < 2 ) {
            /* Adding 2 wrapped around: use the largest significand. */
            zSig = 0x7FFFFFFF;
            goto roundAndPack;
        }
        aSig >>= aExp & 1;
        term = ( (uint64_t) zSig ) * zSig;
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
        /* Step the root down until the remainder is non-negative, using
         * (z-1)^2 = z^2 - (2(z-1) + 1).
         */
        while ( (int64_t) rem < 0 ) {
            --zSig;
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
        }
        /* A non-zero remainder sets the lowest bit as a sticky marker. */
        zSig |= ( rem != 0 );
    }
    shift32RightJamming( zSig, 1, &zSig );
 roundAndPack:
    return roundAndPackFloat32(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
2738 *----------------------------------------------------------------------------*/ 2739 2740 static const float64 float32_exp2_coefficients[15] = 2741 { 2742 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2743 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2744 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2745 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2746 const_float64( 0x3f81111111111111ll ), /* 5 */ 2747 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2748 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2749 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2750 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2751 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2752 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2753 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2754 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2755 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2756 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2757 }; 2758 2759 float32 float32_exp2(float32 a, float_status *status) 2760 { 2761 flag aSign; 2762 int_fast16_t aExp; 2763 uint32_t aSig; 2764 float64 r, x, xn; 2765 int i; 2766 a = float32_squash_input_denormal(a, status); 2767 2768 aSig = extractFloat32Frac( a ); 2769 aExp = extractFloat32Exp( a ); 2770 aSign = extractFloat32Sign( a ); 2771 2772 if ( aExp == 0xFF) { 2773 if (aSig) { 2774 return propagateFloat32NaN(a, float32_zero, status); 2775 } 2776 return (aSign) ? 
float32_zero : a; 2777 } 2778 if (aExp == 0) { 2779 if (aSig == 0) return float32_one; 2780 } 2781 2782 float_raise(float_flag_inexact, status); 2783 2784 /* ******************************* */ 2785 /* using float64 for approximation */ 2786 /* ******************************* */ 2787 x = float32_to_float64(a, status); 2788 x = float64_mul(x, float64_ln2, status); 2789 2790 xn = x; 2791 r = float64_one; 2792 for (i = 0 ; i < 15 ; i++) { 2793 float64 f; 2794 2795 f = float64_mul(xn, float32_exp2_coefficients[i], status); 2796 r = float64_add(r, f, status); 2797 2798 xn = float64_mul(xn, x, status); 2799 } 2800 2801 return float64_to_float32(r, status); 2802 } 2803 2804 /*---------------------------------------------------------------------------- 2805 | Returns the binary log of the single-precision floating-point value `a'. 2806 | The operation is performed according to the IEC/IEEE Standard for Binary 2807 | Floating-Point Arithmetic. 2808 *----------------------------------------------------------------------------*/ 2809 float32 float32_log2(float32 a, float_status *status) 2810 { 2811 flag aSign, zSign; 2812 int_fast16_t aExp; 2813 uint32_t aSig, zSig, i; 2814 2815 a = float32_squash_input_denormal(a, status); 2816 aSig = extractFloat32Frac( a ); 2817 aExp = extractFloat32Exp( a ); 2818 aSign = extractFloat32Sign( a ); 2819 2820 if ( aExp == 0 ) { 2821 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2822 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2823 } 2824 if ( aSign ) { 2825 float_raise(float_flag_invalid, status); 2826 return float32_default_nan; 2827 } 2828 if ( aExp == 0xFF ) { 2829 if (aSig) { 2830 return propagateFloat32NaN(a, float32_zero, status); 2831 } 2832 return a; 2833 } 2834 2835 aExp -= 0x7F; 2836 aSig |= 0x00800000; 2837 zSign = aExp < 0; 2838 zSig = aExp << 23; 2839 2840 for (i = 1 << 22; i > 0; i >>= 1) { 2841 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2842 if ( aSig & 0x01000000 ) { 2843 aSig >>= 1; 2844 zSig |= i; 2845 } 2846 } 2847 
2848 if ( zSign ) 2849 zSig = -zSig; 2850 2851 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 2852 } 2853 2854 /*---------------------------------------------------------------------------- 2855 | Returns 1 if the single-precision floating-point value `a' is equal to 2856 | the corresponding value `b', and 0 otherwise. The invalid exception is 2857 | raised if either operand is a NaN. Otherwise, the comparison is performed 2858 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2859 *----------------------------------------------------------------------------*/ 2860 2861 int float32_eq(float32 a, float32 b, float_status *status) 2862 { 2863 uint32_t av, bv; 2864 a = float32_squash_input_denormal(a, status); 2865 b = float32_squash_input_denormal(b, status); 2866 2867 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2868 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2869 ) { 2870 float_raise(float_flag_invalid, status); 2871 return 0; 2872 } 2873 av = float32_val(a); 2874 bv = float32_val(b); 2875 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2876 } 2877 2878 /*---------------------------------------------------------------------------- 2879 | Returns 1 if the single-precision floating-point value `a' is less than 2880 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2881 | exception is raised if either operand is a NaN. The comparison is performed 2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2883 *----------------------------------------------------------------------------*/ 2884 2885 int float32_le(float32 a, float32 b, float_status *status) 2886 { 2887 flag aSign, bSign; 2888 uint32_t av, bv; 2889 a = float32_squash_input_denormal(a, status); 2890 b = float32_squash_input_denormal(b, status); 2891 2892 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2893 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2894 ) { 2895 float_raise(float_flag_invalid, status); 2896 return 0; 2897 } 2898 aSign = extractFloat32Sign( a ); 2899 bSign = extractFloat32Sign( b ); 2900 av = float32_val(a); 2901 bv = float32_val(b); 2902 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2903 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2904 2905 } 2906 2907 /*---------------------------------------------------------------------------- 2908 | Returns 1 if the single-precision floating-point value `a' is less than 2909 | the corresponding value `b', and 0 otherwise. The invalid exception is 2910 | raised if either operand is a NaN. The comparison is performed according 2911 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2912 *----------------------------------------------------------------------------*/ 2913 2914 int float32_lt(float32 a, float32 b, float_status *status) 2915 { 2916 flag aSign, bSign; 2917 uint32_t av, bv; 2918 a = float32_squash_input_denormal(a, status); 2919 b = float32_squash_input_denormal(b, status); 2920 2921 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2922 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2923 ) { 2924 float_raise(float_flag_invalid, status); 2925 return 0; 2926 } 2927 aSign = extractFloat32Sign( a ); 2928 bSign = extractFloat32Sign( b ); 2929 av = float32_val(a); 2930 bv = float32_val(b); 2931 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2932 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2933 2934 } 2935 2936 /*---------------------------------------------------------------------------- 2937 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2938 | be compared, and 0 otherwise. The invalid exception is raised if either 2939 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2940 | Standard for Binary Floating-Point Arithmetic. 2941 *----------------------------------------------------------------------------*/ 2942 2943 int float32_unordered(float32 a, float32 b, float_status *status) 2944 { 2945 a = float32_squash_input_denormal(a, status); 2946 b = float32_squash_input_denormal(b, status); 2947 2948 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2949 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2950 ) { 2951 float_raise(float_flag_invalid, status); 2952 return 1; 2953 } 2954 return 0; 2955 } 2956 2957 /*---------------------------------------------------------------------------- 2958 | Returns 1 if the single-precision floating-point value `a' is equal to 2959 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2960 | exception. 
The comparison is performed according to the IEC/IEEE Standard 2961 | for Binary Floating-Point Arithmetic. 2962 *----------------------------------------------------------------------------*/ 2963 2964 int float32_eq_quiet(float32 a, float32 b, float_status *status) 2965 { 2966 a = float32_squash_input_denormal(a, status); 2967 b = float32_squash_input_denormal(b, status); 2968 2969 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2970 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2971 ) { 2972 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2973 float_raise(float_flag_invalid, status); 2974 } 2975 return 0; 2976 } 2977 return ( float32_val(a) == float32_val(b) ) || 2978 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 2979 } 2980 2981 /*---------------------------------------------------------------------------- 2982 | Returns 1 if the single-precision floating-point value `a' is less than or 2983 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 2984 | cause an exception. Otherwise, the comparison is performed according to the 2985 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2986 *----------------------------------------------------------------------------*/ 2987 2988 int float32_le_quiet(float32 a, float32 b, float_status *status) 2989 { 2990 flag aSign, bSign; 2991 uint32_t av, bv; 2992 a = float32_squash_input_denormal(a, status); 2993 b = float32_squash_input_denormal(b, status); 2994 2995 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2996 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2997 ) { 2998 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 2999 float_raise(float_flag_invalid, status); 3000 } 3001 return 0; 3002 } 3003 aSign = extractFloat32Sign( a ); 3004 bSign = extractFloat32Sign( b ); 3005 av = float32_val(a); 3006 bv = float32_val(b); 3007 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3008 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3009 3010 } 3011 3012 /*---------------------------------------------------------------------------- 3013 | Returns 1 if the single-precision floating-point value `a' is less than 3014 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3015 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3016 | Standard for Binary Floating-Point Arithmetic. 
3017 *----------------------------------------------------------------------------*/ 3018 3019 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3020 { 3021 flag aSign, bSign; 3022 uint32_t av, bv; 3023 a = float32_squash_input_denormal(a, status); 3024 b = float32_squash_input_denormal(b, status); 3025 3026 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3027 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3028 ) { 3029 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 3030 float_raise(float_flag_invalid, status); 3031 } 3032 return 0; 3033 } 3034 aSign = extractFloat32Sign( a ); 3035 bSign = extractFloat32Sign( b ); 3036 av = float32_val(a); 3037 bv = float32_val(b); 3038 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3039 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3040 3041 } 3042 3043 /*---------------------------------------------------------------------------- 3044 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3045 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3046 | comparison is performed according to the IEC/IEEE Standard for Binary 3047 | Floating-Point Arithmetic. 
3048 *----------------------------------------------------------------------------*/ 3049 3050 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3051 { 3052 a = float32_squash_input_denormal(a, status); 3053 b = float32_squash_input_denormal(b, status); 3054 3055 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3056 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3057 ) { 3058 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { 3059 float_raise(float_flag_invalid, status); 3060 } 3061 return 1; 3062 } 3063 return 0; 3064 } 3065 3066 /*---------------------------------------------------------------------------- 3067 | Returns the result of converting the double-precision floating-point value 3068 | `a' to the 32-bit two's complement integer format. The conversion is 3069 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3070 | Arithmetic---which means in particular that the conversion is rounded 3071 | according to the current rounding mode. If `a' is a NaN, the largest 3072 | positive integer is returned. Otherwise, if the conversion overflows, the 3073 | largest integer with the same sign as `a' is returned. 
3074 *----------------------------------------------------------------------------*/ 3075 3076 int32 float64_to_int32(float64 a, float_status *status) 3077 { 3078 flag aSign; 3079 int_fast16_t aExp, shiftCount; 3080 uint64_t aSig; 3081 a = float64_squash_input_denormal(a, status); 3082 3083 aSig = extractFloat64Frac( a ); 3084 aExp = extractFloat64Exp( a ); 3085 aSign = extractFloat64Sign( a ); 3086 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3087 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3088 shiftCount = 0x42C - aExp; 3089 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3090 return roundAndPackInt32(aSign, aSig, status); 3091 3092 } 3093 3094 /*---------------------------------------------------------------------------- 3095 | Returns the result of converting the double-precision floating-point value 3096 | `a' to the 32-bit two's complement integer format. The conversion is 3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3098 | Arithmetic, except that the conversion is always rounded toward zero. 3099 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3100 | the conversion overflows, the largest integer with the same sign as `a' is 3101 | returned. 
3102 *----------------------------------------------------------------------------*/ 3103 3104 int32 float64_to_int32_round_to_zero(float64 a, float_status *status) 3105 { 3106 flag aSign; 3107 int_fast16_t aExp, shiftCount; 3108 uint64_t aSig, savedASig; 3109 int32_t z; 3110 a = float64_squash_input_denormal(a, status); 3111 3112 aSig = extractFloat64Frac( a ); 3113 aExp = extractFloat64Exp( a ); 3114 aSign = extractFloat64Sign( a ); 3115 if ( 0x41E < aExp ) { 3116 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3117 goto invalid; 3118 } 3119 else if ( aExp < 0x3FF ) { 3120 if (aExp || aSig) { 3121 status->float_exception_flags |= float_flag_inexact; 3122 } 3123 return 0; 3124 } 3125 aSig |= LIT64( 0x0010000000000000 ); 3126 shiftCount = 0x433 - aExp; 3127 savedASig = aSig; 3128 aSig >>= shiftCount; 3129 z = aSig; 3130 if ( aSign ) z = - z; 3131 if ( ( z < 0 ) ^ aSign ) { 3132 invalid: 3133 float_raise(float_flag_invalid, status); 3134 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3135 } 3136 if ( ( aSig<<shiftCount ) != savedASig ) { 3137 status->float_exception_flags |= float_flag_inexact; 3138 } 3139 return z; 3140 3141 } 3142 3143 /*---------------------------------------------------------------------------- 3144 | Returns the result of converting the double-precision floating-point value 3145 | `a' to the 16-bit two's complement integer format. The conversion is 3146 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3147 | Arithmetic, except that the conversion is always rounded toward zero. 3148 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3149 | the conversion overflows, the largest integer with the same sign as `a' is 3150 | returned. 
3151 *----------------------------------------------------------------------------*/ 3152 3153 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3154 { 3155 flag aSign; 3156 int_fast16_t aExp, shiftCount; 3157 uint64_t aSig, savedASig; 3158 int32 z; 3159 3160 aSig = extractFloat64Frac( a ); 3161 aExp = extractFloat64Exp( a ); 3162 aSign = extractFloat64Sign( a ); 3163 if ( 0x40E < aExp ) { 3164 if ( ( aExp == 0x7FF ) && aSig ) { 3165 aSign = 0; 3166 } 3167 goto invalid; 3168 } 3169 else if ( aExp < 0x3FF ) { 3170 if ( aExp || aSig ) { 3171 status->float_exception_flags |= float_flag_inexact; 3172 } 3173 return 0; 3174 } 3175 aSig |= LIT64( 0x0010000000000000 ); 3176 shiftCount = 0x433 - aExp; 3177 savedASig = aSig; 3178 aSig >>= shiftCount; 3179 z = aSig; 3180 if ( aSign ) { 3181 z = - z; 3182 } 3183 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3184 invalid: 3185 float_raise(float_flag_invalid, status); 3186 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3187 } 3188 if ( ( aSig<<shiftCount ) != savedASig ) { 3189 status->float_exception_flags |= float_flag_inexact; 3190 } 3191 return z; 3192 } 3193 3194 /*---------------------------------------------------------------------------- 3195 | Returns the result of converting the double-precision floating-point value 3196 | `a' to the 64-bit two's complement integer format. The conversion is 3197 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3198 | Arithmetic---which means in particular that the conversion is rounded 3199 | according to the current rounding mode. If `a' is a NaN, the largest 3200 | positive integer is returned. Otherwise, if the conversion overflows, the 3201 | largest integer with the same sign as `a' is returned. 
3202 *----------------------------------------------------------------------------*/ 3203 3204 int64 float64_to_int64(float64 a, float_status *status) 3205 { 3206 flag aSign; 3207 int_fast16_t aExp, shiftCount; 3208 uint64_t aSig, aSigExtra; 3209 a = float64_squash_input_denormal(a, status); 3210 3211 aSig = extractFloat64Frac( a ); 3212 aExp = extractFloat64Exp( a ); 3213 aSign = extractFloat64Sign( a ); 3214 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3215 shiftCount = 0x433 - aExp; 3216 if ( shiftCount <= 0 ) { 3217 if ( 0x43E < aExp ) { 3218 float_raise(float_flag_invalid, status); 3219 if ( ! aSign 3220 || ( ( aExp == 0x7FF ) 3221 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3222 ) { 3223 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3224 } 3225 return (int64_t) LIT64( 0x8000000000000000 ); 3226 } 3227 aSigExtra = 0; 3228 aSig <<= - shiftCount; 3229 } 3230 else { 3231 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3232 } 3233 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3234 3235 } 3236 3237 /*---------------------------------------------------------------------------- 3238 | Returns the result of converting the double-precision floating-point value 3239 | `a' to the 64-bit two's complement integer format. The conversion is 3240 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3241 | Arithmetic, except that the conversion is always rounded toward zero. 3242 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3243 | the conversion overflows, the largest integer with the same sign as `a' is 3244 | returned. 
3245 *----------------------------------------------------------------------------*/ 3246 3247 int64 float64_to_int64_round_to_zero(float64 a, float_status *status) 3248 { 3249 flag aSign; 3250 int_fast16_t aExp, shiftCount; 3251 uint64_t aSig; 3252 int64 z; 3253 a = float64_squash_input_denormal(a, status); 3254 3255 aSig = extractFloat64Frac( a ); 3256 aExp = extractFloat64Exp( a ); 3257 aSign = extractFloat64Sign( a ); 3258 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3259 shiftCount = aExp - 0x433; 3260 if ( 0 <= shiftCount ) { 3261 if ( 0x43E <= aExp ) { 3262 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3263 float_raise(float_flag_invalid, status); 3264 if ( ! aSign 3265 || ( ( aExp == 0x7FF ) 3266 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3267 ) { 3268 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3269 } 3270 } 3271 return (int64_t) LIT64( 0x8000000000000000 ); 3272 } 3273 z = aSig<<shiftCount; 3274 } 3275 else { 3276 if ( aExp < 0x3FE ) { 3277 if (aExp | aSig) { 3278 status->float_exception_flags |= float_flag_inexact; 3279 } 3280 return 0; 3281 } 3282 z = aSig>>( - shiftCount ); 3283 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3284 status->float_exception_flags |= float_flag_inexact; 3285 } 3286 } 3287 if ( aSign ) z = - z; 3288 return z; 3289 3290 } 3291 3292 /*---------------------------------------------------------------------------- 3293 | Returns the result of converting the double-precision floating-point value 3294 | `a' to the single-precision floating-point format. The conversion is 3295 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3296 | Arithmetic. 
3297 *----------------------------------------------------------------------------*/ 3298 3299 float32 float64_to_float32(float64 a, float_status *status) 3300 { 3301 flag aSign; 3302 int_fast16_t aExp; 3303 uint64_t aSig; 3304 uint32_t zSig; 3305 a = float64_squash_input_denormal(a, status); 3306 3307 aSig = extractFloat64Frac( a ); 3308 aExp = extractFloat64Exp( a ); 3309 aSign = extractFloat64Sign( a ); 3310 if ( aExp == 0x7FF ) { 3311 if (aSig) { 3312 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3313 } 3314 return packFloat32( aSign, 0xFF, 0 ); 3315 } 3316 shift64RightJamming( aSig, 22, &aSig ); 3317 zSig = aSig; 3318 if ( aExp || zSig ) { 3319 zSig |= 0x40000000; 3320 aExp -= 0x381; 3321 } 3322 return roundAndPackFloat32(aSign, aExp, zSig, status); 3323 3324 } 3325 3326 3327 /*---------------------------------------------------------------------------- 3328 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3329 | half-precision floating-point value, returning the result. After being 3330 | shifted into the proper positions, the three fields are simply added 3331 | together to form the result. This means that any integer portion of `zSig' 3332 | will be added into the exponent. Since a properly normalized significand 3333 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3334 | than the desired result exponent whenever `zSig' is a complete, normalized 3335 | significand. 
3336 *----------------------------------------------------------------------------*/ 3337 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig) 3338 { 3339 return make_float16( 3340 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3341 } 3342 3343 /*---------------------------------------------------------------------------- 3344 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3345 | and significand `zSig', and returns the proper half-precision floating- 3346 | point value corresponding to the abstract input. Ordinarily, the abstract 3347 | value is simply rounded and packed into the half-precision format, with 3348 | the inexact exception raised if the abstract input cannot be represented 3349 | exactly. However, if the abstract value is too large, the overflow and 3350 | inexact exceptions are raised and an infinity or maximal finite value is 3351 | returned. If the abstract value is too small, the input value is rounded to 3352 | a subnormal number, and the underflow and inexact exceptions are raised if 3353 | the abstract input cannot be represented exactly as a subnormal half- 3354 | precision floating-point number. 3355 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3356 | ARM-style "alternative representation", which omits the NaN and Inf 3357 | encodings in order to raise the maximum representable exponent by one. 3358 | The input significand `zSig' has its binary point between bits 22 3359 | and 23, which is 13 bits to the left of the usual location. This shifted 3360 | significand must be normalized or smaller. If `zSig' is not normalized, 3361 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3362 | and it must not require rounding. In the usual case that `zSig' is 3363 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3364 | Note the slightly odd position of the binary point in zSig compared with the 3365 | other roundAndPackFloat functions. This should probably be fixed if we 3366 | need to implement more float16 routines than just conversion. 3367 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3368 | Binary Floating-Point Arithmetic. 3369 *----------------------------------------------------------------------------*/ 3370 3371 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp, 3372 uint32_t zSig, flag ieee, 3373 float_status *status) 3374 { 3375 int maxexp = ieee ? 29 : 30; 3376 uint32_t mask; 3377 uint32_t increment; 3378 bool rounding_bumps_exp; 3379 bool is_tiny = false; 3380 3381 /* Calculate the mask of bits of the mantissa which are not 3382 * representable in half-precision and will be lost. 3383 */ 3384 if (zExp < 1) { 3385 /* Will be denormal in halfprec */ 3386 mask = 0x00ffffff; 3387 if (zExp >= -11) { 3388 mask >>= 11 + zExp; 3389 } 3390 } else { 3391 /* Normal number in halfprec */ 3392 mask = 0x00001fff; 3393 } 3394 3395 switch (status->float_rounding_mode) { 3396 case float_round_nearest_even: 3397 increment = (mask + 1) >> 1; 3398 if ((zSig & mask) == increment) { 3399 increment = zSig & (increment << 1); 3400 } 3401 break; 3402 case float_round_ties_away: 3403 increment = (mask + 1) >> 1; 3404 break; 3405 case float_round_up: 3406 increment = zSign ? 0 : mask; 3407 break; 3408 case float_round_down: 3409 increment = zSign ? 
mask : 0; 3410 break; 3411 default: /* round_to_zero */ 3412 increment = 0; 3413 break; 3414 } 3415 3416 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3417 3418 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3419 if (ieee) { 3420 float_raise(float_flag_overflow | float_flag_inexact, status); 3421 return packFloat16(zSign, 0x1f, 0); 3422 } else { 3423 float_raise(float_flag_invalid, status); 3424 return packFloat16(zSign, 0x1f, 0x3ff); 3425 } 3426 } 3427 3428 if (zExp < 0) { 3429 /* Note that flush-to-zero does not affect half-precision results */ 3430 is_tiny = 3431 (status->float_detect_tininess == float_tininess_before_rounding) 3432 || (zExp < -1) 3433 || (!rounding_bumps_exp); 3434 } 3435 if (zSig & mask) { 3436 float_raise(float_flag_inexact, status); 3437 if (is_tiny) { 3438 float_raise(float_flag_underflow, status); 3439 } 3440 } 3441 3442 zSig += increment; 3443 if (rounding_bumps_exp) { 3444 zSig >>= 1; 3445 zExp++; 3446 } 3447 3448 if (zExp < -10) { 3449 return packFloat16(zSign, 0, 0); 3450 } 3451 if (zExp < 0) { 3452 zSig >>= -zExp; 3453 zExp = 0; 3454 } 3455 return packFloat16(zSign, zExp, zSig >> 13); 3456 } 3457 3458 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, 3459 uint32_t *zSigPtr) 3460 { 3461 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3462 *zSigPtr = aSig << shiftCount; 3463 *zExpPtr = 1 - shiftCount; 3464 } 3465 3466 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3467 The latter gains extra exponent range by omitting the NaN/Inf encodings. 
*/ 3468 3469 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3470 { 3471 flag aSign; 3472 int_fast16_t aExp; 3473 uint32_t aSig; 3474 3475 aSign = extractFloat16Sign(a); 3476 aExp = extractFloat16Exp(a); 3477 aSig = extractFloat16Frac(a); 3478 3479 if (aExp == 0x1f && ieee) { 3480 if (aSig) { 3481 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3482 } 3483 return packFloat32(aSign, 0xff, 0); 3484 } 3485 if (aExp == 0) { 3486 if (aSig == 0) { 3487 return packFloat32(aSign, 0, 0); 3488 } 3489 3490 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3491 aExp--; 3492 } 3493 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3494 } 3495 3496 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3497 { 3498 flag aSign; 3499 int_fast16_t aExp; 3500 uint32_t aSig; 3501 3502 a = float32_squash_input_denormal(a, status); 3503 3504 aSig = extractFloat32Frac( a ); 3505 aExp = extractFloat32Exp( a ); 3506 aSign = extractFloat32Sign( a ); 3507 if ( aExp == 0xFF ) { 3508 if (aSig) { 3509 /* Input is a NaN */ 3510 if (!ieee) { 3511 float_raise(float_flag_invalid, status); 3512 return packFloat16(aSign, 0, 0); 3513 } 3514 return commonNaNToFloat16( 3515 float32ToCommonNaN(a, status), status); 3516 } 3517 /* Infinity */ 3518 if (!ieee) { 3519 float_raise(float_flag_invalid, status); 3520 return packFloat16(aSign, 0x1f, 0x3ff); 3521 } 3522 return packFloat16(aSign, 0x1f, 0); 3523 } 3524 if (aExp == 0 && aSig == 0) { 3525 return packFloat16(aSign, 0, 0); 3526 } 3527 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3528 * even if the input is denormal; however this is harmless because 3529 * the largest possible single-precision denormal is still smaller 3530 * than the smallest representable half-precision denormal, and so we 3531 * will end up ignoring aSig and returning via the "always return zero" 3532 * codepath. 
3533 */ 3534 aSig |= 0x00800000; 3535 aExp -= 0x71; 3536 3537 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3538 } 3539 3540 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3541 { 3542 flag aSign; 3543 int_fast16_t aExp; 3544 uint32_t aSig; 3545 3546 aSign = extractFloat16Sign(a); 3547 aExp = extractFloat16Exp(a); 3548 aSig = extractFloat16Frac(a); 3549 3550 if (aExp == 0x1f && ieee) { 3551 if (aSig) { 3552 return commonNaNToFloat64( 3553 float16ToCommonNaN(a, status), status); 3554 } 3555 return packFloat64(aSign, 0x7ff, 0); 3556 } 3557 if (aExp == 0) { 3558 if (aSig == 0) { 3559 return packFloat64(aSign, 0, 0); 3560 } 3561 3562 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3563 aExp--; 3564 } 3565 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3566 } 3567 3568 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3569 { 3570 flag aSign; 3571 int_fast16_t aExp; 3572 uint64_t aSig; 3573 uint32_t zSig; 3574 3575 a = float64_squash_input_denormal(a, status); 3576 3577 aSig = extractFloat64Frac(a); 3578 aExp = extractFloat64Exp(a); 3579 aSign = extractFloat64Sign(a); 3580 if (aExp == 0x7FF) { 3581 if (aSig) { 3582 /* Input is a NaN */ 3583 if (!ieee) { 3584 float_raise(float_flag_invalid, status); 3585 return packFloat16(aSign, 0, 0); 3586 } 3587 return commonNaNToFloat16( 3588 float64ToCommonNaN(a, status), status); 3589 } 3590 /* Infinity */ 3591 if (!ieee) { 3592 float_raise(float_flag_invalid, status); 3593 return packFloat16(aSign, 0x1f, 0x3ff); 3594 } 3595 return packFloat16(aSign, 0x1f, 0); 3596 } 3597 shift64RightJamming(aSig, 29, &aSig); 3598 zSig = aSig; 3599 if (aExp == 0 && zSig == 0) { 3600 return packFloat16(aSign, 0, 0); 3601 } 3602 /* Decimal point between bits 22 and 23. 
Note that we add the 1 bit 3603 * even if the input is denormal; however this is harmless because 3604 * the largest possible single-precision denormal is still smaller 3605 * than the smallest representable half-precision denormal, and so we 3606 * will end up ignoring aSig and returning via the "always return zero" 3607 * codepath. 3608 */ 3609 zSig |= 0x00800000; 3610 aExp -= 0x3F1; 3611 3612 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3613 } 3614 3615 /*---------------------------------------------------------------------------- 3616 | Returns the result of converting the double-precision floating-point value 3617 | `a' to the extended double-precision floating-point format. The conversion 3618 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3619 | Arithmetic. 3620 *----------------------------------------------------------------------------*/ 3621 3622 floatx80 float64_to_floatx80(float64 a, float_status *status) 3623 { 3624 flag aSign; 3625 int_fast16_t aExp; 3626 uint64_t aSig; 3627 3628 a = float64_squash_input_denormal(a, status); 3629 aSig = extractFloat64Frac( a ); 3630 aExp = extractFloat64Exp( a ); 3631 aSign = extractFloat64Sign( a ); 3632 if ( aExp == 0x7FF ) { 3633 if (aSig) { 3634 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3635 } 3636 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3637 } 3638 if ( aExp == 0 ) { 3639 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3640 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3641 } 3642 return 3643 packFloatx80( 3644 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3645 3646 } 3647 3648 /*---------------------------------------------------------------------------- 3649 | Returns the result of converting the double-precision floating-point value 3650 | `a' to the quadruple-precision floating-point format. 
The conversion is 3651 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3652 | Arithmetic. 3653 *----------------------------------------------------------------------------*/ 3654 3655 float128 float64_to_float128(float64 a, float_status *status) 3656 { 3657 flag aSign; 3658 int_fast16_t aExp; 3659 uint64_t aSig, zSig0, zSig1; 3660 3661 a = float64_squash_input_denormal(a, status); 3662 aSig = extractFloat64Frac( a ); 3663 aExp = extractFloat64Exp( a ); 3664 aSign = extractFloat64Sign( a ); 3665 if ( aExp == 0x7FF ) { 3666 if (aSig) { 3667 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3668 } 3669 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3670 } 3671 if ( aExp == 0 ) { 3672 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3673 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3674 --aExp; 3675 } 3676 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3677 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3678 3679 } 3680 3681 /*---------------------------------------------------------------------------- 3682 | Rounds the double-precision floating-point value `a' to an integer, and 3683 | returns the result as a double-precision floating-point value. The 3684 | operation is performed according to the IEC/IEEE Standard for Binary 3685 | Floating-Point Arithmetic. 
3686 *----------------------------------------------------------------------------*/ 3687 3688 float64 float64_round_to_int(float64 a, float_status *status) 3689 { 3690 flag aSign; 3691 int_fast16_t aExp; 3692 uint64_t lastBitMask, roundBitsMask; 3693 uint64_t z; 3694 a = float64_squash_input_denormal(a, status); 3695 3696 aExp = extractFloat64Exp( a ); 3697 if ( 0x433 <= aExp ) { 3698 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3699 return propagateFloat64NaN(a, a, status); 3700 } 3701 return a; 3702 } 3703 if ( aExp < 0x3FF ) { 3704 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3705 status->float_exception_flags |= float_flag_inexact; 3706 aSign = extractFloat64Sign( a ); 3707 switch (status->float_rounding_mode) { 3708 case float_round_nearest_even: 3709 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3710 return packFloat64( aSign, 0x3FF, 0 ); 3711 } 3712 break; 3713 case float_round_ties_away: 3714 if (aExp == 0x3FE) { 3715 return packFloat64(aSign, 0x3ff, 0); 3716 } 3717 break; 3718 case float_round_down: 3719 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3720 case float_round_up: 3721 return make_float64( 3722 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3723 } 3724 return packFloat64( aSign, 0, 0 ); 3725 } 3726 lastBitMask = 1; 3727 lastBitMask <<= 0x433 - aExp; 3728 roundBitsMask = lastBitMask - 1; 3729 z = float64_val(a); 3730 switch (status->float_rounding_mode) { 3731 case float_round_nearest_even: 3732 z += lastBitMask >> 1; 3733 if ((z & roundBitsMask) == 0) { 3734 z &= ~lastBitMask; 3735 } 3736 break; 3737 case float_round_ties_away: 3738 z += lastBitMask >> 1; 3739 break; 3740 case float_round_to_zero: 3741 break; 3742 case float_round_up: 3743 if (!extractFloat64Sign(make_float64(z))) { 3744 z += roundBitsMask; 3745 } 3746 break; 3747 case float_round_down: 3748 if (extractFloat64Sign(make_float64(z))) { 3749 z += roundBitsMask; 3750 } 3751 break; 3752 default: 3753 abort(); 3754 } 3755 z &= ~ roundBitsMask; 3756 if (z != float64_val(a)) { 3757 status->float_exception_flags |= float_flag_inexact; 3758 } 3759 return make_float64(z); 3760 3761 } 3762 3763 float64 float64_trunc_to_int(float64 a, float_status *status) 3764 { 3765 int oldmode; 3766 float64 res; 3767 oldmode = status->float_rounding_mode; 3768 status->float_rounding_mode = float_round_to_zero; 3769 res = float64_round_to_int(a, status); 3770 status->float_rounding_mode = oldmode; 3771 return res; 3772 } 3773 3774 /*---------------------------------------------------------------------------- 3775 | Returns the result of adding the absolute values of the double-precision 3776 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3777 | before being returned. `zSign' is ignored if the result is a NaN. 3778 | The addition is performed according to the IEC/IEEE Standard for Binary 3779 | Floating-Point Arithmetic. 
3780 *----------------------------------------------------------------------------*/ 3781 3782 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3783 float_status *status) 3784 { 3785 int_fast16_t aExp, bExp, zExp; 3786 uint64_t aSig, bSig, zSig; 3787 int_fast16_t expDiff; 3788 3789 aSig = extractFloat64Frac( a ); 3790 aExp = extractFloat64Exp( a ); 3791 bSig = extractFloat64Frac( b ); 3792 bExp = extractFloat64Exp( b ); 3793 expDiff = aExp - bExp; 3794 aSig <<= 9; 3795 bSig <<= 9; 3796 if ( 0 < expDiff ) { 3797 if ( aExp == 0x7FF ) { 3798 if (aSig) { 3799 return propagateFloat64NaN(a, b, status); 3800 } 3801 return a; 3802 } 3803 if ( bExp == 0 ) { 3804 --expDiff; 3805 } 3806 else { 3807 bSig |= LIT64( 0x2000000000000000 ); 3808 } 3809 shift64RightJamming( bSig, expDiff, &bSig ); 3810 zExp = aExp; 3811 } 3812 else if ( expDiff < 0 ) { 3813 if ( bExp == 0x7FF ) { 3814 if (bSig) { 3815 return propagateFloat64NaN(a, b, status); 3816 } 3817 return packFloat64( zSign, 0x7FF, 0 ); 3818 } 3819 if ( aExp == 0 ) { 3820 ++expDiff; 3821 } 3822 else { 3823 aSig |= LIT64( 0x2000000000000000 ); 3824 } 3825 shift64RightJamming( aSig, - expDiff, &aSig ); 3826 zExp = bExp; 3827 } 3828 else { 3829 if ( aExp == 0x7FF ) { 3830 if (aSig | bSig) { 3831 return propagateFloat64NaN(a, b, status); 3832 } 3833 return a; 3834 } 3835 if ( aExp == 0 ) { 3836 if (status->flush_to_zero) { 3837 if (aSig | bSig) { 3838 float_raise(float_flag_output_denormal, status); 3839 } 3840 return packFloat64(zSign, 0, 0); 3841 } 3842 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3843 } 3844 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3845 zExp = aExp; 3846 goto roundAndPack; 3847 } 3848 aSig |= LIT64( 0x2000000000000000 ); 3849 zSig = ( aSig + bSig )<<1; 3850 --zExp; 3851 if ( (int64_t) zSig < 0 ) { 3852 zSig = aSig + bSig; 3853 ++zExp; 3854 } 3855 roundAndPack: 3856 return roundAndPackFloat64(zSign, zExp, zSig, status); 3857 3858 } 3859 3860 
/*---------------------------------------------------------------------------- 3861 | Returns the result of subtracting the absolute values of the double- 3862 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3863 | difference is negated before being returned. `zSign' is ignored if the 3864 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3865 | Standard for Binary Floating-Point Arithmetic. 3866 *----------------------------------------------------------------------------*/ 3867 3868 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, 3869 float_status *status) 3870 { 3871 int_fast16_t aExp, bExp, zExp; 3872 uint64_t aSig, bSig, zSig; 3873 int_fast16_t expDiff; 3874 3875 aSig = extractFloat64Frac( a ); 3876 aExp = extractFloat64Exp( a ); 3877 bSig = extractFloat64Frac( b ); 3878 bExp = extractFloat64Exp( b ); 3879 expDiff = aExp - bExp; 3880 aSig <<= 10; 3881 bSig <<= 10; 3882 if ( 0 < expDiff ) goto aExpBigger; 3883 if ( expDiff < 0 ) goto bExpBigger; 3884 if ( aExp == 0x7FF ) { 3885 if (aSig | bSig) { 3886 return propagateFloat64NaN(a, b, status); 3887 } 3888 float_raise(float_flag_invalid, status); 3889 return float64_default_nan; 3890 } 3891 if ( aExp == 0 ) { 3892 aExp = 1; 3893 bExp = 1; 3894 } 3895 if ( bSig < aSig ) goto aBigger; 3896 if ( aSig < bSig ) goto bBigger; 3897 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); 3898 bExpBigger: 3899 if ( bExp == 0x7FF ) { 3900 if (bSig) { 3901 return propagateFloat64NaN(a, b, status); 3902 } 3903 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3904 } 3905 if ( aExp == 0 ) { 3906 ++expDiff; 3907 } 3908 else { 3909 aSig |= LIT64( 0x4000000000000000 ); 3910 } 3911 shift64RightJamming( aSig, - expDiff, &aSig ); 3912 bSig |= LIT64( 0x4000000000000000 ); 3913 bBigger: 3914 zSig = bSig - aSig; 3915 zExp = bExp; 3916 zSign ^= 1; 3917 goto normalizeRoundAndPack; 3918 aExpBigger: 3919 if ( aExp == 0x7FF ) { 3920 if (aSig) { 3921 return 
propagateFloat64NaN(a, b, status); 3922 } 3923 return a; 3924 } 3925 if ( bExp == 0 ) { 3926 --expDiff; 3927 } 3928 else { 3929 bSig |= LIT64( 0x4000000000000000 ); 3930 } 3931 shift64RightJamming( bSig, expDiff, &bSig ); 3932 aSig |= LIT64( 0x4000000000000000 ); 3933 aBigger: 3934 zSig = aSig - bSig; 3935 zExp = aExp; 3936 normalizeRoundAndPack: 3937 --zExp; 3938 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); 3939 3940 } 3941 3942 /*---------------------------------------------------------------------------- 3943 | Returns the result of adding the double-precision floating-point values `a' 3944 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3945 | Binary Floating-Point Arithmetic. 3946 *----------------------------------------------------------------------------*/ 3947 3948 float64 float64_add(float64 a, float64 b, float_status *status) 3949 { 3950 flag aSign, bSign; 3951 a = float64_squash_input_denormal(a, status); 3952 b = float64_squash_input_denormal(b, status); 3953 3954 aSign = extractFloat64Sign( a ); 3955 bSign = extractFloat64Sign( b ); 3956 if ( aSign == bSign ) { 3957 return addFloat64Sigs(a, b, aSign, status); 3958 } 3959 else { 3960 return subFloat64Sigs(a, b, aSign, status); 3961 } 3962 3963 } 3964 3965 /*---------------------------------------------------------------------------- 3966 | Returns the result of subtracting the double-precision floating-point values 3967 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3968 | for Binary Floating-Point Arithmetic. 
3969 *----------------------------------------------------------------------------*/ 3970 3971 float64 float64_sub(float64 a, float64 b, float_status *status) 3972 { 3973 flag aSign, bSign; 3974 a = float64_squash_input_denormal(a, status); 3975 b = float64_squash_input_denormal(b, status); 3976 3977 aSign = extractFloat64Sign( a ); 3978 bSign = extractFloat64Sign( b ); 3979 if ( aSign == bSign ) { 3980 return subFloat64Sigs(a, b, aSign, status); 3981 } 3982 else { 3983 return addFloat64Sigs(a, b, aSign, status); 3984 } 3985 3986 } 3987 3988 /*---------------------------------------------------------------------------- 3989 | Returns the result of multiplying the double-precision floating-point values 3990 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 3991 | for Binary Floating-Point Arithmetic. 3992 *----------------------------------------------------------------------------*/ 3993 3994 float64 float64_mul(float64 a, float64 b, float_status *status) 3995 { 3996 flag aSign, bSign, zSign; 3997 int_fast16_t aExp, bExp, zExp; 3998 uint64_t aSig, bSig, zSig0, zSig1; 3999 4000 a = float64_squash_input_denormal(a, status); 4001 b = float64_squash_input_denormal(b, status); 4002 4003 aSig = extractFloat64Frac( a ); 4004 aExp = extractFloat64Exp( a ); 4005 aSign = extractFloat64Sign( a ); 4006 bSig = extractFloat64Frac( b ); 4007 bExp = extractFloat64Exp( b ); 4008 bSign = extractFloat64Sign( b ); 4009 zSign = aSign ^ bSign; 4010 if ( aExp == 0x7FF ) { 4011 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4012 return propagateFloat64NaN(a, b, status); 4013 } 4014 if ( ( bExp | bSig ) == 0 ) { 4015 float_raise(float_flag_invalid, status); 4016 return float64_default_nan; 4017 } 4018 return packFloat64( zSign, 0x7FF, 0 ); 4019 } 4020 if ( bExp == 0x7FF ) { 4021 if (bSig) { 4022 return propagateFloat64NaN(a, b, status); 4023 } 4024 if ( ( aExp | aSig ) == 0 ) { 4025 float_raise(float_flag_invalid, status); 4026 return float64_default_nan; 4027 } 
4028 return packFloat64( zSign, 0x7FF, 0 ); 4029 } 4030 if ( aExp == 0 ) { 4031 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4032 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4033 } 4034 if ( bExp == 0 ) { 4035 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4036 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4037 } 4038 zExp = aExp + bExp - 0x3FF; 4039 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4040 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4041 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4042 zSig0 |= ( zSig1 != 0 ); 4043 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4044 zSig0 <<= 1; 4045 --zExp; 4046 } 4047 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4048 4049 } 4050 4051 /*---------------------------------------------------------------------------- 4052 | Returns the result of dividing the double-precision floating-point value `a' 4053 | by the corresponding value `b'. The operation is performed according to 4054 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4055 *----------------------------------------------------------------------------*/ 4056 4057 float64 float64_div(float64 a, float64 b, float_status *status) 4058 { 4059 flag aSign, bSign, zSign; 4060 int_fast16_t aExp, bExp, zExp; 4061 uint64_t aSig, bSig, zSig; 4062 uint64_t rem0, rem1; 4063 uint64_t term0, term1; 4064 a = float64_squash_input_denormal(a, status); 4065 b = float64_squash_input_denormal(b, status); 4066 4067 aSig = extractFloat64Frac( a ); 4068 aExp = extractFloat64Exp( a ); 4069 aSign = extractFloat64Sign( a ); 4070 bSig = extractFloat64Frac( b ); 4071 bExp = extractFloat64Exp( b ); 4072 bSign = extractFloat64Sign( b ); 4073 zSign = aSign ^ bSign; 4074 if ( aExp == 0x7FF ) { 4075 if (aSig) { 4076 return propagateFloat64NaN(a, b, status); 4077 } 4078 if ( bExp == 0x7FF ) { 4079 if (bSig) { 4080 return propagateFloat64NaN(a, b, status); 4081 } 4082 float_raise(float_flag_invalid, status); 4083 return float64_default_nan; 4084 } 4085 return packFloat64( zSign, 0x7FF, 0 ); 4086 } 4087 if ( bExp == 0x7FF ) { 4088 if (bSig) { 4089 return propagateFloat64NaN(a, b, status); 4090 } 4091 return packFloat64( zSign, 0, 0 ); 4092 } 4093 if ( bExp == 0 ) { 4094 if ( bSig == 0 ) { 4095 if ( ( aExp | aSig ) == 0 ) { 4096 float_raise(float_flag_invalid, status); 4097 return float64_default_nan; 4098 } 4099 float_raise(float_flag_divbyzero, status); 4100 return packFloat64( zSign, 0x7FF, 0 ); 4101 } 4102 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4103 } 4104 if ( aExp == 0 ) { 4105 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4106 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4107 } 4108 zExp = aExp - bExp + 0x3FD; 4109 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4110 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4111 if ( bSig <= ( aSig + aSig ) ) { 4112 aSig >>= 1; 4113 ++zExp; 4114 } 4115 zSig = estimateDiv128To64( aSig, 0, bSig ); 4116 if ( ( zSig & 0x1FF ) <= 2 ) { 4117 mul64To128( bSig, zSig, &term0, &term1 ); 4118 sub128( aSig, 
0, term0, term1, &rem0, &rem1 ); 4119 while ( (int64_t) rem0 < 0 ) { 4120 --zSig; 4121 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4122 } 4123 zSig |= ( rem1 != 0 ); 4124 } 4125 return roundAndPackFloat64(zSign, zExp, zSig, status); 4126 4127 } 4128 4129 /*---------------------------------------------------------------------------- 4130 | Returns the remainder of the double-precision floating-point value `a' 4131 | with respect to the corresponding value `b'. The operation is performed 4132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4133 *----------------------------------------------------------------------------*/ 4134 4135 float64 float64_rem(float64 a, float64 b, float_status *status) 4136 { 4137 flag aSign, zSign; 4138 int_fast16_t aExp, bExp, expDiff; 4139 uint64_t aSig, bSig; 4140 uint64_t q, alternateASig; 4141 int64_t sigMean; 4142 4143 a = float64_squash_input_denormal(a, status); 4144 b = float64_squash_input_denormal(b, status); 4145 aSig = extractFloat64Frac( a ); 4146 aExp = extractFloat64Exp( a ); 4147 aSign = extractFloat64Sign( a ); 4148 bSig = extractFloat64Frac( b ); 4149 bExp = extractFloat64Exp( b ); 4150 if ( aExp == 0x7FF ) { 4151 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4152 return propagateFloat64NaN(a, b, status); 4153 } 4154 float_raise(float_flag_invalid, status); 4155 return float64_default_nan; 4156 } 4157 if ( bExp == 0x7FF ) { 4158 if (bSig) { 4159 return propagateFloat64NaN(a, b, status); 4160 } 4161 return a; 4162 } 4163 if ( bExp == 0 ) { 4164 if ( bSig == 0 ) { 4165 float_raise(float_flag_invalid, status); 4166 return float64_default_nan; 4167 } 4168 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4169 } 4170 if ( aExp == 0 ) { 4171 if ( aSig == 0 ) return a; 4172 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4173 } 4174 expDiff = aExp - bExp; 4175 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4176 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4177 if ( expDiff < 0 ) { 4178 if 
( expDiff < -1 ) return a; 4179 aSig >>= 1; 4180 } 4181 q = ( bSig <= aSig ); 4182 if ( q ) aSig -= bSig; 4183 expDiff -= 64; 4184 while ( 0 < expDiff ) { 4185 q = estimateDiv128To64( aSig, 0, bSig ); 4186 q = ( 2 < q ) ? q - 2 : 0; 4187 aSig = - ( ( bSig>>2 ) * q ); 4188 expDiff -= 62; 4189 } 4190 expDiff += 64; 4191 if ( 0 < expDiff ) { 4192 q = estimateDiv128To64( aSig, 0, bSig ); 4193 q = ( 2 < q ) ? q - 2 : 0; 4194 q >>= 64 - expDiff; 4195 bSig >>= 2; 4196 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4197 } 4198 else { 4199 aSig >>= 2; 4200 bSig >>= 2; 4201 } 4202 do { 4203 alternateASig = aSig; 4204 ++q; 4205 aSig -= bSig; 4206 } while ( 0 <= (int64_t) aSig ); 4207 sigMean = aSig + alternateASig; 4208 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4209 aSig = alternateASig; 4210 } 4211 zSign = ( (int64_t) aSig < 0 ); 4212 if ( zSign ) aSig = - aSig; 4213 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4214 4215 } 4216 4217 /*---------------------------------------------------------------------------- 4218 | Returns the result of multiplying the double-precision floating-point values 4219 | `a' and `b' then adding 'c', with no intermediate rounding step after the 4220 | multiplication. The operation is performed according to the IEC/IEEE 4221 | Standard for Binary Floating-Point Arithmetic 754-2008. 4222 | The flags argument allows the caller to select negation of the 4223 | addend, the intermediate product, or the final result. (The difference 4224 | between this and having the caller do a separate negation is that negating 4225 | externally will flip the sign bit on NaNs.) 
4226 *----------------------------------------------------------------------------*/ 4227 4228 float64 float64_muladd(float64 a, float64 b, float64 c, int flags, 4229 float_status *status) 4230 { 4231 flag aSign, bSign, cSign, zSign; 4232 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff; 4233 uint64_t aSig, bSig, cSig; 4234 flag pInf, pZero, pSign; 4235 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; 4236 int shiftcount; 4237 flag signflip, infzero; 4238 4239 a = float64_squash_input_denormal(a, status); 4240 b = float64_squash_input_denormal(b, status); 4241 c = float64_squash_input_denormal(c, status); 4242 aSig = extractFloat64Frac(a); 4243 aExp = extractFloat64Exp(a); 4244 aSign = extractFloat64Sign(a); 4245 bSig = extractFloat64Frac(b); 4246 bExp = extractFloat64Exp(b); 4247 bSign = extractFloat64Sign(b); 4248 cSig = extractFloat64Frac(c); 4249 cExp = extractFloat64Exp(c); 4250 cSign = extractFloat64Sign(c); 4251 4252 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || 4253 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); 4254 4255 /* It is implementation-defined whether the cases of (0,inf,qnan) 4256 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 4257 * they return if they do), so we have to hand this information 4258 * off to the target-specific pick-a-NaN routine. 4259 */ 4260 if (((aExp == 0x7ff) && aSig) || 4261 ((bExp == 0x7ff) && bSig) || 4262 ((cExp == 0x7ff) && cSig)) { 4263 return propagateFloat64MulAddNaN(a, b, c, infzero, status); 4264 } 4265 4266 if (infzero) { 4267 float_raise(float_flag_invalid, status); 4268 return float64_default_nan; 4269 } 4270 4271 if (flags & float_muladd_negate_c) { 4272 cSign ^= 1; 4273 } 4274 4275 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 4276 4277 /* Work out the sign and type of the product */ 4278 pSign = aSign ^ bSign; 4279 if (flags & float_muladd_negate_product) { 4280 pSign ^= 1; 4281 } 4282 pInf = (aExp == 0x7ff) || (bExp == 0x7ff); 4283 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 4284 4285 if (cExp == 0x7ff) { 4286 if (pInf && (pSign ^ cSign)) { 4287 /* addition of opposite-signed infinities => InvalidOperation */ 4288 float_raise(float_flag_invalid, status); 4289 return float64_default_nan; 4290 } 4291 /* Otherwise generate an infinity of the same sign */ 4292 return packFloat64(cSign ^ signflip, 0x7ff, 0); 4293 } 4294 4295 if (pInf) { 4296 return packFloat64(pSign ^ signflip, 0x7ff, 0); 4297 } 4298 4299 if (pZero) { 4300 if (cExp == 0) { 4301 if (cSig == 0) { 4302 /* Adding two exact zeroes */ 4303 if (pSign == cSign) { 4304 zSign = pSign; 4305 } else if (status->float_rounding_mode == float_round_down) { 4306 zSign = 1; 4307 } else { 4308 zSign = 0; 4309 } 4310 return packFloat64(zSign ^ signflip, 0, 0); 4311 } 4312 /* Exact zero plus a denorm */ 4313 if (status->flush_to_zero) { 4314 float_raise(float_flag_output_denormal, status); 4315 return packFloat64(cSign ^ signflip, 0, 0); 4316 } 4317 } 4318 /* Zero plus something non-zero : just return the something */ 4319 if (flags & float_muladd_halve_result) { 4320 if (cExp == 0) { 4321 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4322 } 4323 /* Subtract one to halve, and one again because roundAndPackFloat64 4324 * wants one less than the true exponent. 4325 */ 4326 cExp -= 2; 4327 cSig = (cSig | 0x0010000000000000ULL) << 10; 4328 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status); 4329 } 4330 return packFloat64(cSign ^ signflip, cExp, cSig); 4331 } 4332 4333 if (aExp == 0) { 4334 normalizeFloat64Subnormal(aSig, &aExp, &aSig); 4335 } 4336 if (bExp == 0) { 4337 normalizeFloat64Subnormal(bSig, &bExp, &bSig); 4338 } 4339 4340 /* Calculate the actual result a * b + c */ 4341 4342 /* Multiply first; this is easy. 
*/ 4343 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff 4344 * because we want the true exponent, not the "one-less-than" 4345 * flavour that roundAndPackFloat64() takes. 4346 */ 4347 pExp = aExp + bExp - 0x3fe; 4348 aSig = (aSig | LIT64(0x0010000000000000))<<10; 4349 bSig = (bSig | LIT64(0x0010000000000000))<<11; 4350 mul64To128(aSig, bSig, &pSig0, &pSig1); 4351 if ((int64_t)(pSig0 << 1) >= 0) { 4352 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); 4353 pExp--; 4354 } 4355 4356 zSign = pSign ^ signflip; 4357 4358 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit 4359 * bit in position 126. 4360 */ 4361 if (cExp == 0) { 4362 if (!cSig) { 4363 /* Throw out the special case of c being an exact zero now */ 4364 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); 4365 if (flags & float_muladd_halve_result) { 4366 pExp--; 4367 } 4368 return roundAndPackFloat64(zSign, pExp - 1, 4369 pSig1, status); 4370 } 4371 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4372 } 4373 4374 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the 4375 * significand of the addend, with the explicit bit in position 126. 
4376 */ 4377 cSig0 = cSig << (126 - 64 - 52); 4378 cSig1 = 0; 4379 cSig0 |= LIT64(0x4000000000000000); 4380 expDiff = pExp - cExp; 4381 4382 if (pSign == cSign) { 4383 /* Addition */ 4384 if (expDiff > 0) { 4385 /* scale c to match p */ 4386 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4387 zExp = pExp; 4388 } else if (expDiff < 0) { 4389 /* scale p to match c */ 4390 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4391 zExp = cExp; 4392 } else { 4393 /* no scaling needed */ 4394 zExp = cExp; 4395 } 4396 /* Add significands and make sure explicit bit ends up in posn 126 */ 4397 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4398 if ((int64_t)zSig0 < 0) { 4399 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); 4400 } else { 4401 zExp--; 4402 } 4403 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); 4404 if (flags & float_muladd_halve_result) { 4405 zExp--; 4406 } 4407 return roundAndPackFloat64(zSign, zExp, zSig1, status); 4408 } else { 4409 /* Subtraction */ 4410 if (expDiff > 0) { 4411 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4412 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4413 zExp = pExp; 4414 } else if (expDiff < 0) { 4415 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4416 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4417 zExp = cExp; 4418 zSign ^= 1; 4419 } else { 4420 zExp = pExp; 4421 if (lt128(cSig0, cSig1, pSig0, pSig1)) { 4422 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4423 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { 4424 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4425 zSign ^= 1; 4426 } else { 4427 /* Exact zero */ 4428 zSign = signflip; 4429 if (status->float_rounding_mode == float_round_down) { 4430 zSign ^= 1; 4431 } 4432 return packFloat64(zSign, 0, 0); 4433 } 4434 } 4435 --zExp; 4436 /* Do the equivalent of normalizeRoundAndPackFloat64() but 4437 * starting with the significand in a pair of uint64_t. 
4438 */ 4439 if (zSig0) { 4440 shiftcount = countLeadingZeros64(zSig0) - 1; 4441 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); 4442 if (zSig1) { 4443 zSig0 |= 1; 4444 } 4445 zExp -= shiftcount; 4446 } else { 4447 shiftcount = countLeadingZeros64(zSig1); 4448 if (shiftcount == 0) { 4449 zSig0 = (zSig1 >> 1) | (zSig1 & 1); 4450 zExp -= 63; 4451 } else { 4452 shiftcount--; 4453 zSig0 = zSig1 << shiftcount; 4454 zExp -= (shiftcount + 64); 4455 } 4456 } 4457 if (flags & float_muladd_halve_result) { 4458 zExp--; 4459 } 4460 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4461 } 4462 } 4463 4464 /*---------------------------------------------------------------------------- 4465 | Returns the square root of the double-precision floating-point value `a'. 4466 | The operation is performed according to the IEC/IEEE Standard for Binary 4467 | Floating-Point Arithmetic. 4468 *----------------------------------------------------------------------------*/ 4469 4470 float64 float64_sqrt(float64 a, float_status *status) 4471 { 4472 flag aSign; 4473 int_fast16_t aExp, zExp; 4474 uint64_t aSig, zSig, doubleZSig; 4475 uint64_t rem0, rem1, term0, term1; 4476 a = float64_squash_input_denormal(a, status); 4477 4478 aSig = extractFloat64Frac( a ); 4479 aExp = extractFloat64Exp( a ); 4480 aSign = extractFloat64Sign( a ); 4481 if ( aExp == 0x7FF ) { 4482 if (aSig) { 4483 return propagateFloat64NaN(a, a, status); 4484 } 4485 if ( ! 
aSign ) return a; 4486 float_raise(float_flag_invalid, status); 4487 return float64_default_nan; 4488 } 4489 if ( aSign ) { 4490 if ( ( aExp | aSig ) == 0 ) return a; 4491 float_raise(float_flag_invalid, status); 4492 return float64_default_nan; 4493 } 4494 if ( aExp == 0 ) { 4495 if ( aSig == 0 ) return float64_zero; 4496 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4497 } 4498 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4499 aSig |= LIT64( 0x0010000000000000 ); 4500 zSig = estimateSqrt32( aExp, aSig>>21 ); 4501 aSig <<= 9 - ( aExp & 1 ); 4502 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4503 if ( ( zSig & 0x1FF ) <= 5 ) { 4504 doubleZSig = zSig<<1; 4505 mul64To128( zSig, zSig, &term0, &term1 ); 4506 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4507 while ( (int64_t) rem0 < 0 ) { 4508 --zSig; 4509 doubleZSig -= 2; 4510 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4511 } 4512 zSig |= ( ( rem0 | rem1 ) != 0 ); 4513 } 4514 return roundAndPackFloat64(0, zExp, zSig, status); 4515 4516 } 4517 4518 /*---------------------------------------------------------------------------- 4519 | Returns the binary log of the double-precision floating-point value `a'. 4520 | The operation is performed according to the IEC/IEEE Standard for Binary 4521 | Floating-Point Arithmetic. 
4522 *----------------------------------------------------------------------------*/ 4523 float64 float64_log2(float64 a, float_status *status) 4524 { 4525 flag aSign, zSign; 4526 int_fast16_t aExp; 4527 uint64_t aSig, aSig0, aSig1, zSig, i; 4528 a = float64_squash_input_denormal(a, status); 4529 4530 aSig = extractFloat64Frac( a ); 4531 aExp = extractFloat64Exp( a ); 4532 aSign = extractFloat64Sign( a ); 4533 4534 if ( aExp == 0 ) { 4535 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4536 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4537 } 4538 if ( aSign ) { 4539 float_raise(float_flag_invalid, status); 4540 return float64_default_nan; 4541 } 4542 if ( aExp == 0x7FF ) { 4543 if (aSig) { 4544 return propagateFloat64NaN(a, float64_zero, status); 4545 } 4546 return a; 4547 } 4548 4549 aExp -= 0x3FF; 4550 aSig |= LIT64( 0x0010000000000000 ); 4551 zSign = aExp < 0; 4552 zSig = (uint64_t)aExp << 52; 4553 for (i = 1LL << 51; i > 0; i >>= 1) { 4554 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4555 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4556 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4557 aSig >>= 1; 4558 zSig |= i; 4559 } 4560 } 4561 4562 if ( zSign ) 4563 zSig = -zSig; 4564 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4565 } 4566 4567 /*---------------------------------------------------------------------------- 4568 | Returns 1 if the double-precision floating-point value `a' is equal to the 4569 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4570 | if either operand is a NaN. Otherwise, the comparison is performed 4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4572 *----------------------------------------------------------------------------*/ 4573 4574 int float64_eq(float64 a, float64 b, float_status *status) 4575 { 4576 uint64_t av, bv; 4577 a = float64_squash_input_denormal(a, status); 4578 b = float64_squash_input_denormal(b, status); 4579 4580 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4581 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4582 ) { 4583 float_raise(float_flag_invalid, status); 4584 return 0; 4585 } 4586 av = float64_val(a); 4587 bv = float64_val(b); 4588 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4589 4590 } 4591 4592 /*---------------------------------------------------------------------------- 4593 | Returns 1 if the double-precision floating-point value `a' is less than or 4594 | equal to the corresponding value `b', and 0 otherwise. The invalid 4595 | exception is raised if either operand is a NaN. The comparison is performed 4596 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4597 *----------------------------------------------------------------------------*/ 4598 4599 int float64_le(float64 a, float64 b, float_status *status) 4600 { 4601 flag aSign, bSign; 4602 uint64_t av, bv; 4603 a = float64_squash_input_denormal(a, status); 4604 b = float64_squash_input_denormal(b, status); 4605 4606 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4607 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4608 ) { 4609 float_raise(float_flag_invalid, status); 4610 return 0; 4611 } 4612 aSign = extractFloat64Sign( a ); 4613 bSign = extractFloat64Sign( b ); 4614 av = float64_val(a); 4615 bv = float64_val(b); 4616 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4617 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4618 4619 } 4620 4621 /*---------------------------------------------------------------------------- 4622 | Returns 1 if the double-precision floating-point value `a' is less than 4623 | the corresponding value `b', and 0 otherwise. The invalid exception is 4624 | raised if either operand is a NaN. The comparison is performed according 4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4626 *----------------------------------------------------------------------------*/ 4627 4628 int float64_lt(float64 a, float64 b, float_status *status) 4629 { 4630 flag aSign, bSign; 4631 uint64_t av, bv; 4632 4633 a = float64_squash_input_denormal(a, status); 4634 b = float64_squash_input_denormal(b, status); 4635 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4636 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4637 ) { 4638 float_raise(float_flag_invalid, status); 4639 return 0; 4640 } 4641 aSign = extractFloat64Sign( a ); 4642 bSign = extractFloat64Sign( b ); 4643 av = float64_val(a); 4644 bv = float64_val(b); 4645 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4646 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4647 4648 } 4649 4650 /*---------------------------------------------------------------------------- 4651 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4652 | be compared, and 0 otherwise. The invalid exception is raised if either 4653 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4654 | Standard for Binary Floating-Point Arithmetic. 4655 *----------------------------------------------------------------------------*/ 4656 4657 int float64_unordered(float64 a, float64 b, float_status *status) 4658 { 4659 a = float64_squash_input_denormal(a, status); 4660 b = float64_squash_input_denormal(b, status); 4661 4662 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4663 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4664 ) { 4665 float_raise(float_flag_invalid, status); 4666 return 1; 4667 } 4668 return 0; 4669 } 4670 4671 /*---------------------------------------------------------------------------- 4672 | Returns 1 if the double-precision floating-point value `a' is equal to the 4673 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4674 | exception.The comparison is performed according to the IEC/IEEE Standard 4675 | for Binary Floating-Point Arithmetic. 4676 *----------------------------------------------------------------------------*/ 4677 4678 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4679 { 4680 uint64_t av, bv; 4681 a = float64_squash_input_denormal(a, status); 4682 b = float64_squash_input_denormal(b, status); 4683 4684 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4685 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4686 ) { 4687 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4688 float_raise(float_flag_invalid, status); 4689 } 4690 return 0; 4691 } 4692 av = float64_val(a); 4693 bv = float64_val(b); 4694 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4695 4696 } 4697 4698 /*---------------------------------------------------------------------------- 4699 | Returns 1 if the double-precision floating-point value `a' is less than or 4700 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4701 | cause an exception. Otherwise, the comparison is performed according to the 4702 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4703 *----------------------------------------------------------------------------*/ 4704 4705 int float64_le_quiet(float64 a, float64 b, float_status *status) 4706 { 4707 flag aSign, bSign; 4708 uint64_t av, bv; 4709 a = float64_squash_input_denormal(a, status); 4710 b = float64_squash_input_denormal(b, status); 4711 4712 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4713 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4714 ) { 4715 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4716 float_raise(float_flag_invalid, status); 4717 } 4718 return 0; 4719 } 4720 aSign = extractFloat64Sign( a ); 4721 bSign = extractFloat64Sign( b ); 4722 av = float64_val(a); 4723 bv = float64_val(b); 4724 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4725 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4726 4727 } 4728 4729 /*---------------------------------------------------------------------------- 4730 | Returns 1 if the double-precision floating-point value `a' is less than 4731 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4732 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4733 | Standard for Binary Floating-Point Arithmetic. 
4734 *----------------------------------------------------------------------------*/ 4735 4736 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4737 { 4738 flag aSign, bSign; 4739 uint64_t av, bv; 4740 a = float64_squash_input_denormal(a, status); 4741 b = float64_squash_input_denormal(b, status); 4742 4743 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4744 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4745 ) { 4746 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4747 float_raise(float_flag_invalid, status); 4748 } 4749 return 0; 4750 } 4751 aSign = extractFloat64Sign( a ); 4752 bSign = extractFloat64Sign( b ); 4753 av = float64_val(a); 4754 bv = float64_val(b); 4755 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4756 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4757 4758 } 4759 4760 /*---------------------------------------------------------------------------- 4761 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4762 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4763 | comparison is performed according to the IEC/IEEE Standard for Binary 4764 | Floating-Point Arithmetic. 
4765 *----------------------------------------------------------------------------*/ 4766 4767 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4768 { 4769 a = float64_squash_input_denormal(a, status); 4770 b = float64_squash_input_denormal(b, status); 4771 4772 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4773 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4774 ) { 4775 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { 4776 float_raise(float_flag_invalid, status); 4777 } 4778 return 1; 4779 } 4780 return 0; 4781 } 4782 4783 /*---------------------------------------------------------------------------- 4784 | Returns the result of converting the extended double-precision floating- 4785 | point value `a' to the 32-bit two's complement integer format. The 4786 | conversion is performed according to the IEC/IEEE Standard for Binary 4787 | Floating-Point Arithmetic---which means in particular that the conversion 4788 | is rounded according to the current rounding mode. If `a' is a NaN, the 4789 | largest positive integer is returned. Otherwise, if the conversion 4790 | overflows, the largest integer with the same sign as `a' is returned. 
4791 *----------------------------------------------------------------------------*/ 4792 4793 int32 floatx80_to_int32(floatx80 a, float_status *status) 4794 { 4795 flag aSign; 4796 int32 aExp, shiftCount; 4797 uint64_t aSig; 4798 4799 aSig = extractFloatx80Frac( a ); 4800 aExp = extractFloatx80Exp( a ); 4801 aSign = extractFloatx80Sign( a ); 4802 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4803 shiftCount = 0x4037 - aExp; 4804 if ( shiftCount <= 0 ) shiftCount = 1; 4805 shift64RightJamming( aSig, shiftCount, &aSig ); 4806 return roundAndPackInt32(aSign, aSig, status); 4807 4808 } 4809 4810 /*---------------------------------------------------------------------------- 4811 | Returns the result of converting the extended double-precision floating- 4812 | point value `a' to the 32-bit two's complement integer format. The 4813 | conversion is performed according to the IEC/IEEE Standard for Binary 4814 | Floating-Point Arithmetic, except that the conversion is always rounded 4815 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4816 | Otherwise, if the conversion overflows, the largest integer with the same 4817 | sign as `a' is returned. 
4818 *----------------------------------------------------------------------------*/ 4819 4820 int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4821 { 4822 flag aSign; 4823 int32 aExp, shiftCount; 4824 uint64_t aSig, savedASig; 4825 int32_t z; 4826 4827 aSig = extractFloatx80Frac( a ); 4828 aExp = extractFloatx80Exp( a ); 4829 aSign = extractFloatx80Sign( a ); 4830 if ( 0x401E < aExp ) { 4831 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4832 goto invalid; 4833 } 4834 else if ( aExp < 0x3FFF ) { 4835 if (aExp || aSig) { 4836 status->float_exception_flags |= float_flag_inexact; 4837 } 4838 return 0; 4839 } 4840 shiftCount = 0x403E - aExp; 4841 savedASig = aSig; 4842 aSig >>= shiftCount; 4843 z = aSig; 4844 if ( aSign ) z = - z; 4845 if ( ( z < 0 ) ^ aSign ) { 4846 invalid: 4847 float_raise(float_flag_invalid, status); 4848 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4849 } 4850 if ( ( aSig<<shiftCount ) != savedASig ) { 4851 status->float_exception_flags |= float_flag_inexact; 4852 } 4853 return z; 4854 4855 } 4856 4857 /*---------------------------------------------------------------------------- 4858 | Returns the result of converting the extended double-precision floating- 4859 | point value `a' to the 64-bit two's complement integer format. The 4860 | conversion is performed according to the IEC/IEEE Standard for Binary 4861 | Floating-Point Arithmetic---which means in particular that the conversion 4862 | is rounded according to the current rounding mode. If `a' is a NaN, 4863 | the largest positive integer is returned. Otherwise, if the conversion 4864 | overflows, the largest integer with the same sign as `a' is returned. 
4865 *----------------------------------------------------------------------------*/ 4866 4867 int64 floatx80_to_int64(floatx80 a, float_status *status) 4868 { 4869 flag aSign; 4870 int32 aExp, shiftCount; 4871 uint64_t aSig, aSigExtra; 4872 4873 aSig = extractFloatx80Frac( a ); 4874 aExp = extractFloatx80Exp( a ); 4875 aSign = extractFloatx80Sign( a ); 4876 shiftCount = 0x403E - aExp; 4877 if ( shiftCount <= 0 ) { 4878 if ( shiftCount ) { 4879 float_raise(float_flag_invalid, status); 4880 if ( ! aSign 4881 || ( ( aExp == 0x7FFF ) 4882 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4883 ) { 4884 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4885 } 4886 return (int64_t) LIT64( 0x8000000000000000 ); 4887 } 4888 aSigExtra = 0; 4889 } 4890 else { 4891 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4892 } 4893 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4894 4895 } 4896 4897 /*---------------------------------------------------------------------------- 4898 | Returns the result of converting the extended double-precision floating- 4899 | point value `a' to the 64-bit two's complement integer format. The 4900 | conversion is performed according to the IEC/IEEE Standard for Binary 4901 | Floating-Point Arithmetic, except that the conversion is always rounded 4902 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4903 | Otherwise, if the conversion overflows, the largest integer with the same 4904 | sign as `a' is returned. 
4905 *----------------------------------------------------------------------------*/ 4906 4907 int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4908 { 4909 flag aSign; 4910 int32 aExp, shiftCount; 4911 uint64_t aSig; 4912 int64 z; 4913 4914 aSig = extractFloatx80Frac( a ); 4915 aExp = extractFloatx80Exp( a ); 4916 aSign = extractFloatx80Sign( a ); 4917 shiftCount = aExp - 0x403E; 4918 if ( 0 <= shiftCount ) { 4919 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4920 if ( ( a.high != 0xC03E ) || aSig ) { 4921 float_raise(float_flag_invalid, status); 4922 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4923 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4924 } 4925 } 4926 return (int64_t) LIT64( 0x8000000000000000 ); 4927 } 4928 else if ( aExp < 0x3FFF ) { 4929 if (aExp | aSig) { 4930 status->float_exception_flags |= float_flag_inexact; 4931 } 4932 return 0; 4933 } 4934 z = aSig>>( - shiftCount ); 4935 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4936 status->float_exception_flags |= float_flag_inexact; 4937 } 4938 if ( aSign ) z = - z; 4939 return z; 4940 4941 } 4942 4943 /*---------------------------------------------------------------------------- 4944 | Returns the result of converting the extended double-precision floating- 4945 | point value `a' to the single-precision floating-point format. The 4946 | conversion is performed according to the IEC/IEEE Standard for Binary 4947 | Floating-Point Arithmetic. 
4948 *----------------------------------------------------------------------------*/ 4949 4950 float32 floatx80_to_float32(floatx80 a, float_status *status) 4951 { 4952 flag aSign; 4953 int32 aExp; 4954 uint64_t aSig; 4955 4956 aSig = extractFloatx80Frac( a ); 4957 aExp = extractFloatx80Exp( a ); 4958 aSign = extractFloatx80Sign( a ); 4959 if ( aExp == 0x7FFF ) { 4960 if ( (uint64_t) ( aSig<<1 ) ) { 4961 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4962 } 4963 return packFloat32( aSign, 0xFF, 0 ); 4964 } 4965 shift64RightJamming( aSig, 33, &aSig ); 4966 if ( aExp || aSig ) aExp -= 0x3F81; 4967 return roundAndPackFloat32(aSign, aExp, aSig, status); 4968 4969 } 4970 4971 /*---------------------------------------------------------------------------- 4972 | Returns the result of converting the extended double-precision floating- 4973 | point value `a' to the double-precision floating-point format. The 4974 | conversion is performed according to the IEC/IEEE Standard for Binary 4975 | Floating-Point Arithmetic. 4976 *----------------------------------------------------------------------------*/ 4977 4978 float64 floatx80_to_float64(floatx80 a, float_status *status) 4979 { 4980 flag aSign; 4981 int32 aExp; 4982 uint64_t aSig, zSig; 4983 4984 aSig = extractFloatx80Frac( a ); 4985 aExp = extractFloatx80Exp( a ); 4986 aSign = extractFloatx80Sign( a ); 4987 if ( aExp == 0x7FFF ) { 4988 if ( (uint64_t) ( aSig<<1 ) ) { 4989 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4990 } 4991 return packFloat64( aSign, 0x7FF, 0 ); 4992 } 4993 shift64RightJamming( aSig, 1, &zSig ); 4994 if ( aExp || aSig ) aExp -= 0x3C01; 4995 return roundAndPackFloat64(aSign, aExp, zSig, status); 4996 4997 } 4998 4999 /*---------------------------------------------------------------------------- 5000 | Returns the result of converting the extended double-precision floating- 5001 | point value `a' to the quadruple-precision floating-point format. 
The 5002 | conversion is performed according to the IEC/IEEE Standard for Binary 5003 | Floating-Point Arithmetic. 5004 *----------------------------------------------------------------------------*/ 5005 5006 float128 floatx80_to_float128(floatx80 a, float_status *status) 5007 { 5008 flag aSign; 5009 int_fast16_t aExp; 5010 uint64_t aSig, zSig0, zSig1; 5011 5012 aSig = extractFloatx80Frac( a ); 5013 aExp = extractFloatx80Exp( a ); 5014 aSign = extractFloatx80Sign( a ); 5015 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5016 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5017 } 5018 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5019 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5020 5021 } 5022 5023 /*---------------------------------------------------------------------------- 5024 | Rounds the extended double-precision floating-point value `a' to an integer, 5025 | and returns the result as an extended quadruple-precision floating-point 5026 | value. The operation is performed according to the IEC/IEEE Standard for 5027 | Binary Floating-Point Arithmetic. 
5028 *----------------------------------------------------------------------------*/ 5029 5030 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5031 { 5032 flag aSign; 5033 int32 aExp; 5034 uint64_t lastBitMask, roundBitsMask; 5035 floatx80 z; 5036 5037 aExp = extractFloatx80Exp( a ); 5038 if ( 0x403E <= aExp ) { 5039 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5040 return propagateFloatx80NaN(a, a, status); 5041 } 5042 return a; 5043 } 5044 if ( aExp < 0x3FFF ) { 5045 if ( ( aExp == 0 ) 5046 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5047 return a; 5048 } 5049 status->float_exception_flags |= float_flag_inexact; 5050 aSign = extractFloatx80Sign( a ); 5051 switch (status->float_rounding_mode) { 5052 case float_round_nearest_even: 5053 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5054 ) { 5055 return 5056 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5057 } 5058 break; 5059 case float_round_ties_away: 5060 if (aExp == 0x3FFE) { 5061 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5062 } 5063 break; 5064 case float_round_down: 5065 return 5066 aSign ? 5067 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5068 : packFloatx80( 0, 0, 0 ); 5069 case float_round_up: 5070 return 5071 aSign ? 
packFloatx80( 1, 0, 0 ) 5072 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5073 } 5074 return packFloatx80( aSign, 0, 0 ); 5075 } 5076 lastBitMask = 1; 5077 lastBitMask <<= 0x403E - aExp; 5078 roundBitsMask = lastBitMask - 1; 5079 z = a; 5080 switch (status->float_rounding_mode) { 5081 case float_round_nearest_even: 5082 z.low += lastBitMask>>1; 5083 if ((z.low & roundBitsMask) == 0) { 5084 z.low &= ~lastBitMask; 5085 } 5086 break; 5087 case float_round_ties_away: 5088 z.low += lastBitMask >> 1; 5089 break; 5090 case float_round_to_zero: 5091 break; 5092 case float_round_up: 5093 if (!extractFloatx80Sign(z)) { 5094 z.low += roundBitsMask; 5095 } 5096 break; 5097 case float_round_down: 5098 if (extractFloatx80Sign(z)) { 5099 z.low += roundBitsMask; 5100 } 5101 break; 5102 default: 5103 abort(); 5104 } 5105 z.low &= ~ roundBitsMask; 5106 if ( z.low == 0 ) { 5107 ++z.high; 5108 z.low = LIT64( 0x8000000000000000 ); 5109 } 5110 if (z.low != a.low) { 5111 status->float_exception_flags |= float_flag_inexact; 5112 } 5113 return z; 5114 5115 } 5116 5117 /*---------------------------------------------------------------------------- 5118 | Returns the result of adding the absolute values of the extended double- 5119 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5120 | negated before being returned. `zSign' is ignored if the result is a NaN. 5121 | The addition is performed according to the IEC/IEEE Standard for Binary 5122 | Floating-Point Arithmetic. 
5123 *----------------------------------------------------------------------------*/ 5124 5125 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5126 float_status *status) 5127 { 5128 int32 aExp, bExp, zExp; 5129 uint64_t aSig, bSig, zSig0, zSig1; 5130 int32 expDiff; 5131 5132 aSig = extractFloatx80Frac( a ); 5133 aExp = extractFloatx80Exp( a ); 5134 bSig = extractFloatx80Frac( b ); 5135 bExp = extractFloatx80Exp( b ); 5136 expDiff = aExp - bExp; 5137 if ( 0 < expDiff ) { 5138 if ( aExp == 0x7FFF ) { 5139 if ((uint64_t)(aSig << 1)) { 5140 return propagateFloatx80NaN(a, b, status); 5141 } 5142 return a; 5143 } 5144 if ( bExp == 0 ) --expDiff; 5145 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5146 zExp = aExp; 5147 } 5148 else if ( expDiff < 0 ) { 5149 if ( bExp == 0x7FFF ) { 5150 if ((uint64_t)(bSig << 1)) { 5151 return propagateFloatx80NaN(a, b, status); 5152 } 5153 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5154 } 5155 if ( aExp == 0 ) ++expDiff; 5156 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5157 zExp = bExp; 5158 } 5159 else { 5160 if ( aExp == 0x7FFF ) { 5161 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5162 return propagateFloatx80NaN(a, b, status); 5163 } 5164 return a; 5165 } 5166 zSig1 = 0; 5167 zSig0 = aSig + bSig; 5168 if ( aExp == 0 ) { 5169 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5170 goto roundAndPack; 5171 } 5172 zExp = aExp; 5173 goto shiftRight1; 5174 } 5175 zSig0 = aSig + bSig; 5176 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5177 shiftRight1: 5178 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5179 zSig0 |= LIT64( 0x8000000000000000 ); 5180 ++zExp; 5181 roundAndPack: 5182 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5183 zSign, zExp, zSig0, zSig1, status); 5184 } 5185 5186 /*---------------------------------------------------------------------------- 5187 | Returns the result of subtracting the absolute values of the 
extended 5188 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5189 | difference is negated before being returned. `zSign' is ignored if the 5190 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5191 | Standard for Binary Floating-Point Arithmetic. 5192 *----------------------------------------------------------------------------*/ 5193 5194 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5195 float_status *status) 5196 { 5197 int32 aExp, bExp, zExp; 5198 uint64_t aSig, bSig, zSig0, zSig1; 5199 int32 expDiff; 5200 floatx80 z; 5201 5202 aSig = extractFloatx80Frac( a ); 5203 aExp = extractFloatx80Exp( a ); 5204 bSig = extractFloatx80Frac( b ); 5205 bExp = extractFloatx80Exp( b ); 5206 expDiff = aExp - bExp; 5207 if ( 0 < expDiff ) goto aExpBigger; 5208 if ( expDiff < 0 ) goto bExpBigger; 5209 if ( aExp == 0x7FFF ) { 5210 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5211 return propagateFloatx80NaN(a, b, status); 5212 } 5213 float_raise(float_flag_invalid, status); 5214 z.low = floatx80_default_nan_low; 5215 z.high = floatx80_default_nan_high; 5216 return z; 5217 } 5218 if ( aExp == 0 ) { 5219 aExp = 1; 5220 bExp = 1; 5221 } 5222 zSig1 = 0; 5223 if ( bSig < aSig ) goto aBigger; 5224 if ( aSig < bSig ) goto bBigger; 5225 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5226 bExpBigger: 5227 if ( bExp == 0x7FFF ) { 5228 if ((uint64_t)(bSig << 1)) { 5229 return propagateFloatx80NaN(a, b, status); 5230 } 5231 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5232 } 5233 if ( aExp == 0 ) ++expDiff; 5234 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5235 bBigger: 5236 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5237 zExp = bExp; 5238 zSign ^= 1; 5239 goto normalizeRoundAndPack; 5240 aExpBigger: 5241 if ( aExp == 0x7FFF ) { 5242 if ((uint64_t)(aSig << 1)) { 5243 return propagateFloatx80NaN(a, b, status); 5244 } 5245 return a; 5246 } 5247 if ( 
bExp == 0 ) --expDiff; 5248 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5249 aBigger: 5250 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5251 zExp = aExp; 5252 normalizeRoundAndPack: 5253 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5254 zSign, zExp, zSig0, zSig1, status); 5255 } 5256 5257 /*---------------------------------------------------------------------------- 5258 | Returns the result of adding the extended double-precision floating-point 5259 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5260 | Standard for Binary Floating-Point Arithmetic. 5261 *----------------------------------------------------------------------------*/ 5262 5263 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5264 { 5265 flag aSign, bSign; 5266 5267 aSign = extractFloatx80Sign( a ); 5268 bSign = extractFloatx80Sign( b ); 5269 if ( aSign == bSign ) { 5270 return addFloatx80Sigs(a, b, aSign, status); 5271 } 5272 else { 5273 return subFloatx80Sigs(a, b, aSign, status); 5274 } 5275 5276 } 5277 5278 /*---------------------------------------------------------------------------- 5279 | Returns the result of subtracting the extended double-precision floating- 5280 | point values `a' and `b'. The operation is performed according to the 5281 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5282 *----------------------------------------------------------------------------*/ 5283 5284 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5285 { 5286 flag aSign, bSign; 5287 5288 aSign = extractFloatx80Sign( a ); 5289 bSign = extractFloatx80Sign( b ); 5290 if ( aSign == bSign ) { 5291 return subFloatx80Sigs(a, b, aSign, status); 5292 } 5293 else { 5294 return addFloatx80Sigs(a, b, aSign, status); 5295 } 5296 5297 } 5298 5299 /*---------------------------------------------------------------------------- 5300 | Returns the result of multiplying the extended double-precision floating- 5301 | point values `a' and `b'. The operation is performed according to the 5302 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5303 *----------------------------------------------------------------------------*/ 5304 5305 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5306 { 5307 flag aSign, bSign, zSign; 5308 int32 aExp, bExp, zExp; 5309 uint64_t aSig, bSig, zSig0, zSig1; 5310 floatx80 z; 5311 5312 aSig = extractFloatx80Frac( a ); 5313 aExp = extractFloatx80Exp( a ); 5314 aSign = extractFloatx80Sign( a ); 5315 bSig = extractFloatx80Frac( b ); 5316 bExp = extractFloatx80Exp( b ); 5317 bSign = extractFloatx80Sign( b ); 5318 zSign = aSign ^ bSign; 5319 if ( aExp == 0x7FFF ) { 5320 if ( (uint64_t) ( aSig<<1 ) 5321 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5322 return propagateFloatx80NaN(a, b, status); 5323 } 5324 if ( ( bExp | bSig ) == 0 ) goto invalid; 5325 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5326 } 5327 if ( bExp == 0x7FFF ) { 5328 if ((uint64_t)(bSig << 1)) { 5329 return propagateFloatx80NaN(a, b, status); 5330 } 5331 if ( ( aExp | aSig ) == 0 ) { 5332 invalid: 5333 float_raise(float_flag_invalid, status); 5334 z.low = floatx80_default_nan_low; 5335 z.high = floatx80_default_nan_high; 5336 return z; 5337 } 5338 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5339 } 
5340 if ( aExp == 0 ) { 5341 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5342 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5343 } 5344 if ( bExp == 0 ) { 5345 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5346 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5347 } 5348 zExp = aExp + bExp - 0x3FFE; 5349 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5350 if ( 0 < (int64_t) zSig0 ) { 5351 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5352 --zExp; 5353 } 5354 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5355 zSign, zExp, zSig0, zSig1, status); 5356 } 5357 5358 /*---------------------------------------------------------------------------- 5359 | Returns the result of dividing the extended double-precision floating-point 5360 | value `a' by the corresponding value `b'. The operation is performed 5361 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5362 *----------------------------------------------------------------------------*/ 5363 5364 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5365 { 5366 flag aSign, bSign, zSign; 5367 int32 aExp, bExp, zExp; 5368 uint64_t aSig, bSig, zSig0, zSig1; 5369 uint64_t rem0, rem1, rem2, term0, term1, term2; 5370 floatx80 z; 5371 5372 aSig = extractFloatx80Frac( a ); 5373 aExp = extractFloatx80Exp( a ); 5374 aSign = extractFloatx80Sign( a ); 5375 bSig = extractFloatx80Frac( b ); 5376 bExp = extractFloatx80Exp( b ); 5377 bSign = extractFloatx80Sign( b ); 5378 zSign = aSign ^ bSign; 5379 if ( aExp == 0x7FFF ) { 5380 if ((uint64_t)(aSig << 1)) { 5381 return propagateFloatx80NaN(a, b, status); 5382 } 5383 if ( bExp == 0x7FFF ) { 5384 if ((uint64_t)(bSig << 1)) { 5385 return propagateFloatx80NaN(a, b, status); 5386 } 5387 goto invalid; 5388 } 5389 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5390 } 5391 if ( bExp == 0x7FFF ) { 5392 if ((uint64_t)(bSig << 1)) { 5393 return propagateFloatx80NaN(a, b, status); 5394 } 5395 return 
packFloatx80( zSign, 0, 0 ); 5396 } 5397 if ( bExp == 0 ) { 5398 if ( bSig == 0 ) { 5399 if ( ( aExp | aSig ) == 0 ) { 5400 invalid: 5401 float_raise(float_flag_invalid, status); 5402 z.low = floatx80_default_nan_low; 5403 z.high = floatx80_default_nan_high; 5404 return z; 5405 } 5406 float_raise(float_flag_divbyzero, status); 5407 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5408 } 5409 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5410 } 5411 if ( aExp == 0 ) { 5412 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5413 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5414 } 5415 zExp = aExp - bExp + 0x3FFE; 5416 rem1 = 0; 5417 if ( bSig <= aSig ) { 5418 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5419 ++zExp; 5420 } 5421 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5422 mul64To128( bSig, zSig0, &term0, &term1 ); 5423 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5424 while ( (int64_t) rem0 < 0 ) { 5425 --zSig0; 5426 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5427 } 5428 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5429 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5430 mul64To128( bSig, zSig1, &term1, &term2 ); 5431 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5432 while ( (int64_t) rem1 < 0 ) { 5433 --zSig1; 5434 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5435 } 5436 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5437 } 5438 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5439 zSign, zExp, zSig0, zSig1, status); 5440 } 5441 5442 /*---------------------------------------------------------------------------- 5443 | Returns the remainder of the extended double-precision floating-point value 5444 | `a' with respect to the corresponding value `b'. The operation is performed 5445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5446 *----------------------------------------------------------------------------*/ 5447 5448 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5449 { 5450 flag aSign, zSign; 5451 int32 aExp, bExp, expDiff; 5452 uint64_t aSig0, aSig1, bSig; 5453 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5454 floatx80 z; 5455 5456 aSig0 = extractFloatx80Frac( a ); 5457 aExp = extractFloatx80Exp( a ); 5458 aSign = extractFloatx80Sign( a ); 5459 bSig = extractFloatx80Frac( b ); 5460 bExp = extractFloatx80Exp( b ); 5461 if ( aExp == 0x7FFF ) { 5462 if ( (uint64_t) ( aSig0<<1 ) 5463 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5464 return propagateFloatx80NaN(a, b, status); 5465 } 5466 goto invalid; 5467 } 5468 if ( bExp == 0x7FFF ) { 5469 if ((uint64_t)(bSig << 1)) { 5470 return propagateFloatx80NaN(a, b, status); 5471 } 5472 return a; 5473 } 5474 if ( bExp == 0 ) { 5475 if ( bSig == 0 ) { 5476 invalid: 5477 float_raise(float_flag_invalid, status); 5478 z.low = floatx80_default_nan_low; 5479 z.high = floatx80_default_nan_high; 5480 return z; 5481 } 5482 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5483 } 5484 if ( aExp == 0 ) { 5485 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5486 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5487 } 5488 bSig |= LIT64( 0x8000000000000000 ); 5489 zSign = aSign; 5490 expDiff = aExp - bExp; 5491 aSig1 = 0; 5492 if ( expDiff < 0 ) { 5493 if ( expDiff < -1 ) return a; 5494 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5495 expDiff = 0; 5496 } 5497 q = ( bSig <= aSig0 ); 5498 if ( q ) aSig0 -= bSig; 5499 expDiff -= 64; 5500 while ( 0 < expDiff ) { 5501 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5502 q = ( 2 < q ) ? 
q - 2 : 0; 5503 mul64To128( bSig, q, &term0, &term1 ); 5504 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5505 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5506 expDiff -= 62; 5507 } 5508 expDiff += 64; 5509 if ( 0 < expDiff ) { 5510 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5511 q = ( 2 < q ) ? q - 2 : 0; 5512 q >>= 64 - expDiff; 5513 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5514 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5515 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5516 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5517 ++q; 5518 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5519 } 5520 } 5521 else { 5522 term1 = 0; 5523 term0 = bSig; 5524 } 5525 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5526 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5527 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5528 && ( q & 1 ) ) 5529 ) { 5530 aSig0 = alternateASig0; 5531 aSig1 = alternateASig1; 5532 zSign = ! zSign; 5533 } 5534 return 5535 normalizeRoundAndPackFloatx80( 5536 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5537 5538 } 5539 5540 /*---------------------------------------------------------------------------- 5541 | Returns the square root of the extended double-precision floating-point 5542 | value `a'. The operation is performed according to the IEC/IEEE Standard 5543 | for Binary Floating-Point Arithmetic. 
5544 *----------------------------------------------------------------------------*/ 5545 5546 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5547 { 5548 flag aSign; 5549 int32 aExp, zExp; 5550 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5551 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5552 floatx80 z; 5553 5554 aSig0 = extractFloatx80Frac( a ); 5555 aExp = extractFloatx80Exp( a ); 5556 aSign = extractFloatx80Sign( a ); 5557 if ( aExp == 0x7FFF ) { 5558 if ((uint64_t)(aSig0 << 1)) { 5559 return propagateFloatx80NaN(a, a, status); 5560 } 5561 if ( ! aSign ) return a; 5562 goto invalid; 5563 } 5564 if ( aSign ) { 5565 if ( ( aExp | aSig0 ) == 0 ) return a; 5566 invalid: 5567 float_raise(float_flag_invalid, status); 5568 z.low = floatx80_default_nan_low; 5569 z.high = floatx80_default_nan_high; 5570 return z; 5571 } 5572 if ( aExp == 0 ) { 5573 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5574 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5575 } 5576 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5577 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5578 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5579 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5580 doubleZSig0 = zSig0<<1; 5581 mul64To128( zSig0, zSig0, &term0, &term1 ); 5582 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5583 while ( (int64_t) rem0 < 0 ) { 5584 --zSig0; 5585 doubleZSig0 -= 2; 5586 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5587 } 5588 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5589 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5590 if ( zSig1 == 0 ) zSig1 = 1; 5591 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5592 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5593 mul64To128( zSig1, zSig1, &term2, &term3 ); 5594 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5595 while ( (int64_t) rem1 < 0 ) { 5596 --zSig1; 5597 shortShift128Left( 0, zSig1, 1, &term2, &term3 
); 5598 term3 |= 1; 5599 term2 |= doubleZSig0; 5600 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5601 } 5602 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5603 } 5604 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5605 zSig0 |= doubleZSig0; 5606 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5607 0, zExp, zSig0, zSig1, status); 5608 } 5609 5610 /*---------------------------------------------------------------------------- 5611 | Returns 1 if the extended double-precision floating-point value `a' is equal 5612 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5613 | raised if either operand is a NaN. Otherwise, the comparison is performed 5614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5615 *----------------------------------------------------------------------------*/ 5616 5617 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5618 { 5619 5620 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5621 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5622 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5623 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5624 ) { 5625 float_raise(float_flag_invalid, status); 5626 return 0; 5627 } 5628 return 5629 ( a.low == b.low ) 5630 && ( ( a.high == b.high ) 5631 || ( ( a.low == 0 ) 5632 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5633 ); 5634 5635 } 5636 5637 /*---------------------------------------------------------------------------- 5638 | Returns 1 if the extended double-precision floating-point value `a' is 5639 | less than or equal to the corresponding value `b', and 0 otherwise. The 5640 | invalid exception is raised if either operand is a NaN. The comparison is 5641 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5642 | Arithmetic. 
5643 *----------------------------------------------------------------------------*/ 5644 5645 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5646 { 5647 flag aSign, bSign; 5648 5649 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5650 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5651 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5652 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5653 ) { 5654 float_raise(float_flag_invalid, status); 5655 return 0; 5656 } 5657 aSign = extractFloatx80Sign( a ); 5658 bSign = extractFloatx80Sign( b ); 5659 if ( aSign != bSign ) { 5660 return 5661 aSign 5662 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5663 == 0 ); 5664 } 5665 return 5666 aSign ? le128( b.high, b.low, a.high, a.low ) 5667 : le128( a.high, a.low, b.high, b.low ); 5668 5669 } 5670 5671 /*---------------------------------------------------------------------------- 5672 | Returns 1 if the extended double-precision floating-point value `a' is 5673 | less than the corresponding value `b', and 0 otherwise. The invalid 5674 | exception is raised if either operand is a NaN. The comparison is performed 5675 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5676 *----------------------------------------------------------------------------*/ 5677 5678 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5679 { 5680 flag aSign, bSign; 5681 5682 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5683 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5684 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5685 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5686 ) { 5687 float_raise(float_flag_invalid, status); 5688 return 0; 5689 } 5690 aSign = extractFloatx80Sign( a ); 5691 bSign = extractFloatx80Sign( b ); 5692 if ( aSign != bSign ) { 5693 return 5694 aSign 5695 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5696 != 0 ); 5697 } 5698 return 5699 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 5700 : lt128( a.high, a.low, b.high, b.low ); 5701 5702 } 5703 5704 /*---------------------------------------------------------------------------- 5705 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5706 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5707 | either operand is a NaN. The comparison is performed according to the 5708 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5709 *----------------------------------------------------------------------------*/ 5710 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5711 { 5712 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5713 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5714 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5715 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5716 ) { 5717 float_raise(float_flag_invalid, status); 5718 return 1; 5719 } 5720 return 0; 5721 } 5722 5723 /*---------------------------------------------------------------------------- 5724 | Returns 1 if the extended double-precision floating-point value `a' is 5725 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5726 | cause an exception. The comparison is performed according to the IEC/IEEE 5727 | Standard for Binary Floating-Point Arithmetic. 
5728 *----------------------------------------------------------------------------*/ 5729 5730 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5731 { 5732 5733 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5734 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5735 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5736 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5737 ) { 5738 if ( floatx80_is_signaling_nan( a ) 5739 || floatx80_is_signaling_nan( b ) ) { 5740 float_raise(float_flag_invalid, status); 5741 } 5742 return 0; 5743 } 5744 return 5745 ( a.low == b.low ) 5746 && ( ( a.high == b.high ) 5747 || ( ( a.low == 0 ) 5748 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5749 ); 5750 5751 } 5752 5753 /*---------------------------------------------------------------------------- 5754 | Returns 1 if the extended double-precision floating-point value `a' is less 5755 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5756 | do not cause an exception. Otherwise, the comparison is performed according 5757 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5758 *----------------------------------------------------------------------------*/ 5759 5760 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5761 { 5762 flag aSign, bSign; 5763 5764 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5765 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5766 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5767 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5768 ) { 5769 if ( floatx80_is_signaling_nan( a ) 5770 || floatx80_is_signaling_nan( b ) ) { 5771 float_raise(float_flag_invalid, status); 5772 } 5773 return 0; 5774 } 5775 aSign = extractFloatx80Sign( a ); 5776 bSign = extractFloatx80Sign( b ); 5777 if ( aSign != bSign ) { 5778 return 5779 aSign 5780 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5781 == 0 ); 5782 } 5783 return 5784 aSign ? 
le128( b.high, b.low, a.high, a.low ) 5785 : le128( a.high, a.low, b.high, b.low ); 5786 5787 } 5788 5789 /*---------------------------------------------------------------------------- 5790 | Returns 1 if the extended double-precision floating-point value `a' is less 5791 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5792 | an exception. Otherwise, the comparison is performed according to the 5793 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5794 *----------------------------------------------------------------------------*/ 5795 5796 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5797 { 5798 flag aSign, bSign; 5799 5800 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5801 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5802 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5803 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5804 ) { 5805 if ( floatx80_is_signaling_nan( a ) 5806 || floatx80_is_signaling_nan( b ) ) { 5807 float_raise(float_flag_invalid, status); 5808 } 5809 return 0; 5810 } 5811 aSign = extractFloatx80Sign( a ); 5812 bSign = extractFloatx80Sign( b ); 5813 if ( aSign != bSign ) { 5814 return 5815 aSign 5816 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5817 != 0 ); 5818 } 5819 return 5820 aSign ? lt128( b.high, b.low, a.high, a.low ) 5821 : lt128( a.high, a.low, b.high, b.low ); 5822 5823 } 5824 5825 /*---------------------------------------------------------------------------- 5826 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5827 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5828 | The comparison is performed according to the IEC/IEEE Standard for Binary 5829 | Floating-Point Arithmetic. 
5830 *----------------------------------------------------------------------------*/ 5831 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5832 { 5833 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5834 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5835 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5836 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5837 ) { 5838 if ( floatx80_is_signaling_nan( a ) 5839 || floatx80_is_signaling_nan( b ) ) { 5840 float_raise(float_flag_invalid, status); 5841 } 5842 return 1; 5843 } 5844 return 0; 5845 } 5846 5847 /*---------------------------------------------------------------------------- 5848 | Returns the result of converting the quadruple-precision floating-point 5849 | value `a' to the 32-bit two's complement integer format. The conversion 5850 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5851 | Arithmetic---which means in particular that the conversion is rounded 5852 | according to the current rounding mode. If `a' is a NaN, the largest 5853 | positive integer is returned. Otherwise, if the conversion overflows, the 5854 | largest integer with the same sign as `a' is returned. 
5855 *----------------------------------------------------------------------------*/ 5856 5857 int32 float128_to_int32(float128 a, float_status *status) 5858 { 5859 flag aSign; 5860 int32 aExp, shiftCount; 5861 uint64_t aSig0, aSig1; 5862 5863 aSig1 = extractFloat128Frac1( a ); 5864 aSig0 = extractFloat128Frac0( a ); 5865 aExp = extractFloat128Exp( a ); 5866 aSign = extractFloat128Sign( a ); 5867 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5868 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5869 aSig0 |= ( aSig1 != 0 ); 5870 shiftCount = 0x4028 - aExp; 5871 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5872 return roundAndPackInt32(aSign, aSig0, status); 5873 5874 } 5875 5876 /*---------------------------------------------------------------------------- 5877 | Returns the result of converting the quadruple-precision floating-point 5878 | value `a' to the 32-bit two's complement integer format. The conversion 5879 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5880 | Arithmetic, except that the conversion is always rounded toward zero. If 5881 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5882 | conversion overflows, the largest integer with the same sign as `a' is 5883 | returned. 
5884 *----------------------------------------------------------------------------*/ 5885 5886 int32 float128_to_int32_round_to_zero(float128 a, float_status *status) 5887 { 5888 flag aSign; 5889 int32 aExp, shiftCount; 5890 uint64_t aSig0, aSig1, savedASig; 5891 int32_t z; 5892 5893 aSig1 = extractFloat128Frac1( a ); 5894 aSig0 = extractFloat128Frac0( a ); 5895 aExp = extractFloat128Exp( a ); 5896 aSign = extractFloat128Sign( a ); 5897 aSig0 |= ( aSig1 != 0 ); 5898 if ( 0x401E < aExp ) { 5899 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5900 goto invalid; 5901 } 5902 else if ( aExp < 0x3FFF ) { 5903 if (aExp || aSig0) { 5904 status->float_exception_flags |= float_flag_inexact; 5905 } 5906 return 0; 5907 } 5908 aSig0 |= LIT64( 0x0001000000000000 ); 5909 shiftCount = 0x402F - aExp; 5910 savedASig = aSig0; 5911 aSig0 >>= shiftCount; 5912 z = aSig0; 5913 if ( aSign ) z = - z; 5914 if ( ( z < 0 ) ^ aSign ) { 5915 invalid: 5916 float_raise(float_flag_invalid, status); 5917 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5918 } 5919 if ( ( aSig0<<shiftCount ) != savedASig ) { 5920 status->float_exception_flags |= float_flag_inexact; 5921 } 5922 return z; 5923 5924 } 5925 5926 /*---------------------------------------------------------------------------- 5927 | Returns the result of converting the quadruple-precision floating-point 5928 | value `a' to the 64-bit two's complement integer format. The conversion 5929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5930 | Arithmetic---which means in particular that the conversion is rounded 5931 | according to the current rounding mode. If `a' is a NaN, the largest 5932 | positive integer is returned. Otherwise, if the conversion overflows, the 5933 | largest integer with the same sign as `a' is returned. 
5934 *----------------------------------------------------------------------------*/ 5935 5936 int64 float128_to_int64(float128 a, float_status *status) 5937 { 5938 flag aSign; 5939 int32 aExp, shiftCount; 5940 uint64_t aSig0, aSig1; 5941 5942 aSig1 = extractFloat128Frac1( a ); 5943 aSig0 = extractFloat128Frac0( a ); 5944 aExp = extractFloat128Exp( a ); 5945 aSign = extractFloat128Sign( a ); 5946 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5947 shiftCount = 0x402F - aExp; 5948 if ( shiftCount <= 0 ) { 5949 if ( 0x403E < aExp ) { 5950 float_raise(float_flag_invalid, status); 5951 if ( ! aSign 5952 || ( ( aExp == 0x7FFF ) 5953 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5954 ) 5955 ) { 5956 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5957 } 5958 return (int64_t) LIT64( 0x8000000000000000 ); 5959 } 5960 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5961 } 5962 else { 5963 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5964 } 5965 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5966 5967 } 5968 5969 /*---------------------------------------------------------------------------- 5970 | Returns the result of converting the quadruple-precision floating-point 5971 | value `a' to the 64-bit two's complement integer format. The conversion 5972 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5973 | Arithmetic, except that the conversion is always rounded toward zero. 5974 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5975 | the conversion overflows, the largest integer with the same sign as `a' is 5976 | returned. 
5977 *----------------------------------------------------------------------------*/ 5978 5979 int64 float128_to_int64_round_to_zero(float128 a, float_status *status) 5980 { 5981 flag aSign; 5982 int32 aExp, shiftCount; 5983 uint64_t aSig0, aSig1; 5984 int64 z; 5985 5986 aSig1 = extractFloat128Frac1( a ); 5987 aSig0 = extractFloat128Frac0( a ); 5988 aExp = extractFloat128Exp( a ); 5989 aSign = extractFloat128Sign( a ); 5990 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5991 shiftCount = aExp - 0x402F; 5992 if ( 0 < shiftCount ) { 5993 if ( 0x403E <= aExp ) { 5994 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5995 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5996 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5997 if (aSig1) { 5998 status->float_exception_flags |= float_flag_inexact; 5999 } 6000 } 6001 else { 6002 float_raise(float_flag_invalid, status); 6003 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6004 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6005 } 6006 } 6007 return (int64_t) LIT64( 0x8000000000000000 ); 6008 } 6009 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6010 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6011 status->float_exception_flags |= float_flag_inexact; 6012 } 6013 } 6014 else { 6015 if ( aExp < 0x3FFF ) { 6016 if ( aExp | aSig0 | aSig1 ) { 6017 status->float_exception_flags |= float_flag_inexact; 6018 } 6019 return 0; 6020 } 6021 z = aSig0>>( - shiftCount ); 6022 if ( aSig1 6023 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6024 status->float_exception_flags |= float_flag_inexact; 6025 } 6026 } 6027 if ( aSign ) z = - z; 6028 return z; 6029 6030 } 6031 6032 /*---------------------------------------------------------------------------- 6033 | Returns the result of converting the quadruple-precision floating-point 6034 | value `a' to the single-precision floating-point format. The conversion 6035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6036 | Arithmetic. 
6037 *----------------------------------------------------------------------------*/ 6038 6039 float32 float128_to_float32(float128 a, float_status *status) 6040 { 6041 flag aSign; 6042 int32 aExp; 6043 uint64_t aSig0, aSig1; 6044 uint32_t zSig; 6045 6046 aSig1 = extractFloat128Frac1( a ); 6047 aSig0 = extractFloat128Frac0( a ); 6048 aExp = extractFloat128Exp( a ); 6049 aSign = extractFloat128Sign( a ); 6050 if ( aExp == 0x7FFF ) { 6051 if ( aSig0 | aSig1 ) { 6052 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6053 } 6054 return packFloat32( aSign, 0xFF, 0 ); 6055 } 6056 aSig0 |= ( aSig1 != 0 ); 6057 shift64RightJamming( aSig0, 18, &aSig0 ); 6058 zSig = aSig0; 6059 if ( aExp || zSig ) { 6060 zSig |= 0x40000000; 6061 aExp -= 0x3F81; 6062 } 6063 return roundAndPackFloat32(aSign, aExp, zSig, status); 6064 6065 } 6066 6067 /*---------------------------------------------------------------------------- 6068 | Returns the result of converting the quadruple-precision floating-point 6069 | value `a' to the double-precision floating-point format. The conversion 6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6071 | Arithmetic. 
6072 *----------------------------------------------------------------------------*/ 6073 6074 float64 float128_to_float64(float128 a, float_status *status) 6075 { 6076 flag aSign; 6077 int32 aExp; 6078 uint64_t aSig0, aSig1; 6079 6080 aSig1 = extractFloat128Frac1( a ); 6081 aSig0 = extractFloat128Frac0( a ); 6082 aExp = extractFloat128Exp( a ); 6083 aSign = extractFloat128Sign( a ); 6084 if ( aExp == 0x7FFF ) { 6085 if ( aSig0 | aSig1 ) { 6086 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6087 } 6088 return packFloat64( aSign, 0x7FF, 0 ); 6089 } 6090 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6091 aSig0 |= ( aSig1 != 0 ); 6092 if ( aExp || aSig0 ) { 6093 aSig0 |= LIT64( 0x4000000000000000 ); 6094 aExp -= 0x3C01; 6095 } 6096 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6097 6098 } 6099 6100 /*---------------------------------------------------------------------------- 6101 | Returns the result of converting the quadruple-precision floating-point 6102 | value `a' to the extended double-precision floating-point format. The 6103 | conversion is performed according to the IEC/IEEE Standard for Binary 6104 | Floating-Point Arithmetic. 
6105 *----------------------------------------------------------------------------*/ 6106 6107 floatx80 float128_to_floatx80(float128 a, float_status *status) 6108 { 6109 flag aSign; 6110 int32 aExp; 6111 uint64_t aSig0, aSig1; 6112 6113 aSig1 = extractFloat128Frac1( a ); 6114 aSig0 = extractFloat128Frac0( a ); 6115 aExp = extractFloat128Exp( a ); 6116 aSign = extractFloat128Sign( a ); 6117 if ( aExp == 0x7FFF ) { 6118 if ( aSig0 | aSig1 ) { 6119 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6120 } 6121 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6122 } 6123 if ( aExp == 0 ) { 6124 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6125 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6126 } 6127 else { 6128 aSig0 |= LIT64( 0x0001000000000000 ); 6129 } 6130 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6131 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6132 6133 } 6134 6135 /*---------------------------------------------------------------------------- 6136 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6137 | returns the result as a quadruple-precision floating-point value. The 6138 | operation is performed according to the IEC/IEEE Standard for Binary 6139 | Floating-Point Arithmetic. 
6140 *----------------------------------------------------------------------------*/ 6141 6142 float128 float128_round_to_int(float128 a, float_status *status) 6143 { 6144 flag aSign; 6145 int32 aExp; 6146 uint64_t lastBitMask, roundBitsMask; 6147 float128 z; 6148 6149 aExp = extractFloat128Exp( a ); 6150 if ( 0x402F <= aExp ) { 6151 if ( 0x406F <= aExp ) { 6152 if ( ( aExp == 0x7FFF ) 6153 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6154 ) { 6155 return propagateFloat128NaN(a, a, status); 6156 } 6157 return a; 6158 } 6159 lastBitMask = 1; 6160 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6161 roundBitsMask = lastBitMask - 1; 6162 z = a; 6163 switch (status->float_rounding_mode) { 6164 case float_round_nearest_even: 6165 if ( lastBitMask ) { 6166 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6167 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6168 } 6169 else { 6170 if ( (int64_t) z.low < 0 ) { 6171 ++z.high; 6172 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6173 } 6174 } 6175 break; 6176 case float_round_ties_away: 6177 if (lastBitMask) { 6178 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6179 } else { 6180 if ((int64_t) z.low < 0) { 6181 ++z.high; 6182 } 6183 } 6184 break; 6185 case float_round_to_zero: 6186 break; 6187 case float_round_up: 6188 if (!extractFloat128Sign(z)) { 6189 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6190 } 6191 break; 6192 case float_round_down: 6193 if (extractFloat128Sign(z)) { 6194 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6195 } 6196 break; 6197 default: 6198 abort(); 6199 } 6200 z.low &= ~ roundBitsMask; 6201 } 6202 else { 6203 if ( aExp < 0x3FFF ) { 6204 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6205 status->float_exception_flags |= float_flag_inexact; 6206 aSign = extractFloat128Sign( a ); 6207 switch (status->float_rounding_mode) { 6208 case float_round_nearest_even: 6209 if ( ( aExp == 0x3FFE ) 6210 && ( 
extractFloat128Frac0( a ) 6211 | extractFloat128Frac1( a ) ) 6212 ) { 6213 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6214 } 6215 break; 6216 case float_round_ties_away: 6217 if (aExp == 0x3FFE) { 6218 return packFloat128(aSign, 0x3FFF, 0, 0); 6219 } 6220 break; 6221 case float_round_down: 6222 return 6223 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6224 : packFloat128( 0, 0, 0, 0 ); 6225 case float_round_up: 6226 return 6227 aSign ? packFloat128( 1, 0, 0, 0 ) 6228 : packFloat128( 0, 0x3FFF, 0, 0 ); 6229 } 6230 return packFloat128( aSign, 0, 0, 0 ); 6231 } 6232 lastBitMask = 1; 6233 lastBitMask <<= 0x402F - aExp; 6234 roundBitsMask = lastBitMask - 1; 6235 z.low = 0; 6236 z.high = a.high; 6237 switch (status->float_rounding_mode) { 6238 case float_round_nearest_even: 6239 z.high += lastBitMask>>1; 6240 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6241 z.high &= ~ lastBitMask; 6242 } 6243 break; 6244 case float_round_ties_away: 6245 z.high += lastBitMask>>1; 6246 break; 6247 case float_round_to_zero: 6248 break; 6249 case float_round_up: 6250 if (!extractFloat128Sign(z)) { 6251 z.high |= ( a.low != 0 ); 6252 z.high += roundBitsMask; 6253 } 6254 break; 6255 case float_round_down: 6256 if (extractFloat128Sign(z)) { 6257 z.high |= (a.low != 0); 6258 z.high += roundBitsMask; 6259 } 6260 break; 6261 default: 6262 abort(); 6263 } 6264 z.high &= ~ roundBitsMask; 6265 } 6266 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6267 status->float_exception_flags |= float_flag_inexact; 6268 } 6269 return z; 6270 6271 } 6272 6273 /*---------------------------------------------------------------------------- 6274 | Returns the result of adding the absolute values of the quadruple-precision 6275 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6276 | before being returned. `zSign' is ignored if the result is a NaN. 6277 | The addition is performed according to the IEC/IEEE Standard for Binary 6278 | Floating-Point Arithmetic. 
6279 *----------------------------------------------------------------------------*/ 6280 6281 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6282 float_status *status) 6283 { 6284 int32 aExp, bExp, zExp; 6285 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6286 int32 expDiff; 6287 6288 aSig1 = extractFloat128Frac1( a ); 6289 aSig0 = extractFloat128Frac0( a ); 6290 aExp = extractFloat128Exp( a ); 6291 bSig1 = extractFloat128Frac1( b ); 6292 bSig0 = extractFloat128Frac0( b ); 6293 bExp = extractFloat128Exp( b ); 6294 expDiff = aExp - bExp; 6295 if ( 0 < expDiff ) { 6296 if ( aExp == 0x7FFF ) { 6297 if (aSig0 | aSig1) { 6298 return propagateFloat128NaN(a, b, status); 6299 } 6300 return a; 6301 } 6302 if ( bExp == 0 ) { 6303 --expDiff; 6304 } 6305 else { 6306 bSig0 |= LIT64( 0x0001000000000000 ); 6307 } 6308 shift128ExtraRightJamming( 6309 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6310 zExp = aExp; 6311 } 6312 else if ( expDiff < 0 ) { 6313 if ( bExp == 0x7FFF ) { 6314 if (bSig0 | bSig1) { 6315 return propagateFloat128NaN(a, b, status); 6316 } 6317 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6318 } 6319 if ( aExp == 0 ) { 6320 ++expDiff; 6321 } 6322 else { 6323 aSig0 |= LIT64( 0x0001000000000000 ); 6324 } 6325 shift128ExtraRightJamming( 6326 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6327 zExp = bExp; 6328 } 6329 else { 6330 if ( aExp == 0x7FFF ) { 6331 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6332 return propagateFloat128NaN(a, b, status); 6333 } 6334 return a; 6335 } 6336 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6337 if ( aExp == 0 ) { 6338 if (status->flush_to_zero) { 6339 if (zSig0 | zSig1) { 6340 float_raise(float_flag_output_denormal, status); 6341 } 6342 return packFloat128(zSign, 0, 0, 0); 6343 } 6344 return packFloat128( zSign, 0, zSig0, zSig1 ); 6345 } 6346 zSig2 = 0; 6347 zSig0 |= LIT64( 0x0002000000000000 ); 6348 zExp = aExp; 6349 goto shiftRight1; 6350 } 6351 aSig0 |= LIT64( 0x0001000000000000 
); 6352 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6353 --zExp; 6354 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6355 ++zExp; 6356 shiftRight1: 6357 shift128ExtraRightJamming( 6358 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6359 roundAndPack: 6360 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6361 6362 } 6363 6364 /*---------------------------------------------------------------------------- 6365 | Returns the result of subtracting the absolute values of the quadruple- 6366 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6367 | difference is negated before being returned. `zSign' is ignored if the 6368 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6369 | Standard for Binary Floating-Point Arithmetic. 6370 *----------------------------------------------------------------------------*/ 6371 6372 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6373 float_status *status) 6374 { 6375 int32 aExp, bExp, zExp; 6376 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6377 int32 expDiff; 6378 float128 z; 6379 6380 aSig1 = extractFloat128Frac1( a ); 6381 aSig0 = extractFloat128Frac0( a ); 6382 aExp = extractFloat128Exp( a ); 6383 bSig1 = extractFloat128Frac1( b ); 6384 bSig0 = extractFloat128Frac0( b ); 6385 bExp = extractFloat128Exp( b ); 6386 expDiff = aExp - bExp; 6387 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6388 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6389 if ( 0 < expDiff ) goto aExpBigger; 6390 if ( expDiff < 0 ) goto bExpBigger; 6391 if ( aExp == 0x7FFF ) { 6392 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6393 return propagateFloat128NaN(a, b, status); 6394 } 6395 float_raise(float_flag_invalid, status); 6396 z.low = float128_default_nan_low; 6397 z.high = float128_default_nan_high; 6398 return z; 6399 } 6400 if ( aExp == 0 ) { 6401 aExp = 1; 6402 bExp = 1; 6403 } 6404 if ( bSig0 < aSig0 ) goto aBigger; 6405 if ( aSig0 
< bSig0 ) goto bBigger; 6406 if ( bSig1 < aSig1 ) goto aBigger; 6407 if ( aSig1 < bSig1 ) goto bBigger; 6408 return packFloat128(status->float_rounding_mode == float_round_down, 6409 0, 0, 0); 6410 bExpBigger: 6411 if ( bExp == 0x7FFF ) { 6412 if (bSig0 | bSig1) { 6413 return propagateFloat128NaN(a, b, status); 6414 } 6415 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6416 } 6417 if ( aExp == 0 ) { 6418 ++expDiff; 6419 } 6420 else { 6421 aSig0 |= LIT64( 0x4000000000000000 ); 6422 } 6423 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6424 bSig0 |= LIT64( 0x4000000000000000 ); 6425 bBigger: 6426 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6427 zExp = bExp; 6428 zSign ^= 1; 6429 goto normalizeRoundAndPack; 6430 aExpBigger: 6431 if ( aExp == 0x7FFF ) { 6432 if (aSig0 | aSig1) { 6433 return propagateFloat128NaN(a, b, status); 6434 } 6435 return a; 6436 } 6437 if ( bExp == 0 ) { 6438 --expDiff; 6439 } 6440 else { 6441 bSig0 |= LIT64( 0x4000000000000000 ); 6442 } 6443 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6444 aSig0 |= LIT64( 0x4000000000000000 ); 6445 aBigger: 6446 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6447 zExp = aExp; 6448 normalizeRoundAndPack: 6449 --zExp; 6450 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6451 status); 6452 6453 } 6454 6455 /*---------------------------------------------------------------------------- 6456 | Returns the result of adding the quadruple-precision floating-point values 6457 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6458 | for Binary Floating-Point Arithmetic. 
6459 *----------------------------------------------------------------------------*/ 6460 6461 float128 float128_add(float128 a, float128 b, float_status *status) 6462 { 6463 flag aSign, bSign; 6464 6465 aSign = extractFloat128Sign( a ); 6466 bSign = extractFloat128Sign( b ); 6467 if ( aSign == bSign ) { 6468 return addFloat128Sigs(a, b, aSign, status); 6469 } 6470 else { 6471 return subFloat128Sigs(a, b, aSign, status); 6472 } 6473 6474 } 6475 6476 /*---------------------------------------------------------------------------- 6477 | Returns the result of subtracting the quadruple-precision floating-point 6478 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6479 | Standard for Binary Floating-Point Arithmetic. 6480 *----------------------------------------------------------------------------*/ 6481 6482 float128 float128_sub(float128 a, float128 b, float_status *status) 6483 { 6484 flag aSign, bSign; 6485 6486 aSign = extractFloat128Sign( a ); 6487 bSign = extractFloat128Sign( b ); 6488 if ( aSign == bSign ) { 6489 return subFloat128Sigs(a, b, aSign, status); 6490 } 6491 else { 6492 return addFloat128Sigs(a, b, aSign, status); 6493 } 6494 6495 } 6496 6497 /*---------------------------------------------------------------------------- 6498 | Returns the result of multiplying the quadruple-precision floating-point 6499 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6500 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
    float128 z;

    /* Split both operands into sign, exponent and two fraction words. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    /* a has the maximum exponent: NaN (non-zero fraction) or infinity. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity * zero is an invalid operation */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* b has the maximum exponent: NaN or infinity. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* zero * infinity is an invalid operation */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            /* Raise invalid and return the default NaN. */
            float_raise(float_flag_invalid, status);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* Zero exponent field: a zero operand gives a signed zero; any other
     * value is normalized before multiplying. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    /* Make a's leading bit (bit 48 of the high word) explicit. */
    aSig0 |= LIT64( 0x0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /* b's leading bit was never set above; adding a's significand to the
     * upper half of the product presumably accounts for it. */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    /* Product reached bit 49: shift right one place, bump the exponent. */
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    /* Split both operands into sign, exponent and two fraction words. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* a is NaN or infinity. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is an invalid operation */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* b is NaN or infinity; finite / infinity gives a signed zero. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero: 0 / 0 is invalid, anything else raises
             * divide-by-zero and returns a signed infinity. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                z.low = float128_default_nan_low;
                z.high = float128_default_nan_high;
                return z;
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the leading bit of each significand, then shift left by 15. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* Keep the dividend significand below the divisor significand. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High quotient word: estimate, then decrement while the remainder
     * is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low quotient word: the estimate is only corrected when its low 14
     * bits are small (<= 4); a non-zero remainder then sets the lowest bit
     * as a sticky bit. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32 aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;
    float128 z;

    /* The sign of b is never extracted; only its magnitude is used. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* a is NaN or infinity: propagate a NaN, otherwise the remainder of
     * an infinity is invalid. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    /* b is NaN or infinity: finite a with infinite b returns a unchanged. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* A zero divisor is invalid. */
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            z.low = float128_default_nan_low;
            z.high = float128_default_nan_high;
            return z;
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned as is. */
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* a's exponent is at least two below b's: a is returned unchanged. */
    if ( expDiff < -1 ) return a;
    /* Set the leading bits and left-align; a is shifted one place less
     * when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce 61 exponent positions per iteration.  The quotient estimate
     * is lowered by 4 (clamped at 0) before it is used. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial step, covering the remaining -expDiff..0 positions. */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, remembering the last
     * non-negative value in alternateASig. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean is the sum of the two candidates.  The non-negative one is
     * chosen when the sum is negative, or when the sum is zero and q is
     * odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32 aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaN propagates; +infinity is returned as is; -infinity is invalid. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    /* Negative input: -0 is returned as is, anything else is invalid. */
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    /* +0 yields +0; other values with a zero exponent are normalized. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* 32-bit root estimate, refined to 64 bits with one division.  The
     * left shift is one place shorter when the exponent is odd. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; decrement zSig0 while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word: corrected only when its low 13 bits are small (<= 5);
     * a non-zero remainder then sets the lowest bit as a sticky bit. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    /* The result is always packed with a positive sign. */
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise.  The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Any NaN operand (maximum exponent, non-zero fraction) raises invalid
     * and compares unequal. */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when bit-identical, or when both are zeros of either sign
     * (the <<1 discards the sign bit of the high word). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
| exception is raised if either operand is a NaN.  The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6866 *----------------------------------------------------------------------------*/ 6867 6868 int float128_le(float128 a, float128 b, float_status *status) 6869 { 6870 flag aSign, bSign; 6871 6872 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6873 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6874 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6875 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6876 ) { 6877 float_raise(float_flag_invalid, status); 6878 return 0; 6879 } 6880 aSign = extractFloat128Sign( a ); 6881 bSign = extractFloat128Sign( b ); 6882 if ( aSign != bSign ) { 6883 return 6884 aSign 6885 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6886 == 0 ); 6887 } 6888 return 6889 aSign ? le128( b.high, b.low, a.high, a.low ) 6890 : le128( a.high, a.low, b.high, b.low ); 6891 6892 } 6893 6894 /*---------------------------------------------------------------------------- 6895 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6896 | the corresponding value `b', and 0 otherwise. The invalid exception is 6897 | raised if either operand is a NaN. The comparison is performed according 6898 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6899 *----------------------------------------------------------------------------*/ 6900 6901 int float128_lt(float128 a, float128 b, float_status *status) 6902 { 6903 flag aSign, bSign; 6904 6905 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6906 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6907 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6908 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6909 ) { 6910 float_raise(float_flag_invalid, status); 6911 return 0; 6912 } 6913 aSign = extractFloat128Sign( a ); 6914 bSign = extractFloat128Sign( b ); 6915 if ( aSign != bSign ) { 6916 return 6917 aSign 6918 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6919 != 0 ); 6920 } 6921 return 6922 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6923 : lt128( a.high, a.low, b.high, b.low ); 6924 6925 } 6926 6927 /*---------------------------------------------------------------------------- 6928 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6929 | be compared, and 0 otherwise. The invalid exception is raised if either 6930 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6931 | Standard for Binary Floating-Point Arithmetic. 6932 *----------------------------------------------------------------------------*/ 6933 6934 int float128_unordered(float128 a, float128 b, float_status *status) 6935 { 6936 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6937 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6938 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6939 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6940 ) { 6941 float_raise(float_flag_invalid, status); 6942 return 1; 6943 } 6944 return 0; 6945 } 6946 6947 /*---------------------------------------------------------------------------- 6948 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6949 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6950 | exception. The comparison is performed according to the IEC/IEEE Standard 6951 | for Binary Floating-Point Arithmetic. 
6952 *----------------------------------------------------------------------------*/ 6953 6954 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6955 { 6956 6957 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6958 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6959 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6960 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6961 ) { 6962 if ( float128_is_signaling_nan( a ) 6963 || float128_is_signaling_nan( b ) ) { 6964 float_raise(float_flag_invalid, status); 6965 } 6966 return 0; 6967 } 6968 return 6969 ( a.low == b.low ) 6970 && ( ( a.high == b.high ) 6971 || ( ( a.low == 0 ) 6972 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6973 ); 6974 6975 } 6976 6977 /*---------------------------------------------------------------------------- 6978 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6979 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6980 | cause an exception. Otherwise, the comparison is performed according to the 6981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6982 *----------------------------------------------------------------------------*/ 6983 6984 int float128_le_quiet(float128 a, float128 b, float_status *status) 6985 { 6986 flag aSign, bSign; 6987 6988 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6989 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6990 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6991 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6992 ) { 6993 if ( float128_is_signaling_nan( a ) 6994 || float128_is_signaling_nan( b ) ) { 6995 float_raise(float_flag_invalid, status); 6996 } 6997 return 0; 6998 } 6999 aSign = extractFloat128Sign( a ); 7000 bSign = extractFloat128Sign( b ); 7001 if ( aSign != bSign ) { 7002 return 7003 aSign 7004 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7005 == 0 ); 7006 } 7007 return 7008 aSign ? 
le128( b.high, b.low, a.high, a.low ) 7009 : le128( a.high, a.low, b.high, b.low ); 7010 7011 } 7012 7013 /*---------------------------------------------------------------------------- 7014 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7015 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7016 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7017 | Standard for Binary Floating-Point Arithmetic. 7018 *----------------------------------------------------------------------------*/ 7019 7020 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7021 { 7022 flag aSign, bSign; 7023 7024 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7025 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7026 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7027 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7028 ) { 7029 if ( float128_is_signaling_nan( a ) 7030 || float128_is_signaling_nan( b ) ) { 7031 float_raise(float_flag_invalid, status); 7032 } 7033 return 0; 7034 } 7035 aSign = extractFloat128Sign( a ); 7036 bSign = extractFloat128Sign( b ); 7037 if ( aSign != bSign ) { 7038 return 7039 aSign 7040 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7041 != 0 ); 7042 } 7043 return 7044 aSign ? lt128( b.high, b.low, a.high, a.low ) 7045 : lt128( a.high, a.low, b.high, b.low ); 7046 7047 } 7048 7049 /*---------------------------------------------------------------------------- 7050 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7051 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7052 | comparison is performed according to the IEC/IEEE Standard for Binary 7053 | Floating-Point Arithmetic. 
7054 *----------------------------------------------------------------------------*/ 7055 7056 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7057 { 7058 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7059 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7060 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7061 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7062 ) { 7063 if ( float128_is_signaling_nan( a ) 7064 || float128_is_signaling_nan( b ) ) { 7065 float_raise(float_flag_invalid, status); 7066 } 7067 return 1; 7068 } 7069 return 0; 7070 } 7071 7072 /* misc functions */ 7073 float32 uint32_to_float32(uint32_t a, float_status *status) 7074 { 7075 return int64_to_float32(a, status); 7076 } 7077 7078 float64 uint32_to_float64(uint32_t a, float_status *status) 7079 { 7080 return int64_to_float64(a, status); 7081 } 7082 7083 uint32 float32_to_uint32(float32 a, float_status *status) 7084 { 7085 int64_t v; 7086 uint32 res; 7087 int old_exc_flags = get_float_exception_flags(status); 7088 7089 v = float32_to_int64(a, status); 7090 if (v < 0) { 7091 res = 0; 7092 } else if (v > 0xffffffff) { 7093 res = 0xffffffff; 7094 } else { 7095 return v; 7096 } 7097 set_float_exception_flags(old_exc_flags, status); 7098 float_raise(float_flag_invalid, status); 7099 return res; 7100 } 7101 7102 uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status) 7103 { 7104 int64_t v; 7105 uint32 res; 7106 int old_exc_flags = get_float_exception_flags(status); 7107 7108 v = float32_to_int64_round_to_zero(a, status); 7109 if (v < 0) { 7110 res = 0; 7111 } else if (v > 0xffffffff) { 7112 res = 0xffffffff; 7113 } else { 7114 return v; 7115 } 7116 set_float_exception_flags(old_exc_flags, status); 7117 float_raise(float_flag_invalid, status); 7118 return res; 7119 } 7120 7121 int_fast16_t float32_to_int16(float32 a, float_status *status) 7122 { 7123 int32_t v; 7124 int_fast16_t res; 7125 int old_exc_flags = 
get_float_exception_flags(status); 7126 7127 v = float32_to_int32(a, status); 7128 if (v < -0x8000) { 7129 res = -0x8000; 7130 } else if (v > 0x7fff) { 7131 res = 0x7fff; 7132 } else { 7133 return v; 7134 } 7135 7136 set_float_exception_flags(old_exc_flags, status); 7137 float_raise(float_flag_invalid, status); 7138 return res; 7139 } 7140 7141 uint_fast16_t float32_to_uint16(float32 a, float_status *status) 7142 { 7143 int32_t v; 7144 uint_fast16_t res; 7145 int old_exc_flags = get_float_exception_flags(status); 7146 7147 v = float32_to_int32(a, status); 7148 if (v < 0) { 7149 res = 0; 7150 } else if (v > 0xffff) { 7151 res = 0xffff; 7152 } else { 7153 return v; 7154 } 7155 7156 set_float_exception_flags(old_exc_flags, status); 7157 float_raise(float_flag_invalid, status); 7158 return res; 7159 } 7160 7161 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7162 { 7163 int64_t v; 7164 uint_fast16_t res; 7165 int old_exc_flags = get_float_exception_flags(status); 7166 7167 v = float32_to_int64_round_to_zero(a, status); 7168 if (v < 0) { 7169 res = 0; 7170 } else if (v > 0xffff) { 7171 res = 0xffff; 7172 } else { 7173 return v; 7174 } 7175 set_float_exception_flags(old_exc_flags, status); 7176 float_raise(float_flag_invalid, status); 7177 return res; 7178 } 7179 7180 uint32 float64_to_uint32(float64 a, float_status *status) 7181 { 7182 uint64_t v; 7183 uint32 res; 7184 int old_exc_flags = get_float_exception_flags(status); 7185 7186 v = float64_to_uint64(a, status); 7187 if (v > 0xffffffff) { 7188 res = 0xffffffff; 7189 } else { 7190 return v; 7191 } 7192 set_float_exception_flags(old_exc_flags, status); 7193 float_raise(float_flag_invalid, status); 7194 return res; 7195 } 7196 7197 uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status) 7198 { 7199 uint64_t v; 7200 uint32 res; 7201 int old_exc_flags = get_float_exception_flags(status); 7202 7203 v = float64_to_uint64_round_to_zero(a, status); 7204 if (v > 0xffffffff) { 
7205 res = 0xffffffff; 7206 } else { 7207 return v; 7208 } 7209 set_float_exception_flags(old_exc_flags, status); 7210 float_raise(float_flag_invalid, status); 7211 return res; 7212 } 7213 7214 int_fast16_t float64_to_int16(float64 a, float_status *status) 7215 { 7216 int64_t v; 7217 int_fast16_t res; 7218 int old_exc_flags = get_float_exception_flags(status); 7219 7220 v = float64_to_int32(a, status); 7221 if (v < -0x8000) { 7222 res = -0x8000; 7223 } else if (v > 0x7fff) { 7224 res = 0x7fff; 7225 } else { 7226 return v; 7227 } 7228 7229 set_float_exception_flags(old_exc_flags, status); 7230 float_raise(float_flag_invalid, status); 7231 return res; 7232 } 7233 7234 uint_fast16_t float64_to_uint16(float64 a, float_status *status) 7235 { 7236 int64_t v; 7237 uint_fast16_t res; 7238 int old_exc_flags = get_float_exception_flags(status); 7239 7240 v = float64_to_int32(a, status); 7241 if (v < 0) { 7242 res = 0; 7243 } else if (v > 0xffff) { 7244 res = 0xffff; 7245 } else { 7246 return v; 7247 } 7248 7249 set_float_exception_flags(old_exc_flags, status); 7250 float_raise(float_flag_invalid, status); 7251 return res; 7252 } 7253 7254 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7255 { 7256 int64_t v; 7257 uint_fast16_t res; 7258 int old_exc_flags = get_float_exception_flags(status); 7259 7260 v = float64_to_int64_round_to_zero(a, status); 7261 if (v < 0) { 7262 res = 0; 7263 } else if (v > 0xffff) { 7264 res = 0xffff; 7265 } else { 7266 return v; 7267 } 7268 set_float_exception_flags(old_exc_flags, status); 7269 float_raise(float_flag_invalid, status); 7270 return res; 7271 } 7272 7273 /*---------------------------------------------------------------------------- 7274 | Returns the result of converting the double-precision floating-point value 7275 | `a' to the 64-bit unsigned integer format. 
The conversion is 7276 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7277 | Arithmetic---which means in particular that the conversion is rounded 7278 | according to the current rounding mode. If `a' is a NaN, the largest 7279 | positive integer is returned. If the conversion overflows, the 7280 | largest unsigned integer is returned. If 'a' is negative, the value is 7281 | rounded and zero is returned; negative values that do not round to zero 7282 | will raise the inexact exception. 7283 *----------------------------------------------------------------------------*/ 7284 7285 uint64_t float64_to_uint64(float64 a, float_status *status) 7286 { 7287 flag aSign; 7288 int_fast16_t aExp, shiftCount; 7289 uint64_t aSig, aSigExtra; 7290 a = float64_squash_input_denormal(a, status); 7291 7292 aSig = extractFloat64Frac(a); 7293 aExp = extractFloat64Exp(a); 7294 aSign = extractFloat64Sign(a); 7295 if (aSign && (aExp > 1022)) { 7296 float_raise(float_flag_invalid, status); 7297 if (float64_is_any_nan(a)) { 7298 return LIT64(0xFFFFFFFFFFFFFFFF); 7299 } else { 7300 return 0; 7301 } 7302 } 7303 if (aExp) { 7304 aSig |= LIT64(0x0010000000000000); 7305 } 7306 shiftCount = 0x433 - aExp; 7307 if (shiftCount <= 0) { 7308 if (0x43E < aExp) { 7309 float_raise(float_flag_invalid, status); 7310 return LIT64(0xFFFFFFFFFFFFFFFF); 7311 } 7312 aSigExtra = 0; 7313 aSig <<= -shiftCount; 7314 } else { 7315 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7316 } 7317 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7318 } 7319 7320 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7321 { 7322 signed char current_rounding_mode = status->float_rounding_mode; 7323 set_float_rounding_mode(float_round_to_zero, status); 7324 int64_t v = float64_to_uint64(a, status); 7325 set_float_rounding_mode(current_rounding_mode, status); 7326 return v; 7327 } 7328 7329 #define COMPARE(s, nan_exp) \ 7330 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7331 int is_quiet, float_status *status) \ 7332 { \ 7333 flag aSign, bSign; \ 7334 uint ## s ## _t av, bv; \ 7335 a = float ## s ## _squash_input_denormal(a, status); \ 7336 b = float ## s ## _squash_input_denormal(b, status); \ 7337 \ 7338 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7339 extractFloat ## s ## Frac( a ) ) || \ 7340 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7341 extractFloat ## s ## Frac( b ) )) { \ 7342 if (!is_quiet || \ 7343 float ## s ## _is_signaling_nan( a ) || \ 7344 float ## s ## _is_signaling_nan( b ) ) { \ 7345 float_raise(float_flag_invalid, status); \ 7346 } \ 7347 return float_relation_unordered; \ 7348 } \ 7349 aSign = extractFloat ## s ## Sign( a ); \ 7350 bSign = extractFloat ## s ## Sign( b ); \ 7351 av = float ## s ## _val(a); \ 7352 bv = float ## s ## _val(b); \ 7353 if ( aSign != bSign ) { \ 7354 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7355 /* zero case */ \ 7356 return float_relation_equal; \ 7357 } else { \ 7358 return 1 - (2 * aSign); \ 7359 } \ 7360 } else { \ 7361 if (av == bv) { \ 7362 return float_relation_equal; \ 7363 } else { \ 7364 return 1 - 2 * (aSign ^ ( av < bv )); \ 7365 } \ 7366 } \ 7367 } \ 7368 \ 7369 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7370 { \ 7371 return float ## s ## _compare_internal(a, b, 0, status); \ 7372 } \ 7373 \ 7374 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7375 float_status *status) \ 7376 { \ 7377 return float ## s ## _compare_internal(a, b, 1, status); \ 7378 } 7379 7380 COMPARE(32, 0xff) 7381 COMPARE(64, 0x7ff) 7382 7383 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7384 int is_quiet, float_status *status) 7385 { 7386 flag aSign, bSign; 7387 7388 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7389 ( extractFloatx80Frac( a )<<1 ) ) || 7390 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7391 ( extractFloatx80Frac( b )<<1 ) )) { 7392 if 
(!is_quiet || 7393 floatx80_is_signaling_nan( a ) || 7394 floatx80_is_signaling_nan( b ) ) { 7395 float_raise(float_flag_invalid, status); 7396 } 7397 return float_relation_unordered; 7398 } 7399 aSign = extractFloatx80Sign( a ); 7400 bSign = extractFloatx80Sign( b ); 7401 if ( aSign != bSign ) { 7402 7403 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7404 ( ( a.low | b.low ) == 0 ) ) { 7405 /* zero case */ 7406 return float_relation_equal; 7407 } else { 7408 return 1 - (2 * aSign); 7409 } 7410 } else { 7411 if (a.low == b.low && a.high == b.high) { 7412 return float_relation_equal; 7413 } else { 7414 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7415 } 7416 } 7417 } 7418 7419 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7420 { 7421 return floatx80_compare_internal(a, b, 0, status); 7422 } 7423 7424 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7425 { 7426 return floatx80_compare_internal(a, b, 1, status); 7427 } 7428 7429 static inline int float128_compare_internal(float128 a, float128 b, 7430 int is_quiet, float_status *status) 7431 { 7432 flag aSign, bSign; 7433 7434 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7435 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7436 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7437 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7438 if (!is_quiet || 7439 float128_is_signaling_nan( a ) || 7440 float128_is_signaling_nan( b ) ) { 7441 float_raise(float_flag_invalid, status); 7442 } 7443 return float_relation_unordered; 7444 } 7445 aSign = extractFloat128Sign( a ); 7446 bSign = extractFloat128Sign( b ); 7447 if ( aSign != bSign ) { 7448 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7449 /* zero case */ 7450 return float_relation_equal; 7451 } else { 7452 return 1 - (2 * aSign); 7453 } 7454 } else { 7455 if (a.low == b.low && a.high == b.high) { 7456 return float_relation_equal; 7457 } else { 7458 
return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7459 } 7460 } 7461 } 7462 7463 int float128_compare(float128 a, float128 b, float_status *status) 7464 { 7465 return float128_compare_internal(a, b, 0, status); 7466 } 7467 7468 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7469 { 7470 return float128_compare_internal(a, b, 1, status); 7471 } 7472 7473 /* min() and max() functions. These can't be implemented as 7474 * 'compare and pick one input' because that would mishandle 7475 * NaNs and +0 vs -0. 7476 * 7477 * minnum() and maxnum() functions. These are similar to the min() 7478 * and max() functions but if one of the arguments is a QNaN and 7479 * the other is numerical then the numerical argument is returned. 7480 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7481 * and maxNum() operations. min() and max() are the typical min/max 7482 * semantics provided by many CPUs which predate that specification. 7483 * 7484 * minnummag() and maxnummag() functions correspond to minNumMag() 7485 * and minNumMag() from the IEEE-754 2008. 
 */
/*
 * MINMAX(s) defines the shared worker float<s>_minmax() and six public
 * wrappers around it.  Worker arguments:
 *   ismin  - non-zero selects the minimum, zero the maximum
 *   isieee - non-zero: when exactly one operand is a quiet NaN and the
 *            other is not a NaN, the non-NaN operand is returned
 *   ismag  - non-zero: compare absolute values first; the signed
 *            comparison is used only when the magnitudes are equal
 * Any NaN case not covered by `isieee' goes through
 * propagateFloat<s>NaN().  Operands of different sign are ordered by sign
 * alone, so a negative operand is treated as the smaller one.
 *
 * Wrappers, as (ismin, isieee, ismag):
 *   _min (1,0,0)  _max (0,0,0)  _minnum (1,1,0)  _maxnum (0,1,0)
 *   _minnummag (1,1,1)  _maxnummag (0,1,1)
 */
#define MINMAX(s)                                                       \
static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
                                               int ismin, int isieee,   \
                                               int ismag,               \
                                               float_status *status)    \
{                                                                       \
    flag aSign, bSign;                                                  \
    uint ## s ## _t av, bv, aav, abv;                                   \
    a = float ## s ## _squash_input_denormal(a, status);                \
    b = float ## s ## _squash_input_denormal(b, status);                \
    if (float ## s ## _is_any_nan(a) ||                                 \
        float ## s ## _is_any_nan(b)) {                                 \
        if (isieee) {                                                   \
            if (float ## s ## _is_quiet_nan(a) &&                       \
                !float ## s ##_is_any_nan(b)) {                         \
                return b;                                               \
            } else if (float ## s ## _is_quiet_nan(b) &&                \
                       !float ## s ## _is_any_nan(a)) {                 \
                return a;                                               \
            }                                                           \
        }                                                               \
        return propagateFloat ## s ## NaN(a, b, status);                \
    }                                                                   \
    aSign = extractFloat ## s ## Sign(a);                               \
    bSign = extractFloat ## s ## Sign(b);                               \
    av = float ## s ## _val(a);                                         \
    bv = float ## s ## _val(b);                                         \
    if (ismag) {                                                        \
        aav = float ## s ## _abs(av);                                   \
        abv = float ## s ## _abs(bv);                                   \
        if (aav != abv) {                                               \
            if (ismin) {                                                \
                return (aav < abv) ? a : b;                             \
            } else {                                                    \
                return (aav < abv) ? b : a;                             \
            }                                                           \
        }                                                               \
    }                                                                   \
    if (aSign != bSign) {                                               \
        if (ismin) {                                                    \
            return aSign ? a : b;                                       \
        } else {                                                        \
            return aSign ? b : a;                                       \
        }                                                               \
    } else {                                                            \
        if (ismin) {                                                    \
            return (aSign ^ (av < bv)) ? a : b;                         \
        } else {                                                        \
            return (aSign ^ (av < bv)) ? b : a;                         \
        }                                                               \
    }                                                                   \
}                                                                       \
                                                                        \
float ## s float ## s ## _min(float ## s a, float ## s b,               \
                              float_status *status)                     \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _max(float ## s a, float ## s b,               \
                              float_status *status)                     \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
                                 float_status *status)                  \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
                                 float_status *status)                  \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
                                    float_status *status)               \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
                                    float_status *status)               \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
}

MINMAX(32)
MINMAX(64)


/* Multiply A by 2 raised to the power N.
*/ 7581 float32 float32_scalbn(float32 a, int n, float_status *status) 7582 { 7583 flag aSign; 7584 int16_t aExp; 7585 uint32_t aSig; 7586 7587 a = float32_squash_input_denormal(a, status); 7588 aSig = extractFloat32Frac( a ); 7589 aExp = extractFloat32Exp( a ); 7590 aSign = extractFloat32Sign( a ); 7591 7592 if ( aExp == 0xFF ) { 7593 if ( aSig ) { 7594 return propagateFloat32NaN(a, a, status); 7595 } 7596 return a; 7597 } 7598 if (aExp != 0) { 7599 aSig |= 0x00800000; 7600 } else if (aSig == 0) { 7601 return a; 7602 } else { 7603 aExp++; 7604 } 7605 7606 if (n > 0x200) { 7607 n = 0x200; 7608 } else if (n < -0x200) { 7609 n = -0x200; 7610 } 7611 7612 aExp += n - 1; 7613 aSig <<= 7; 7614 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7615 } 7616 7617 float64 float64_scalbn(float64 a, int n, float_status *status) 7618 { 7619 flag aSign; 7620 int16_t aExp; 7621 uint64_t aSig; 7622 7623 a = float64_squash_input_denormal(a, status); 7624 aSig = extractFloat64Frac( a ); 7625 aExp = extractFloat64Exp( a ); 7626 aSign = extractFloat64Sign( a ); 7627 7628 if ( aExp == 0x7FF ) { 7629 if ( aSig ) { 7630 return propagateFloat64NaN(a, a, status); 7631 } 7632 return a; 7633 } 7634 if (aExp != 0) { 7635 aSig |= LIT64( 0x0010000000000000 ); 7636 } else if (aSig == 0) { 7637 return a; 7638 } else { 7639 aExp++; 7640 } 7641 7642 if (n > 0x1000) { 7643 n = 0x1000; 7644 } else if (n < -0x1000) { 7645 n = -0x1000; 7646 } 7647 7648 aExp += n - 1; 7649 aSig <<= 10; 7650 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7651 } 7652 7653 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7654 { 7655 flag aSign; 7656 int32_t aExp; 7657 uint64_t aSig; 7658 7659 aSig = extractFloatx80Frac( a ); 7660 aExp = extractFloatx80Exp( a ); 7661 aSign = extractFloatx80Sign( a ); 7662 7663 if ( aExp == 0x7FFF ) { 7664 if ( aSig<<1 ) { 7665 return propagateFloatx80NaN(a, a, status); 7666 } 7667 return a; 7668 } 7669 7670 if (aExp == 0) { 7671 if (aSig == 0) 
{ 7672 return a; 7673 } 7674 aExp++; 7675 } 7676 7677 if (n > 0x10000) { 7678 n = 0x10000; 7679 } else if (n < -0x10000) { 7680 n = -0x10000; 7681 } 7682 7683 aExp += n; 7684 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7685 aSign, aExp, aSig, 0, status); 7686 } 7687 7688 float128 float128_scalbn(float128 a, int n, float_status *status) 7689 { 7690 flag aSign; 7691 int32_t aExp; 7692 uint64_t aSig0, aSig1; 7693 7694 aSig1 = extractFloat128Frac1( a ); 7695 aSig0 = extractFloat128Frac0( a ); 7696 aExp = extractFloat128Exp( a ); 7697 aSign = extractFloat128Sign( a ); 7698 if ( aExp == 0x7FFF ) { 7699 if ( aSig0 | aSig1 ) { 7700 return propagateFloat128NaN(a, a, status); 7701 } 7702 return a; 7703 } 7704 if (aExp != 0) { 7705 aSig0 |= LIT64( 0x0001000000000000 ); 7706 } else if (aSig0 == 0 && aSig1 == 0) { 7707 return a; 7708 } else { 7709 aExp++; 7710 } 7711 7712 if (n > 0x10000) { 7713 n = 0x10000; 7714 } else if (n < -0x10000) { 7715 n = -0x10000; 7716 } 7717 7718 aExp += n - 1; 7719 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7720 , status); 7721 7722 } 7723