1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Returns the fraction bits of the half-precision floating-point value `a'. 100 *----------------------------------------------------------------------------*/ 101 102 static inline uint32_t extractFloat16Frac(float16 a) 103 { 104 return float16_val(a) & 0x3ff; 105 } 106 107 /*---------------------------------------------------------------------------- 108 | Returns the exponent bits of the half-precision floating-point value `a'. 109 *----------------------------------------------------------------------------*/ 110 111 static inline int extractFloat16Exp(float16 a) 112 { 113 return (float16_val(a) >> 10) & 0x1f; 114 } 115 116 /*---------------------------------------------------------------------------- 117 | Returns the fraction bits of the single-precision floating-point value `a'. 118 *----------------------------------------------------------------------------*/ 119 120 static inline uint32_t extractFloat32Frac(float32 a) 121 { 122 return float32_val(a) & 0x007FFFFF; 123 } 124 125 /*---------------------------------------------------------------------------- 126 | Returns the exponent bits of the single-precision floating-point value `a'. 127 *----------------------------------------------------------------------------*/ 128 129 static inline int extractFloat32Exp(float32 a) 130 { 131 return (float32_val(a) >> 23) & 0xFF; 132 } 133 134 /*---------------------------------------------------------------------------- 135 | Returns the sign bit of the single-precision floating-point value `a'. 136 *----------------------------------------------------------------------------*/ 137 138 static inline flag extractFloat32Sign(float32 a) 139 { 140 return float32_val(a) >> 31; 141 } 142 143 /*---------------------------------------------------------------------------- 144 | Returns the fraction bits of the double-precision floating-point value `a'. 145 *----------------------------------------------------------------------------*/ 146 147 static inline uint64_t extractFloat64Frac(float64 a) 148 { 149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 150 } 151 152 /*---------------------------------------------------------------------------- 153 | Returns the exponent bits of the double-precision floating-point value `a'. 154 *----------------------------------------------------------------------------*/ 155 156 static inline int extractFloat64Exp(float64 a) 157 { 158 return (float64_val(a) >> 52) & 0x7FF; 159 } 160 161 /*---------------------------------------------------------------------------- 162 | Returns the sign bit of the double-precision floating-point value `a'. 163 *----------------------------------------------------------------------------*/ 164 165 static inline flag extractFloat64Sign(float64 a) 166 { 167 return float64_val(a) >> 63; 168 } 169 170 /* 171 * Classify a floating point number. Everything above float_class_qnan 172 * is a NaN so cls >= float_class_qnan is any NaN. 173 */ 174 175 typedef enum __attribute__ ((__packed__)) { 176 float_class_unclassified, 177 float_class_zero, 178 float_class_normal, 179 float_class_inf, 180 float_class_qnan, /* all NaNs from here */ 181 float_class_snan, 182 } FloatClass; 183 184 /* Simple helpers for checking if, or what kind of, NaN we have */ 185 static inline __attribute__((unused)) bool is_nan(FloatClass c) 186 { 187 return unlikely(c >= float_class_qnan); 188 } 189 190 static inline __attribute__((unused)) bool is_snan(FloatClass c) 191 { 192 return c == float_class_snan; 193 } 194 195 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 196 { 197 return c == float_class_qnan; 198 } 199 200 /* 201 * Structure holding all of the decomposed parts of a float. The 202 * exponent is unbiased and the fraction is normalized. All 203 * calculations are done with a 64 bit fraction and then rounded as 204 * appropriate for the final format. 205 * 206 * Thanks to the packed FloatClass a decent compiler should be able to 207 * fit the whole structure into registers and avoid using the stack 208 * for parameter passing. 209 */ 210 211 typedef struct { 212 uint64_t frac; 213 int32_t exp; 214 FloatClass cls; 215 bool sign; 216 } FloatParts; 217 218 #define DECOMPOSED_BINARY_POINT (64 - 2) 219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 221 222 /* Structure holding all of the relevant parameters for a format. 223 * exp_size: the size of the exponent field 224 * exp_bias: the offset applied to the exponent field 225 * exp_max: the maximum normalised exponent 226 * frac_size: the size of the fraction field 227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 228 * The following are computed based the size of fraction 229 * frac_lsb: least significant bit of fraction 230 * frac_lsbm1: the bit below the least significant bit (for rounding) 231 * round_mask/roundeven_mask: masks used for rounding 232 * The following optional modifiers are available: 233 * arm_althp: handle ARM Alternative Half Precision 234 */ 235 typedef struct { 236 int exp_size; 237 int exp_bias; 238 int exp_max; 239 int frac_size; 240 int frac_shift; 241 uint64_t frac_lsb; 242 uint64_t frac_lsbm1; 243 uint64_t round_mask; 244 uint64_t roundeven_mask; 245 bool arm_althp; 246 } FloatFmt; 247 248 /* Expand fields based on the size of exponent and fraction */ 249 #define FLOAT_PARAMS(E, F) \ 250 .exp_size = E, \ 251 .exp_bias = ((1 << E) - 1) >> 1, \ 252 .exp_max = (1 << E) - 1, \ 253 .frac_size = F, \ 254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 259 260 static const FloatFmt float16_params = { 261 FLOAT_PARAMS(5, 10) 262 }; 263 264 static const FloatFmt float16_params_ahp = { 265 FLOAT_PARAMS(5, 10), 266 .arm_althp = true 267 }; 268 269 static const FloatFmt float32_params = { 270 FLOAT_PARAMS(8, 23) 271 }; 272 273 static const FloatFmt float64_params = { 274 FLOAT_PARAMS(11, 52) 275 }; 276 277 /* Unpack a float to parts, but do not canonicalize. */ 278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 279 { 280 const int sign_pos = fmt.frac_size + fmt.exp_size; 281 282 return (FloatParts) { 283 .cls = float_class_unclassified, 284 .sign = extract64(raw, sign_pos, 1), 285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 286 .frac = extract64(raw, 0, fmt.frac_size), 287 }; 288 } 289 290 static inline FloatParts float16_unpack_raw(float16 f) 291 { 292 return unpack_raw(float16_params, f); 293 } 294 295 static inline FloatParts float32_unpack_raw(float32 f) 296 { 297 return unpack_raw(float32_params, f); 298 } 299 300 static inline FloatParts float64_unpack_raw(float64 f) 301 { 302 return unpack_raw(float64_params, f); 303 } 304 305 /* Pack a float from parts, but do not canonicalize. */ 306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 307 { 308 const int sign_pos = fmt.frac_size + fmt.exp_size; 309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 310 return deposit64(ret, sign_pos, 1, p.sign); 311 } 312 313 static inline float16 float16_pack_raw(FloatParts p) 314 { 315 return make_float16(pack_raw(float16_params, p)); 316 } 317 318 static inline float32 float32_pack_raw(FloatParts p) 319 { 320 return make_float32(pack_raw(float32_params, p)); 321 } 322 323 static inline float64 float64_pack_raw(FloatParts p) 324 { 325 return make_float64(pack_raw(float64_params, p)); 326 } 327 328 /*---------------------------------------------------------------------------- 329 | Functions and definitions to determine: (1) whether tininess for underflow 330 | is detected before or after rounding by default, (2) what (if anything) 331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 333 | are propagated from function inputs to output. These details are target- 334 | specific. 335 *----------------------------------------------------------------------------*/ 336 #include "softfloat-specialize.h" 337 338 /* Canonicalize EXP and FRAC, setting CLS. */ 339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 340 float_status *status) 341 { 342 if (part.exp == parm->exp_max && !parm->arm_althp) { 343 if (part.frac == 0) { 344 part.cls = float_class_inf; 345 } else { 346 part.frac <<= parm->frac_shift; 347 part.cls = (parts_is_snan_frac(part.frac, status) 348 ? float_class_snan : float_class_qnan); 349 } 350 } else if (part.exp == 0) { 351 if (likely(part.frac == 0)) { 352 part.cls = float_class_zero; 353 } else if (status->flush_inputs_to_zero) { 354 float_raise(float_flag_input_denormal, status); 355 part.cls = float_class_zero; 356 part.frac = 0; 357 } else { 358 int shift = clz64(part.frac) - 1; 359 part.cls = float_class_normal; 360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 361 part.frac <<= shift; 362 } 363 } else { 364 part.cls = float_class_normal; 365 part.exp -= parm->exp_bias; 366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 367 } 368 return part; 369 } 370 371 /* Round and uncanonicalize a floating-point number by parts. There 372 * are FRAC_SHIFT bits that may require rounding at the bottom of the 373 * fraction; these bits will be removed. The exponent will be biased 374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 375 */ 376 377 static FloatParts round_canonical(FloatParts p, float_status *s, 378 const FloatFmt *parm) 379 { 380 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 381 const uint64_t round_mask = parm->round_mask; 382 const uint64_t roundeven_mask = parm->roundeven_mask; 383 const int exp_max = parm->exp_max; 384 const int frac_shift = parm->frac_shift; 385 uint64_t frac, inc; 386 int exp, flags = 0; 387 bool overflow_norm; 388 389 frac = p.frac; 390 exp = p.exp; 391 392 switch (p.cls) { 393 case float_class_normal: 394 switch (s->float_rounding_mode) { 395 case float_round_nearest_even: 396 overflow_norm = false; 397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 398 break; 399 case float_round_ties_away: 400 overflow_norm = false; 401 inc = frac_lsbm1; 402 break; 403 case float_round_to_zero: 404 overflow_norm = true; 405 inc = 0; 406 break; 407 case float_round_up: 408 inc = p.sign ? 0 : round_mask; 409 overflow_norm = p.sign; 410 break; 411 case float_round_down: 412 inc = p.sign ? round_mask : 0; 413 overflow_norm = !p.sign; 414 break; 415 default: 416 g_assert_not_reached(); 417 } 418 419 exp += parm->exp_bias; 420 if (likely(exp > 0)) { 421 if (frac & round_mask) { 422 flags |= float_flag_inexact; 423 frac += inc; 424 if (frac & DECOMPOSED_OVERFLOW_BIT) { 425 frac >>= 1; 426 exp++; 427 } 428 } 429 frac >>= frac_shift; 430 431 if (parm->arm_althp) { 432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 433 if (unlikely(exp > exp_max)) { 434 /* Overflow. Return the maximum normal. */ 435 flags = float_flag_invalid; 436 exp = exp_max; 437 frac = -1; 438 } 439 } else if (unlikely(exp >= exp_max)) { 440 flags |= float_flag_overflow | float_flag_inexact; 441 if (overflow_norm) { 442 exp = exp_max - 1; 443 frac = -1; 444 } else { 445 p.cls = float_class_inf; 446 goto do_inf; 447 } 448 } 449 } else if (s->flush_to_zero) { 450 flags |= float_flag_output_denormal; 451 p.cls = float_class_zero; 452 goto do_zero; 453 } else { 454 bool is_tiny = (s->float_detect_tininess 455 == float_tininess_before_rounding) 456 || (exp < 0) 457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 458 459 shift64RightJamming(frac, 1 - exp, &frac); 460 if (frac & round_mask) { 461 /* Need to recompute round-to-even. */ 462 if (s->float_rounding_mode == float_round_nearest_even) { 463 inc = ((frac & roundeven_mask) != frac_lsbm1 464 ? frac_lsbm1 : 0); 465 } 466 flags |= float_flag_inexact; 467 frac += inc; 468 } 469 470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 471 frac >>= frac_shift; 472 473 if (is_tiny && (flags & float_flag_inexact)) { 474 flags |= float_flag_underflow; 475 } 476 if (exp == 0 && frac == 0) { 477 p.cls = float_class_zero; 478 } 479 } 480 break; 481 482 case float_class_zero: 483 do_zero: 484 exp = 0; 485 frac = 0; 486 break; 487 488 case float_class_inf: 489 do_inf: 490 assert(!parm->arm_althp); 491 exp = exp_max; 492 frac = 0; 493 break; 494 495 case float_class_qnan: 496 case float_class_snan: 497 assert(!parm->arm_althp); 498 exp = exp_max; 499 frac >>= parm->frac_shift; 500 break; 501 502 default: 503 g_assert_not_reached(); 504 } 505 506 float_raise(flags, s); 507 p.exp = exp; 508 p.frac = frac; 509 return p; 510 } 511 512 /* Explicit FloatFmt version */ 513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 514 const FloatFmt *params) 515 { 516 return canonicalize(float16_unpack_raw(f), params, s); 517 } 518 519 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 520 { 521 return float16a_unpack_canonical(f, s, &float16_params); 522 } 523 524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 525 const FloatFmt *params) 526 { 527 return float16_pack_raw(round_canonical(p, s, params)); 528 } 529 530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 531 { 532 return float16a_round_pack_canonical(p, s, &float16_params); 533 } 534 535 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 536 { 537 return canonicalize(float32_unpack_raw(f), &float32_params, s); 538 } 539 540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 541 { 542 return float32_pack_raw(round_canonical(p, s, &float32_params)); 543 } 544 545 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 546 { 547 return canonicalize(float64_unpack_raw(f), &float64_params, s); 548 } 549 550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 551 { 552 return float64_pack_raw(round_canonical(p, s, &float64_params)); 553 } 554 555 static FloatParts return_nan(FloatParts a, float_status *s) 556 { 557 switch (a.cls) { 558 case float_class_snan: 559 s->float_exception_flags |= float_flag_invalid; 560 a = parts_silence_nan(a, s); 561 /* fall through */ 562 case float_class_qnan: 563 if (s->default_nan_mode) { 564 return parts_default_nan(s); 565 } 566 break; 567 568 default: 569 g_assert_not_reached(); 570 } 571 return a; 572 } 573 574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 575 { 576 if (is_snan(a.cls) || is_snan(b.cls)) { 577 s->float_exception_flags |= float_flag_invalid; 578 } 579 580 if (s->default_nan_mode) { 581 return parts_default_nan(s); 582 } else { 583 if (pickNaN(a.cls, b.cls, 584 a.frac > b.frac || 585 (a.frac == b.frac && a.sign < b.sign))) { 586 a = b; 587 } 588 if (is_snan(a.cls)) { 589 return parts_silence_nan(a, s); 590 } 591 } 592 return a; 593 } 594 595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 596 bool inf_zero, float_status *s) 597 { 598 int which; 599 600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 601 s->float_exception_flags |= float_flag_invalid; 602 } 603 604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 605 606 if (s->default_nan_mode) { 607 /* Note that this check is after pickNaNMulAdd so that function 608 * has an opportunity to set the Invalid flag. 609 */ 610 which = 3; 611 } 612 613 switch (which) { 614 case 0: 615 break; 616 case 1: 617 a = b; 618 break; 619 case 2: 620 a = c; 621 break; 622 case 3: 623 return parts_default_nan(s); 624 default: 625 g_assert_not_reached(); 626 } 627 628 if (is_snan(a.cls)) { 629 return parts_silence_nan(a, s); 630 } 631 return a; 632 } 633 634 /* 635 * Returns the result of adding or subtracting the values of the 636 * floating-point values `a' and `b'. The operation is performed 637 * according to the IEC/IEEE Standard for Binary Floating-Point 638 * Arithmetic. 639 */ 640 641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 642 float_status *s) 643 { 644 bool a_sign = a.sign; 645 bool b_sign = b.sign ^ subtract; 646 647 if (a_sign != b_sign) { 648 /* Subtraction */ 649 650 if (a.cls == float_class_normal && b.cls == float_class_normal) { 651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 653 a.frac = a.frac - b.frac; 654 } else { 655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 656 a.frac = b.frac - a.frac; 657 a.exp = b.exp; 658 a_sign ^= 1; 659 } 660 661 if (a.frac == 0) { 662 a.cls = float_class_zero; 663 a.sign = s->float_rounding_mode == float_round_down; 664 } else { 665 int shift = clz64(a.frac) - 1; 666 a.frac = a.frac << shift; 667 a.exp = a.exp - shift; 668 a.sign = a_sign; 669 } 670 return a; 671 } 672 if (is_nan(a.cls) || is_nan(b.cls)) { 673 return pick_nan(a, b, s); 674 } 675 if (a.cls == float_class_inf) { 676 if (b.cls == float_class_inf) { 677 float_raise(float_flag_invalid, s); 678 return parts_default_nan(s); 679 } 680 return a; 681 } 682 if (a.cls == float_class_zero && b.cls == float_class_zero) { 683 a.sign = s->float_rounding_mode == float_round_down; 684 return a; 685 } 686 if (a.cls == float_class_zero || b.cls == float_class_inf) { 687 b.sign = a_sign ^ 1; 688 return b; 689 } 690 if (b.cls == float_class_zero) { 691 return a; 692 } 693 } else { 694 /* Addition */ 695 if (a.cls == float_class_normal && b.cls == float_class_normal) { 696 if (a.exp > b.exp) { 697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 698 } else if (a.exp < b.exp) { 699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 700 a.exp = b.exp; 701 } 702 a.frac += b.frac; 703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 704 shift64RightJamming(a.frac, 1, &a.frac); 705 a.exp += 1; 706 } 707 return a; 708 } 709 if (is_nan(a.cls) || is_nan(b.cls)) { 710 return pick_nan(a, b, s); 711 } 712 if (a.cls == float_class_inf || b.cls == float_class_zero) { 713 return a; 714 } 715 if (b.cls == float_class_inf || a.cls == float_class_zero) { 716 b.sign = b_sign; 717 return b; 718 } 719 } 720 g_assert_not_reached(); 721 } 722 723 /* 724 * Returns the result of adding or subtracting the floating-point 725 * values `a' and `b'. The operation is performed according to the 726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 727 */ 728 729 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 730 { 731 FloatParts pa = float16_unpack_canonical(a, status); 732 FloatParts pb = float16_unpack_canonical(b, status); 733 FloatParts pr = addsub_floats(pa, pb, false, status); 734 735 return float16_round_pack_canonical(pr, status); 736 } 737 738 float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status) 739 { 740 FloatParts pa = float32_unpack_canonical(a, status); 741 FloatParts pb = float32_unpack_canonical(b, status); 742 FloatParts pr = addsub_floats(pa, pb, false, status); 743 744 return float32_round_pack_canonical(pr, status); 745 } 746 747 float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status) 748 { 749 FloatParts pa = float64_unpack_canonical(a, status); 750 FloatParts pb = float64_unpack_canonical(b, status); 751 FloatParts pr = addsub_floats(pa, pb, false, status); 752 753 return float64_round_pack_canonical(pr, status); 754 } 755 756 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 757 { 758 FloatParts pa = float16_unpack_canonical(a, status); 759 FloatParts pb = float16_unpack_canonical(b, status); 760 FloatParts pr = addsub_floats(pa, pb, true, status); 761 762 return float16_round_pack_canonical(pr, status); 763 } 764 765 float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status) 766 { 767 FloatParts pa = float32_unpack_canonical(a, status); 768 FloatParts pb = float32_unpack_canonical(b, status); 769 FloatParts pr = addsub_floats(pa, pb, true, status); 770 771 return float32_round_pack_canonical(pr, status); 772 } 773 774 float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status) 775 { 776 FloatParts pa = float64_unpack_canonical(a, status); 777 FloatParts pb = float64_unpack_canonical(b, status); 778 FloatParts pr = addsub_floats(pa, pb, true, status); 779 780 return float64_round_pack_canonical(pr, status); 781 } 782 783 /* 784 * Returns the result of multiplying the floating-point values `a' and 785 * `b'. The operation is performed according to the IEC/IEEE Standard 786 * for Binary Floating-Point Arithmetic. 787 */ 788 789 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 790 { 791 bool sign = a.sign ^ b.sign; 792 793 if (a.cls == float_class_normal && b.cls == float_class_normal) { 794 uint64_t hi, lo; 795 int exp = a.exp + b.exp; 796 797 mul64To128(a.frac, b.frac, &hi, &lo); 798 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 799 if (lo & DECOMPOSED_OVERFLOW_BIT) { 800 shift64RightJamming(lo, 1, &lo); 801 exp += 1; 802 } 803 804 /* Re-use a */ 805 a.exp = exp; 806 a.sign = sign; 807 a.frac = lo; 808 return a; 809 } 810 /* handle all the NaN cases */ 811 if (is_nan(a.cls) || is_nan(b.cls)) { 812 return pick_nan(a, b, s); 813 } 814 /* Inf * Zero == NaN */ 815 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 816 (a.cls == float_class_zero && b.cls == float_class_inf)) { 817 s->float_exception_flags |= float_flag_invalid; 818 return parts_default_nan(s); 819 } 820 /* Multiply by 0 or Inf */ 821 if (a.cls == float_class_inf || a.cls == float_class_zero) { 822 a.sign = sign; 823 return a; 824 } 825 if (b.cls == float_class_inf || b.cls == float_class_zero) { 826 b.sign = sign; 827 return b; 828 } 829 g_assert_not_reached(); 830 } 831 832 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 833 { 834 FloatParts pa = float16_unpack_canonical(a, status); 835 FloatParts pb = float16_unpack_canonical(b, status); 836 FloatParts pr = mul_floats(pa, pb, status); 837 838 return float16_round_pack_canonical(pr, status); 839 } 840 841 float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status) 842 { 843 FloatParts pa = float32_unpack_canonical(a, status); 844 FloatParts pb = float32_unpack_canonical(b, status); 845 FloatParts pr = mul_floats(pa, pb, status); 846 847 return float32_round_pack_canonical(pr, status); 848 } 849 850 float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status) 851 { 852 FloatParts pa = float64_unpack_canonical(a, status); 853 FloatParts pb = float64_unpack_canonical(b, status); 854 FloatParts pr = mul_floats(pa, pb, status); 855 856 return float64_round_pack_canonical(pr, status); 857 } 858 859 /* 860 * Returns the result of multiplying the floating-point values `a' and 861 * `b' then adding 'c', with no intermediate rounding step after the 862 * multiplication. The operation is performed according to the 863 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 864 * The flags argument allows the caller to select negation of the 865 * addend, the intermediate product, or the final result. (The 866 * difference between this and having the caller do a separate 867 * negation is that negating externally will flip the sign bit on 868 * NaNs.) 869 */ 870 871 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 872 int flags, float_status *s) 873 { 874 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 875 ((1 << float_class_inf) | (1 << float_class_zero)); 876 bool p_sign; 877 bool sign_flip = flags & float_muladd_negate_result; 878 FloatClass p_class; 879 uint64_t hi, lo; 880 int p_exp; 881 882 /* It is implementation-defined whether the cases of (0,inf,qnan) 883 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 884 * they return if they do), so we have to hand this information 885 * off to the target-specific pick-a-NaN routine. 886 */ 887 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 888 return pick_nan_muladd(a, b, c, inf_zero, s); 889 } 890 891 if (inf_zero) { 892 s->float_exception_flags |= float_flag_invalid; 893 return parts_default_nan(s); 894 } 895 896 if (flags & float_muladd_negate_c) { 897 c.sign ^= 1; 898 } 899 900 p_sign = a.sign ^ b.sign; 901 902 if (flags & float_muladd_negate_product) { 903 p_sign ^= 1; 904 } 905 906 if (a.cls == float_class_inf || b.cls == float_class_inf) { 907 p_class = float_class_inf; 908 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 909 p_class = float_class_zero; 910 } else { 911 p_class = float_class_normal; 912 } 913 914 if (c.cls == float_class_inf) { 915 if (p_class == float_class_inf && p_sign != c.sign) { 916 s->float_exception_flags |= float_flag_invalid; 917 return parts_default_nan(s); 918 } else { 919 a.cls = float_class_inf; 920 a.sign = c.sign ^ sign_flip; 921 return a; 922 } 923 } 924 925 if (p_class == float_class_inf) { 926 a.cls = float_class_inf; 927 a.sign = p_sign ^ sign_flip; 928 return a; 929 } 930 931 if (p_class == float_class_zero) { 932 if (c.cls == float_class_zero) { 933 if (p_sign != c.sign) { 934 p_sign = s->float_rounding_mode == float_round_down; 935 } 936 c.sign = p_sign; 937 } else if (flags & float_muladd_halve_result) { 938 c.exp -= 1; 939 } 940 c.sign ^= sign_flip; 941 return c; 942 } 943 944 /* a & b should be normals now... */ 945 assert(a.cls == float_class_normal && 946 b.cls == float_class_normal); 947 948 p_exp = a.exp + b.exp; 949 950 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 951 * result. 952 */ 953 mul64To128(a.frac, b.frac, &hi, &lo); 954 /* binary point now at bit 124 */ 955 956 /* check for overflow */ 957 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 958 shift128RightJamming(hi, lo, 1, &hi, &lo); 959 p_exp += 1; 960 } 961 962 /* + add/sub */ 963 if (c.cls == float_class_zero) { 964 /* move binary point back to 62 */ 965 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 966 } else { 967 int exp_diff = p_exp - c.exp; 968 if (p_sign == c.sign) { 969 /* Addition */ 970 if (exp_diff <= 0) { 971 shift128RightJamming(hi, lo, 972 DECOMPOSED_BINARY_POINT - exp_diff, 973 &hi, &lo); 974 lo += c.frac; 975 p_exp = c.exp; 976 } else { 977 uint64_t c_hi, c_lo; 978 /* shift c to the same binary point as the product (124) */ 979 c_hi = c.frac >> 2; 980 c_lo = 0; 981 shift128RightJamming(c_hi, c_lo, 982 exp_diff, 983 &c_hi, &c_lo); 984 add128(hi, lo, c_hi, c_lo, &hi, &lo); 985 /* move binary point back to 62 */ 986 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 987 } 988 989 if (lo & DECOMPOSED_OVERFLOW_BIT) { 990 shift64RightJamming(lo, 1, &lo); 991 p_exp += 1; 992 } 993 994 } else { 995 /* Subtraction */ 996 uint64_t c_hi, c_lo; 997 /* make C binary point match product at bit 124 */ 998 c_hi = c.frac >> 2; 999 c_lo = 0; 1000 1001 if (exp_diff <= 0) { 1002 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1003 if (exp_diff == 0 1004 && 1005 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1006 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1007 } else { 1008 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1009 p_sign ^= 1; 1010 p_exp = c.exp; 1011 } 1012 } else { 1013 shift128RightJamming(c_hi, c_lo, 1014 exp_diff, 1015 &c_hi, &c_lo); 1016 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1017 } 1018 1019 if (hi == 0 && lo == 0) { 1020 a.cls = float_class_zero; 1021 a.sign = s->float_rounding_mode == float_round_down; 1022 a.sign ^= sign_flip; 1023 return a; 1024 } else { 1025 int shift; 1026 if (hi != 0) { 1027 shift = clz64(hi); 1028 } else { 1029 shift = clz64(lo) + 64; 1030 } 1031 /* Normalizing to a binary point of 124 is the 1032 correct adjust for the exponent. However since we're 1033 shifting, we might as well put the binary point back 1034 at 62 where we really want it. Therefore shift as 1035 if we're leaving 1 bit at the top of the word, but 1036 adjust the exponent as if we're leaving 3 bits. */ 1037 shift -= 1; 1038 if (shift >= 64) { 1039 lo = lo << (shift - 64); 1040 } else { 1041 hi = (hi << shift) | (lo >> (64 - shift)); 1042 lo = hi | ((lo << shift) != 0); 1043 } 1044 p_exp -= shift - 2; 1045 } 1046 } 1047 } 1048 1049 if (flags & float_muladd_halve_result) { 1050 p_exp -= 1; 1051 } 1052 1053 /* finally prepare our result */ 1054 a.cls = float_class_normal; 1055 a.sign = p_sign ^ sign_flip; 1056 a.exp = p_exp; 1057 a.frac = lo; 1058 1059 return a; 1060 } 1061 1062 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1063 int flags, float_status *status) 1064 { 1065 FloatParts pa = float16_unpack_canonical(a, status); 1066 FloatParts pb = float16_unpack_canonical(b, status); 1067 FloatParts pc = float16_unpack_canonical(c, status); 1068 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1069 1070 return float16_round_pack_canonical(pr, status); 1071 } 1072 1073 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c, 1074 int flags, float_status *status) 1075 { 1076 FloatParts pa = float32_unpack_canonical(a, status); 1077 FloatParts pb = float32_unpack_canonical(b, status); 1078 FloatParts pc = float32_unpack_canonical(c, status); 1079 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1080 1081 return float32_round_pack_canonical(pr, status); 1082 } 1083 1084 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c, 1085 int flags, float_status *status) 1086 { 1087 FloatParts pa = float64_unpack_canonical(a, status); 1088 FloatParts pb = float64_unpack_canonical(b, status); 1089 FloatParts pc = float64_unpack_canonical(c, status); 1090 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1091 1092 return float64_round_pack_canonical(pr, status); 1093 } 1094 1095 /* 1096 * Returns the result of dividing the floating-point value `a' by the 1097 * corresponding value `b'. The operation is performed according to 1098 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1099 */ 1100 1101 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1102 { 1103 bool sign = a.sign ^ b.sign; 1104 1105 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1106 uint64_t n0, n1, q, r; 1107 int exp = a.exp - b.exp; 1108 1109 /* 1110 * We want a 2*N / N-bit division to produce exactly an N-bit 1111 * result, so that we do not lose any precision and so that we 1112 * do not have to renormalize afterward. If A.frac < B.frac, 1113 * then division would produce an (N-1)-bit result; shift A left 1114 * by one to produce the an N-bit result, and decrement the 1115 * exponent to match. 1116 * 1117 * The udiv_qrnnd algorithm that we're using requires normalization, 1118 * i.e. the msb of the denominator must be set. Since we know that 1119 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1120 * by one (more), and the remainder must be shifted right by one. 1121 */ 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1125 } else { 1126 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1127 } 1128 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1129 1130 /* 1131 * Set lsb if there is a remainder, to set inexact. 1132 * As mentioned above, to find the actual value of the remainder we 1133 * would need to shift right, but (1) we are only concerned about 1134 * non-zero-ness, and (2) the remainder will always be even because 1135 * both inputs to the division primitive are even. 1136 */ 1137 a.frac = q | (r != 0); 1138 a.sign = sign; 1139 a.exp = exp; 1140 return a; 1141 } 1142 /* handle all the NaN cases */ 1143 if (is_nan(a.cls) || is_nan(b.cls)) { 1144 return pick_nan(a, b, s); 1145 } 1146 /* 0/0 or Inf/Inf */ 1147 if (a.cls == b.cls 1148 && 1149 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1150 s->float_exception_flags |= float_flag_invalid; 1151 return parts_default_nan(s); 1152 } 1153 /* Inf / x or 0 / x */ 1154 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1155 a.sign = sign; 1156 return a; 1157 } 1158 /* Div 0 => Inf */ 1159 if (b.cls == float_class_zero) { 1160 s->float_exception_flags |= float_flag_divbyzero; 1161 a.cls = float_class_inf; 1162 a.sign = sign; 1163 return a; 1164 } 1165 /* Div by Inf */ 1166 if (b.cls == float_class_inf) { 1167 a.cls = float_class_zero; 1168 a.sign = sign; 1169 return a; 1170 } 1171 g_assert_not_reached(); 1172 } 1173 1174 float16 float16_div(float16 a, float16 b, float_status *status) 1175 { 1176 FloatParts pa = float16_unpack_canonical(a, status); 1177 FloatParts pb = float16_unpack_canonical(b, status); 1178 FloatParts pr = div_floats(pa, pb, status); 1179 1180 return float16_round_pack_canonical(pr, status); 1181 } 1182 1183 float32 float32_div(float32 a, float32 b, float_status *status) 1184 { 1185 FloatParts pa = float32_unpack_canonical(a, status); 1186 FloatParts pb = float32_unpack_canonical(b, status); 1187 FloatParts pr = div_floats(pa, pb, status); 1188 1189 return float32_round_pack_canonical(pr, status); 1190 } 1191 1192 float64 float64_div(float64 a, float64 b, float_status *status) 1193 { 1194 FloatParts pa = float64_unpack_canonical(a, status); 1195 FloatParts pb = float64_unpack_canonical(b, status); 1196 FloatParts pr = div_floats(pa, pb, status); 1197 1198 return float64_round_pack_canonical(pr, status); 1199 } 1200 1201 /* 1202 * Float to Float conversions 1203 * 1204 * Returns the result of converting one float format to another. The 1205 * conversion is performed according to the IEC/IEEE Standard for 1206 * Binary Floating-Point Arithmetic. 1207 * 1208 * The float_to_float helper only needs to take care of raising 1209 * invalid exceptions and handling the conversion on NaNs. 1210 */ 1211 1212 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1213 float_status *s) 1214 { 1215 if (dstf->arm_althp) { 1216 switch (a.cls) { 1217 case float_class_qnan: 1218 case float_class_snan: 1219 /* There is no NaN in the destination format. Raise Invalid 1220 * and return a zero with the sign of the input NaN. 1221 */ 1222 s->float_exception_flags |= float_flag_invalid; 1223 a.cls = float_class_zero; 1224 a.frac = 0; 1225 a.exp = 0; 1226 break; 1227 1228 case float_class_inf: 1229 /* There is no Inf in the destination format. Raise Invalid 1230 * and return the maximum normal with the correct sign. 1231 */ 1232 s->float_exception_flags |= float_flag_invalid; 1233 a.cls = float_class_normal; 1234 a.exp = dstf->exp_max; 1235 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1236 break; 1237 1238 default: 1239 break; 1240 } 1241 } else if (is_nan(a.cls)) { 1242 if (is_snan(a.cls)) { 1243 s->float_exception_flags |= float_flag_invalid; 1244 a = parts_silence_nan(a, s); 1245 } 1246 if (s->default_nan_mode) { 1247 return parts_default_nan(s); 1248 } 1249 } 1250 return a; 1251 } 1252 1253 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1254 { 1255 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1256 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1257 FloatParts pr = float_to_float(p, &float32_params, s); 1258 return float32_round_pack_canonical(pr, s); 1259 } 1260 1261 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1262 { 1263 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1264 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1265 FloatParts pr = float_to_float(p, &float64_params, s); 1266 return float64_round_pack_canonical(pr, s); 1267 } 1268 1269 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1270 { 1271 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1272 FloatParts p = float32_unpack_canonical(a, s); 1273 FloatParts pr = float_to_float(p, fmt16, s); 1274 return float16a_round_pack_canonical(pr, s, fmt16); 1275 } 1276 1277 float64 float32_to_float64(float32 a, float_status *s) 1278 { 1279 FloatParts p = float32_unpack_canonical(a, s); 1280 FloatParts pr = float_to_float(p, &float64_params, s); 1281 return float64_round_pack_canonical(pr, s); 1282 } 1283 1284 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1285 { 1286 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1287 FloatParts p = float64_unpack_canonical(a, s); 1288 FloatParts pr = float_to_float(p, fmt16, s); 1289 return float16a_round_pack_canonical(pr, s, fmt16); 1290 } 1291 1292 float32 float64_to_float32(float64 a, float_status *s) 1293 { 1294 FloatParts p = float64_unpack_canonical(a, s); 1295 FloatParts pr = float_to_float(p, &float32_params, s); 1296 return float32_round_pack_canonical(pr, s); 1297 } 1298 1299 /* 1300 * Rounds the floating-point value `a' to an integer, and returns the 1301 * result as a floating-point value. The operation is performed 1302 * according to the IEC/IEEE Standard for Binary Floating-Point 1303 * Arithmetic. 1304 */ 1305 1306 static FloatParts round_to_int(FloatParts a, int rmode, 1307 int scale, float_status *s) 1308 { 1309 switch (a.cls) { 1310 case float_class_qnan: 1311 case float_class_snan: 1312 return return_nan(a, s); 1313 1314 case float_class_zero: 1315 case float_class_inf: 1316 /* already "integral" */ 1317 break; 1318 1319 case float_class_normal: 1320 scale = MIN(MAX(scale, -0x10000), 0x10000); 1321 a.exp += scale; 1322 1323 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1324 /* already integral */ 1325 break; 1326 } 1327 if (a.exp < 0) { 1328 bool one; 1329 /* all fractional */ 1330 s->float_exception_flags |= float_flag_inexact; 1331 switch (rmode) { 1332 case float_round_nearest_even: 1333 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1334 break; 1335 case float_round_ties_away: 1336 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1337 break; 1338 case float_round_to_zero: 1339 one = false; 1340 break; 1341 case float_round_up: 1342 one = !a.sign; 1343 break; 1344 case float_round_down: 1345 one = a.sign; 1346 break; 1347 default: 1348 g_assert_not_reached(); 1349 } 1350 1351 if (one) { 1352 a.frac = DECOMPOSED_IMPLICIT_BIT; 1353 a.exp = 0; 1354 } else { 1355 a.cls = float_class_zero; 1356 } 1357 } else { 1358 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1359 uint64_t frac_lsbm1 = frac_lsb >> 1; 1360 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1361 uint64_t rnd_mask = rnd_even_mask >> 1; 1362 uint64_t inc; 1363 1364 switch (rmode) { 1365 case float_round_nearest_even: 1366 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1367 break; 1368 case float_round_ties_away: 1369 inc = frac_lsbm1; 1370 break; 1371 case float_round_to_zero: 1372 inc = 0; 1373 break; 1374 case float_round_up: 1375 inc = a.sign ? 0 : rnd_mask; 1376 break; 1377 case float_round_down: 1378 inc = a.sign ? rnd_mask : 0; 1379 break; 1380 default: 1381 g_assert_not_reached(); 1382 } 1383 1384 if (a.frac & rnd_mask) { 1385 s->float_exception_flags |= float_flag_inexact; 1386 a.frac += inc; 1387 a.frac &= ~rnd_mask; 1388 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1389 a.frac >>= 1; 1390 a.exp++; 1391 } 1392 } 1393 } 1394 break; 1395 default: 1396 g_assert_not_reached(); 1397 } 1398 return a; 1399 } 1400 1401 float16 float16_round_to_int(float16 a, float_status *s) 1402 { 1403 FloatParts pa = float16_unpack_canonical(a, s); 1404 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1405 return float16_round_pack_canonical(pr, s); 1406 } 1407 1408 float32 float32_round_to_int(float32 a, float_status *s) 1409 { 1410 FloatParts pa = float32_unpack_canonical(a, s); 1411 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1412 return float32_round_pack_canonical(pr, s); 1413 } 1414 1415 float64 float64_round_to_int(float64 a, float_status *s) 1416 { 1417 FloatParts pa = float64_unpack_canonical(a, s); 1418 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1419 return float64_round_pack_canonical(pr, s); 1420 } 1421 1422 /* 1423 * Returns the result of converting the floating-point value `a' to 1424 * the two's complement integer format. The conversion is performed 1425 * according to the IEC/IEEE Standard for Binary Floating-Point 1426 * Arithmetic---which means in particular that the conversion is 1427 * rounded according to the current rounding mode. If `a' is a NaN, 1428 * the largest positive integer is returned. Otherwise, if the 1429 * conversion overflows, the largest integer with the same sign as `a' 1430 * is returned. 1431 */ 1432 1433 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 1434 int64_t min, int64_t max, 1435 float_status *s) 1436 { 1437 uint64_t r; 1438 int orig_flags = get_float_exception_flags(s); 1439 FloatParts p = round_to_int(in, rmode, scale, s); 1440 1441 switch (p.cls) { 1442 case float_class_snan: 1443 case float_class_qnan: 1444 s->float_exception_flags = orig_flags | float_flag_invalid; 1445 return max; 1446 case float_class_inf: 1447 s->float_exception_flags = orig_flags | float_flag_invalid; 1448 return p.sign ? min : max; 1449 case float_class_zero: 1450 return 0; 1451 case float_class_normal: 1452 if (p.exp < DECOMPOSED_BINARY_POINT) { 1453 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1454 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1455 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1456 } else { 1457 r = UINT64_MAX; 1458 } 1459 if (p.sign) { 1460 if (r <= -(uint64_t) min) { 1461 return -r; 1462 } else { 1463 s->float_exception_flags = orig_flags | float_flag_invalid; 1464 return min; 1465 } 1466 } else { 1467 if (r <= max) { 1468 return r; 1469 } else { 1470 s->float_exception_flags = orig_flags | float_flag_invalid; 1471 return max; 1472 } 1473 } 1474 default: 1475 g_assert_not_reached(); 1476 } 1477 } 1478 1479 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 1480 float_status *s) 1481 { 1482 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1483 rmode, scale, INT16_MIN, INT16_MAX, s); 1484 } 1485 1486 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 1487 float_status *s) 1488 { 1489 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1490 rmode, scale, INT32_MIN, INT32_MAX, s); 1491 } 1492 1493 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 1494 float_status *s) 1495 { 1496 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1497 rmode, scale, INT64_MIN, INT64_MAX, s); 1498 } 1499 1500 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale, 1501 float_status *s) 1502 { 1503 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1504 rmode, scale, INT16_MIN, INT16_MAX, s); 1505 } 1506 1507 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 1508 float_status *s) 1509 { 1510 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1511 rmode, scale, INT32_MIN, INT32_MAX, s); 1512 } 1513 1514 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 1515 float_status *s) 1516 { 1517 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1518 rmode, scale, INT64_MIN, INT64_MAX, s); 1519 } 1520 1521 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 1522 float_status *s) 1523 { 1524 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1525 rmode, scale, INT16_MIN, INT16_MAX, s); 1526 } 1527 1528 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 1529 float_status *s) 1530 { 1531 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1532 rmode, scale, INT32_MIN, INT32_MAX, s); 1533 } 1534 1535 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 1536 float_status *s) 1537 { 1538 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1539 rmode, scale, INT64_MIN, INT64_MAX, s); 1540 } 1541 1542 int16_t float16_to_int16(float16 a, float_status *s) 1543 { 1544 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1545 } 1546 1547 int32_t float16_to_int32(float16 a, float_status *s) 1548 { 1549 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1550 } 1551 1552 int64_t float16_to_int64(float16 a, float_status *s) 1553 { 1554 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1555 } 1556 1557 int16_t float32_to_int16(float32 a, float_status *s) 1558 { 1559 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1560 } 1561 1562 int32_t float32_to_int32(float32 a, float_status *s) 1563 { 1564 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1565 } 1566 1567 int64_t float32_to_int64(float32 a, float_status *s) 1568 { 1569 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1570 } 1571 1572 int16_t float64_to_int16(float64 a, float_status *s) 1573 { 1574 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1575 } 1576 1577 int32_t float64_to_int32(float64 a, float_status *s) 1578 { 1579 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1580 } 1581 1582 int64_t float64_to_int64(float64 a, float_status *s) 1583 { 1584 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1585 } 1586 1587 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 1588 { 1589 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 1590 } 1591 1592 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 1593 { 1594 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 1595 } 1596 1597 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 1598 { 1599 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 1600 } 1601 1602 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 1603 { 1604 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 1605 } 1606 1607 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 1608 { 1609 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 1610 } 1611 1612 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 1613 { 1614 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 1615 } 1616 1617 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 1618 { 1619 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 1620 } 1621 1622 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 1623 { 1624 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 1625 } 1626 1627 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 1628 { 1629 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 1630 } 1631 1632 /* 1633 * Returns the result of converting the floating-point value `a' to 1634 * the unsigned integer format. The conversion is performed according 1635 * to the IEC/IEEE Standard for Binary Floating-Point 1636 * Arithmetic---which means in particular that the conversion is 1637 * rounded according to the current rounding mode. If `a' is a NaN, 1638 * the largest unsigned integer is returned. Otherwise, if the 1639 * conversion overflows, the largest unsigned integer is returned. If 1640 * the 'a' is negative, the result is rounded and zero is returned; 1641 * values that do not round to zero will raise the inexact exception 1642 * flag. 1643 */ 1644 1645 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 1646 uint64_t max, float_status *s) 1647 { 1648 int orig_flags = get_float_exception_flags(s); 1649 FloatParts p = round_to_int(in, rmode, scale, s); 1650 uint64_t r; 1651 1652 switch (p.cls) { 1653 case float_class_snan: 1654 case float_class_qnan: 1655 s->float_exception_flags = orig_flags | float_flag_invalid; 1656 return max; 1657 case float_class_inf: 1658 s->float_exception_flags = orig_flags | float_flag_invalid; 1659 return p.sign ? 0 : max; 1660 case float_class_zero: 1661 return 0; 1662 case float_class_normal: 1663 if (p.sign) { 1664 s->float_exception_flags = orig_flags | float_flag_invalid; 1665 return 0; 1666 } 1667 1668 if (p.exp < DECOMPOSED_BINARY_POINT) { 1669 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1670 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1671 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1672 } else { 1673 s->float_exception_flags = orig_flags | float_flag_invalid; 1674 return max; 1675 } 1676 1677 /* For uint64 this will never trip, but if p.exp is too large 1678 * to shift a decomposed fraction we shall have exited via the 1679 * 3rd leg above. 1680 */ 1681 if (r > max) { 1682 s->float_exception_flags = orig_flags | float_flag_invalid; 1683 return max; 1684 } 1685 return r; 1686 default: 1687 g_assert_not_reached(); 1688 } 1689 } 1690 1691 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 1692 float_status *s) 1693 { 1694 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 1695 rmode, scale, UINT16_MAX, s); 1696 } 1697 1698 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 1699 float_status *s) 1700 { 1701 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 1702 rmode, scale, UINT32_MAX, s); 1703 } 1704 1705 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 1706 float_status *s) 1707 { 1708 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 1709 rmode, scale, UINT64_MAX, s); 1710 } 1711 1712 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 1713 float_status *s) 1714 { 1715 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 1716 rmode, scale, UINT16_MAX, s); 1717 } 1718 1719 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 1720 float_status *s) 1721 { 1722 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 1723 rmode, scale, UINT32_MAX, s); 1724 } 1725 1726 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 1727 float_status *s) 1728 { 1729 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 1730 rmode, scale, UINT64_MAX, s); 1731 } 1732 1733 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 1734 float_status *s) 1735 { 1736 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 1737 rmode, scale, UINT16_MAX, s); 1738 } 1739 1740 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 1741 float_status *s) 1742 { 1743 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 1744 rmode, scale, UINT32_MAX, s); 1745 } 1746 1747 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 1748 float_status *s) 1749 { 1750 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 1751 rmode, scale, UINT64_MAX, s); 1752 } 1753 1754 uint16_t float16_to_uint16(float16 a, float_status *s) 1755 { 1756 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 1757 } 1758 1759 uint32_t float16_to_uint32(float16 a, float_status *s) 1760 { 1761 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 1762 } 1763 1764 uint64_t float16_to_uint64(float16 a, float_status *s) 1765 { 1766 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 1767 } 1768 1769 uint16_t float32_to_uint16(float32 a, float_status *s) 1770 { 1771 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 1772 } 1773 1774 uint32_t float32_to_uint32(float32 a, float_status *s) 1775 { 1776 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 1777 } 1778 1779 uint64_t float32_to_uint64(float32 a, float_status *s) 1780 { 1781 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 1782 } 1783 1784 uint16_t float64_to_uint16(float64 a, float_status *s) 1785 { 1786 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 1787 } 1788 1789 uint32_t float64_to_uint32(float64 a, float_status *s) 1790 { 1791 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 1792 } 1793 1794 uint64_t float64_to_uint64(float64 a, float_status *s) 1795 { 1796 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 1797 } 1798 1799 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 1800 { 1801 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 1802 } 1803 1804 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 1805 { 1806 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 1807 } 1808 1809 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 1810 { 1811 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 1812 } 1813 1814 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 1815 { 1816 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 1817 } 1818 1819 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 1820 { 1821 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 1822 } 1823 1824 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 1825 { 1826 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 1827 } 1828 1829 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 1830 { 1831 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 1832 } 1833 1834 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 1835 { 1836 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 1837 } 1838 1839 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 1840 { 1841 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 1842 } 1843 1844 /* 1845 * Integer to float conversions 1846 * 1847 * Returns the result of converting the two's complement integer `a' 1848 * to the floating-point format. The conversion is performed according 1849 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1850 */ 1851 1852 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 1853 { 1854 FloatParts r = { .sign = false }; 1855 1856 if (a == 0) { 1857 r.cls = float_class_zero; 1858 } else { 1859 uint64_t f = a; 1860 int shift; 1861 1862 r.cls = float_class_normal; 1863 if (a < 0) { 1864 f = -f; 1865 r.sign = true; 1866 } 1867 shift = clz64(f) - 1; 1868 scale = MIN(MAX(scale, -0x10000), 0x10000); 1869 1870 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 1871 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift); 1872 } 1873 1874 return r; 1875 } 1876 1877 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 1878 { 1879 FloatParts pa = int_to_float(a, scale, status); 1880 return float16_round_pack_canonical(pa, status); 1881 } 1882 1883 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 1884 { 1885 return int64_to_float16_scalbn(a, scale, status); 1886 } 1887 1888 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 1889 { 1890 return int64_to_float16_scalbn(a, scale, status); 1891 } 1892 1893 float16 int64_to_float16(int64_t a, float_status *status) 1894 { 1895 return int64_to_float16_scalbn(a, 0, status); 1896 } 1897 1898 float16 int32_to_float16(int32_t a, float_status *status) 1899 { 1900 return int64_to_float16_scalbn(a, 0, status); 1901 } 1902 1903 float16 int16_to_float16(int16_t a, float_status *status) 1904 { 1905 return int64_to_float16_scalbn(a, 0, status); 1906 } 1907 1908 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 1909 { 1910 FloatParts pa = int_to_float(a, scale, status); 1911 return float32_round_pack_canonical(pa, status); 1912 } 1913 1914 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 1915 { 1916 return int64_to_float32_scalbn(a, scale, status); 1917 } 1918 1919 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 1920 { 1921 return int64_to_float32_scalbn(a, scale, status); 1922 } 1923 1924 float32 int64_to_float32(int64_t a, float_status *status) 1925 { 1926 return int64_to_float32_scalbn(a, 0, status); 1927 } 1928 1929 float32 int32_to_float32(int32_t a, float_status *status) 1930 { 1931 return int64_to_float32_scalbn(a, 0, status); 1932 } 1933 1934 float32 int16_to_float32(int16_t a, float_status *status) 1935 { 1936 return int64_to_float32_scalbn(a, 0, status); 1937 } 1938 1939 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 1940 { 1941 FloatParts pa = int_to_float(a, scale, status); 1942 return float64_round_pack_canonical(pa, status); 1943 } 1944 1945 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 1946 { 1947 return int64_to_float64_scalbn(a, scale, status); 1948 } 1949 1950 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 1951 { 1952 return int64_to_float64_scalbn(a, scale, status); 1953 } 1954 1955 float64 int64_to_float64(int64_t a, float_status *status) 1956 { 1957 return int64_to_float64_scalbn(a, 0, status); 1958 } 1959 1960 float64 int32_to_float64(int32_t a, float_status *status) 1961 { 1962 return int64_to_float64_scalbn(a, 0, status); 1963 } 1964 1965 float64 int16_to_float64(int16_t a, float_status *status) 1966 { 1967 return int64_to_float64_scalbn(a, 0, status); 1968 } 1969 1970 1971 /* 1972 * Unsigned Integer to float conversions 1973 * 1974 * Returns the result of converting the unsigned integer `a' to the 1975 * floating-point format. The conversion is performed according to the 1976 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1977 */ 1978 1979 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 1980 { 1981 FloatParts r = { .sign = false }; 1982 1983 if (a == 0) { 1984 r.cls = float_class_zero; 1985 } else { 1986 scale = MIN(MAX(scale, -0x10000), 0x10000); 1987 r.cls = float_class_normal; 1988 if ((int64_t)a < 0) { 1989 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 1990 shift64RightJamming(a, 1, &a); 1991 r.frac = a; 1992 } else { 1993 int shift = clz64(a) - 1; 1994 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 1995 r.frac = a << shift; 1996 } 1997 } 1998 1999 return r; 2000 } 2001 2002 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2003 { 2004 FloatParts pa = uint_to_float(a, scale, status); 2005 return float16_round_pack_canonical(pa, status); 2006 } 2007 2008 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2009 { 2010 return uint64_to_float16_scalbn(a, scale, status); 2011 } 2012 2013 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2014 { 2015 return uint64_to_float16_scalbn(a, scale, status); 2016 } 2017 2018 float16 uint64_to_float16(uint64_t a, float_status *status) 2019 { 2020 return uint64_to_float16_scalbn(a, 0, status); 2021 } 2022 2023 float16 uint32_to_float16(uint32_t a, float_status *status) 2024 { 2025 return uint64_to_float16_scalbn(a, 0, status); 2026 } 2027 2028 float16 uint16_to_float16(uint16_t a, float_status *status) 2029 { 2030 return uint64_to_float16_scalbn(a, 0, status); 2031 } 2032 2033 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2034 { 2035 FloatParts pa = uint_to_float(a, scale, status); 2036 return float32_round_pack_canonical(pa, status); 2037 } 2038 2039 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2040 { 2041 return uint64_to_float32_scalbn(a, scale, status); 2042 } 2043 2044 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2045 { 2046 return uint64_to_float32_scalbn(a, scale, status); 2047 } 2048 2049 float32 uint64_to_float32(uint64_t a, float_status *status) 2050 { 2051 return uint64_to_float32_scalbn(a, 0, status); 2052 } 2053 2054 float32 uint32_to_float32(uint32_t a, float_status *status) 2055 { 2056 return uint64_to_float32_scalbn(a, 0, status); 2057 } 2058 2059 float32 uint16_to_float32(uint16_t a, float_status *status) 2060 { 2061 return uint64_to_float32_scalbn(a, 0, status); 2062 } 2063 2064 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2065 { 2066 FloatParts pa = uint_to_float(a, scale, status); 2067 return float64_round_pack_canonical(pa, status); 2068 } 2069 2070 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2071 { 2072 return uint64_to_float64_scalbn(a, scale, status); 2073 } 2074 2075 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2076 { 2077 return uint64_to_float64_scalbn(a, scale, status); 2078 } 2079 2080 float64 uint64_to_float64(uint64_t a, float_status *status) 2081 { 2082 return uint64_to_float64_scalbn(a, 0, status); 2083 } 2084 2085 float64 uint32_to_float64(uint32_t a, float_status *status) 2086 { 2087 return uint64_to_float64_scalbn(a, 0, status); 2088 } 2089 2090 float64 uint16_to_float64(uint16_t a, float_status *status) 2091 { 2092 return uint64_to_float64_scalbn(a, 0, status); 2093 } 2094 2095 /* Float Min/Max */ 2096 /* min() and max() functions. These can't be implemented as 2097 * 'compare and pick one input' because that would mishandle 2098 * NaNs and +0 vs -0. 2099 * 2100 * minnum() and maxnum() functions. These are similar to the min() 2101 * and max() functions but if one of the arguments is a QNaN and 2102 * the other is numerical then the numerical argument is returned. 2103 * SNaNs will get quietened before being returned. 2104 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2105 * and maxNum() operations. min() and max() are the typical min/max 2106 * semantics provided by many CPUs which predate that specification. 2107 * 2108 * minnummag() and maxnummag() functions correspond to minNumMag() 2109 * and minNumMag() from the IEEE-754 2008. 2110 */ 2111 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2112 bool ieee, bool ismag, float_status *s) 2113 { 2114 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2115 if (ieee) { 2116 /* Takes two floating-point values `a' and `b', one of 2117 * which is a NaN, and returns the appropriate NaN 2118 * result. If either `a' or `b' is a signaling NaN, 2119 * the invalid exception is raised. 2120 */ 2121 if (is_snan(a.cls) || is_snan(b.cls)) { 2122 return pick_nan(a, b, s); 2123 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2124 return b; 2125 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2126 return a; 2127 } 2128 } 2129 return pick_nan(a, b, s); 2130 } else { 2131 int a_exp, b_exp; 2132 2133 switch (a.cls) { 2134 case float_class_normal: 2135 a_exp = a.exp; 2136 break; 2137 case float_class_inf: 2138 a_exp = INT_MAX; 2139 break; 2140 case float_class_zero: 2141 a_exp = INT_MIN; 2142 break; 2143 default: 2144 g_assert_not_reached(); 2145 break; 2146 } 2147 switch (b.cls) { 2148 case float_class_normal: 2149 b_exp = b.exp; 2150 break; 2151 case float_class_inf: 2152 b_exp = INT_MAX; 2153 break; 2154 case float_class_zero: 2155 b_exp = INT_MIN; 2156 break; 2157 default: 2158 g_assert_not_reached(); 2159 break; 2160 } 2161 2162 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2163 bool a_less = a_exp < b_exp; 2164 if (a_exp == b_exp) { 2165 a_less = a.frac < b.frac; 2166 } 2167 return a_less ^ ismin ? b : a; 2168 } 2169 2170 if (a.sign == b.sign) { 2171 bool a_less = a_exp < b_exp; 2172 if (a_exp == b_exp) { 2173 a_less = a.frac < b.frac; 2174 } 2175 return a.sign ^ a_less ^ ismin ? b : a; 2176 } else { 2177 return a.sign ^ ismin ? b : a; 2178 } 2179 } 2180 } 2181 2182 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2183 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2184 float_status *s) \ 2185 { \ 2186 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2187 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2188 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2189 \ 2190 return float ## sz ## _round_pack_canonical(pr, s); \ 2191 } 2192 2193 MINMAX(16, min, true, false, false) 2194 MINMAX(16, minnum, true, true, false) 2195 MINMAX(16, minnummag, true, true, true) 2196 MINMAX(16, max, false, false, false) 2197 MINMAX(16, maxnum, false, true, false) 2198 MINMAX(16, maxnummag, false, true, true) 2199 2200 MINMAX(32, min, true, false, false) 2201 MINMAX(32, minnum, true, true, false) 2202 MINMAX(32, minnummag, true, true, true) 2203 MINMAX(32, max, false, false, false) 2204 MINMAX(32, maxnum, false, true, false) 2205 MINMAX(32, maxnummag, false, true, true) 2206 2207 MINMAX(64, min, true, false, false) 2208 MINMAX(64, minnum, true, true, false) 2209 MINMAX(64, minnummag, true, true, true) 2210 MINMAX(64, max, false, false, false) 2211 MINMAX(64, maxnum, false, true, false) 2212 MINMAX(64, maxnummag, false, true, true) 2213 2214 #undef MINMAX 2215 2216 /* Floating point compare */ 2217 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2218 float_status *s) 2219 { 2220 if (is_nan(a.cls) || is_nan(b.cls)) { 2221 if (!is_quiet || 2222 a.cls == float_class_snan || 2223 b.cls == float_class_snan) { 2224 s->float_exception_flags |= float_flag_invalid; 2225 } 2226 return float_relation_unordered; 2227 } 2228 2229 if (a.cls == float_class_zero) { 2230 if (b.cls == float_class_zero) { 2231 return float_relation_equal; 2232 } 2233 return b.sign ? float_relation_greater : float_relation_less; 2234 } else if (b.cls == float_class_zero) { 2235 return a.sign ? float_relation_less : float_relation_greater; 2236 } 2237 2238 /* The only really important thing about infinity is its sign. If 2239 * both are infinities the sign marks the smallest of the two. 2240 */ 2241 if (a.cls == float_class_inf) { 2242 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2243 return float_relation_equal; 2244 } 2245 return a.sign ? float_relation_less : float_relation_greater; 2246 } else if (b.cls == float_class_inf) { 2247 return b.sign ? float_relation_greater : float_relation_less; 2248 } 2249 2250 if (a.sign != b.sign) { 2251 return a.sign ? float_relation_less : float_relation_greater; 2252 } 2253 2254 if (a.exp == b.exp) { 2255 if (a.frac == b.frac) { 2256 return float_relation_equal; 2257 } 2258 if (a.sign) { 2259 return a.frac > b.frac ? 2260 float_relation_less : float_relation_greater; 2261 } else { 2262 return a.frac > b.frac ? 2263 float_relation_greater : float_relation_less; 2264 } 2265 } else { 2266 if (a.sign) { 2267 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2268 } else { 2269 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2270 } 2271 } 2272 } 2273 2274 #define COMPARE(sz) \ 2275 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2276 float_status *s) \ 2277 { \ 2278 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2279 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2280 return compare_floats(pa, pb, false, s); \ 2281 } \ 2282 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2283 float_status *s) \ 2284 { \ 2285 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2286 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2287 return compare_floats(pa, pb, true, s); \ 2288 } 2289 2290 COMPARE(16) 2291 COMPARE(32) 2292 COMPARE(64) 2293 2294 #undef COMPARE 2295 2296 /* Multiply A by 2 raised to the power N. */ 2297 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2298 { 2299 if (unlikely(is_nan(a.cls))) { 2300 return return_nan(a, s); 2301 } 2302 if (a.cls == float_class_normal) { 2303 /* The largest float type (even though not supported by FloatParts) 2304 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2305 * still allows rounding to infinity, without allowing overflow 2306 * within the int32_t that backs FloatParts.exp. 2307 */ 2308 n = MIN(MAX(n, -0x10000), 0x10000); 2309 a.exp += n; 2310 } 2311 return a; 2312 } 2313 2314 float16 float16_scalbn(float16 a, int n, float_status *status) 2315 { 2316 FloatParts pa = float16_unpack_canonical(a, status); 2317 FloatParts pr = scalbn_decomposed(pa, n, status); 2318 return float16_round_pack_canonical(pr, status); 2319 } 2320 2321 float32 float32_scalbn(float32 a, int n, float_status *status) 2322 { 2323 FloatParts pa = float32_unpack_canonical(a, status); 2324 FloatParts pr = scalbn_decomposed(pa, n, status); 2325 return float32_round_pack_canonical(pr, status); 2326 } 2327 2328 float64 float64_scalbn(float64 a, int n, float_status *status) 2329 { 2330 FloatParts pa = float64_unpack_canonical(a, status); 2331 FloatParts pr = scalbn_decomposed(pa, n, status); 2332 return float64_round_pack_canonical(pr, status); 2333 } 2334 2335 /* 2336 * Square Root 2337 * 2338 * The old softfloat code did an approximation step before zeroing in 2339 * on the final result. However for simpleness we just compute the 2340 * square root by iterating down from the implicit bit to enough extra 2341 * bits to ensure we get a correctly rounded result. 2342 * 2343 * This does mean however the calculation is slower than before, 2344 * especially for 64 bit floats. 2345 */ 2346 2347 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2348 { 2349 uint64_t a_frac, r_frac, s_frac; 2350 int bit, last_bit; 2351 2352 if (is_nan(a.cls)) { 2353 return return_nan(a, s); 2354 } 2355 if (a.cls == float_class_zero) { 2356 return a; /* sqrt(+-0) = +-0 */ 2357 } 2358 if (a.sign) { 2359 s->float_exception_flags |= float_flag_invalid; 2360 return parts_default_nan(s); 2361 } 2362 if (a.cls == float_class_inf) { 2363 return a; /* sqrt(+inf) = +inf */ 2364 } 2365 2366 assert(a.cls == float_class_normal); 2367 2368 /* We need two overflow bits at the top. Adding room for that is a 2369 * right shift. If the exponent is odd, we can discard the low bit 2370 * by multiplying the fraction by 2; that's a left shift. Combine 2371 * those and we shift right if the exponent is even. 2372 */ 2373 a_frac = a.frac; 2374 if (!(a.exp & 1)) { 2375 a_frac >>= 1; 2376 } 2377 a.exp >>= 1; 2378 2379 /* Bit-by-bit computation of sqrt. */ 2380 r_frac = 0; 2381 s_frac = 0; 2382 2383 /* Iterate from implicit bit down to the 3 extra bits to compute a 2384 * properly rounded result. Remember we've inserted one more bit 2385 * at the top, so these positions are one less. 2386 */ 2387 bit = DECOMPOSED_BINARY_POINT - 1; 2388 last_bit = MAX(p->frac_shift - 4, 0); 2389 do { 2390 uint64_t q = 1ULL << bit; 2391 uint64_t t_frac = s_frac + q; 2392 if (t_frac <= a_frac) { 2393 s_frac = t_frac + q; 2394 a_frac -= t_frac; 2395 r_frac += q; 2396 } 2397 a_frac <<= 1; 2398 } while (--bit >= last_bit); 2399 2400 /* Undo the right shift done above. If there is any remaining 2401 * fraction, the result is inexact. Set the sticky bit. 2402 */ 2403 a.frac = (r_frac << 1) + (a_frac != 0); 2404 2405 return a; 2406 } 2407 2408 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 2409 { 2410 FloatParts pa = float16_unpack_canonical(a, status); 2411 FloatParts pr = sqrt_float(pa, status, &float16_params); 2412 return float16_round_pack_canonical(pr, status); 2413 } 2414 2415 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) 2416 { 2417 FloatParts pa = float32_unpack_canonical(a, status); 2418 FloatParts pr = sqrt_float(pa, status, &float32_params); 2419 return float32_round_pack_canonical(pr, status); 2420 } 2421 2422 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) 2423 { 2424 FloatParts pa = float64_unpack_canonical(a, status); 2425 FloatParts pr = sqrt_float(pa, status, &float64_params); 2426 return float64_round_pack_canonical(pr, status); 2427 } 2428 2429 /*---------------------------------------------------------------------------- 2430 | The pattern for a default generated NaN. 2431 *----------------------------------------------------------------------------*/ 2432 2433 float16 float16_default_nan(float_status *status) 2434 { 2435 FloatParts p = parts_default_nan(status); 2436 p.frac >>= float16_params.frac_shift; 2437 return float16_pack_raw(p); 2438 } 2439 2440 float32 float32_default_nan(float_status *status) 2441 { 2442 FloatParts p = parts_default_nan(status); 2443 p.frac >>= float32_params.frac_shift; 2444 return float32_pack_raw(p); 2445 } 2446 2447 float64 float64_default_nan(float_status *status) 2448 { 2449 FloatParts p = parts_default_nan(status); 2450 p.frac >>= float64_params.frac_shift; 2451 return float64_pack_raw(p); 2452 } 2453 2454 float128 float128_default_nan(float_status *status) 2455 { 2456 FloatParts p = parts_default_nan(status); 2457 float128 r; 2458 2459 /* Extrapolate from the choices made by parts_default_nan to fill 2460 * in the quad-floating format. If the low bit is set, assume we 2461 * want to set all non-snan bits. 2462 */ 2463 r.low = -(p.frac & 1); 2464 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2465 r.high |= LIT64(0x7FFF000000000000); 2466 r.high |= (uint64_t)p.sign << 63; 2467 2468 return r; 2469 } 2470 2471 /*---------------------------------------------------------------------------- 2472 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2473 *----------------------------------------------------------------------------*/ 2474 2475 float16 float16_silence_nan(float16 a, float_status *status) 2476 { 2477 FloatParts p = float16_unpack_raw(a); 2478 p.frac <<= float16_params.frac_shift; 2479 p = parts_silence_nan(p, status); 2480 p.frac >>= float16_params.frac_shift; 2481 return float16_pack_raw(p); 2482 } 2483 2484 float32 float32_silence_nan(float32 a, float_status *status) 2485 { 2486 FloatParts p = float32_unpack_raw(a); 2487 p.frac <<= float32_params.frac_shift; 2488 p = parts_silence_nan(p, status); 2489 p.frac >>= float32_params.frac_shift; 2490 return float32_pack_raw(p); 2491 } 2492 2493 float64 float64_silence_nan(float64 a, float_status *status) 2494 { 2495 FloatParts p = float64_unpack_raw(a); 2496 p.frac <<= float64_params.frac_shift; 2497 p = parts_silence_nan(p, status); 2498 p.frac >>= float64_params.frac_shift; 2499 return float64_pack_raw(p); 2500 } 2501 2502 /*---------------------------------------------------------------------------- 2503 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2504 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2505 | input. If `zSign' is 1, the input is negated before being converted to an 2506 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2507 | is simply rounded to an integer, with the inexact exception raised if the 2508 | input cannot be represented exactly as an integer. However, if the fixed- 2509 | point input is too large, the invalid exception is raised and the largest 2510 | positive or negative integer is returned. 2511 *----------------------------------------------------------------------------*/ 2512 2513 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2514 { 2515 int8_t roundingMode; 2516 flag roundNearestEven; 2517 int8_t roundIncrement, roundBits; 2518 int32_t z; 2519 2520 roundingMode = status->float_rounding_mode; 2521 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2522 switch (roundingMode) { 2523 case float_round_nearest_even: 2524 case float_round_ties_away: 2525 roundIncrement = 0x40; 2526 break; 2527 case float_round_to_zero: 2528 roundIncrement = 0; 2529 break; 2530 case float_round_up: 2531 roundIncrement = zSign ? 0 : 0x7f; 2532 break; 2533 case float_round_down: 2534 roundIncrement = zSign ? 0x7f : 0; 2535 break; 2536 default: 2537 abort(); 2538 } 2539 roundBits = absZ & 0x7F; 2540 absZ = ( absZ + roundIncrement )>>7; 2541 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2542 z = absZ; 2543 if ( zSign ) z = - z; 2544 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2545 float_raise(float_flag_invalid, status); 2546 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2547 } 2548 if (roundBits) { 2549 status->float_exception_flags |= float_flag_inexact; 2550 } 2551 return z; 2552 2553 } 2554 2555 /*---------------------------------------------------------------------------- 2556 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2557 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2558 | and returns the properly rounded 64-bit integer corresponding to the input. 2559 | If `zSign' is 1, the input is negated before being converted to an integer. 2560 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2561 | the inexact exception raised if the input cannot be represented exactly as 2562 | an integer. However, if the fixed-point input is too large, the invalid 2563 | exception is raised and the largest positive or negative integer is 2564 | returned. 2565 *----------------------------------------------------------------------------*/ 2566 2567 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2568 float_status *status) 2569 { 2570 int8_t roundingMode; 2571 flag roundNearestEven, increment; 2572 int64_t z; 2573 2574 roundingMode = status->float_rounding_mode; 2575 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2576 switch (roundingMode) { 2577 case float_round_nearest_even: 2578 case float_round_ties_away: 2579 increment = ((int64_t) absZ1 < 0); 2580 break; 2581 case float_round_to_zero: 2582 increment = 0; 2583 break; 2584 case float_round_up: 2585 increment = !zSign && absZ1; 2586 break; 2587 case float_round_down: 2588 increment = zSign && absZ1; 2589 break; 2590 default: 2591 abort(); 2592 } 2593 if ( increment ) { 2594 ++absZ0; 2595 if ( absZ0 == 0 ) goto overflow; 2596 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2597 } 2598 z = absZ0; 2599 if ( zSign ) z = - z; 2600 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2601 overflow: 2602 float_raise(float_flag_invalid, status); 2603 return 2604 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2605 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2606 } 2607 if (absZ1) { 2608 status->float_exception_flags |= float_flag_inexact; 2609 } 2610 return z; 2611 2612 } 2613 2614 /*---------------------------------------------------------------------------- 2615 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2616 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2617 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2618 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2619 | with the inexact exception raised if the input cannot be represented exactly 2620 | as an integer. However, if the fixed-point input is too large, the invalid 2621 | exception is raised and the largest unsigned integer is returned. 2622 *----------------------------------------------------------------------------*/ 2623 2624 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2625 uint64_t absZ1, float_status *status) 2626 { 2627 int8_t roundingMode; 2628 flag roundNearestEven, increment; 2629 2630 roundingMode = status->float_rounding_mode; 2631 roundNearestEven = (roundingMode == float_round_nearest_even); 2632 switch (roundingMode) { 2633 case float_round_nearest_even: 2634 case float_round_ties_away: 2635 increment = ((int64_t)absZ1 < 0); 2636 break; 2637 case float_round_to_zero: 2638 increment = 0; 2639 break; 2640 case float_round_up: 2641 increment = !zSign && absZ1; 2642 break; 2643 case float_round_down: 2644 increment = zSign && absZ1; 2645 break; 2646 default: 2647 abort(); 2648 } 2649 if (increment) { 2650 ++absZ0; 2651 if (absZ0 == 0) { 2652 float_raise(float_flag_invalid, status); 2653 return LIT64(0xFFFFFFFFFFFFFFFF); 2654 } 2655 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2656 } 2657 2658 if (zSign && absZ0) { 2659 float_raise(float_flag_invalid, status); 2660 return 0; 2661 } 2662 2663 if (absZ1) { 2664 status->float_exception_flags |= float_flag_inexact; 2665 } 2666 return absZ0; 2667 } 2668 2669 /*---------------------------------------------------------------------------- 2670 | If `a' is denormal and we are in flush-to-zero mode then set the 2671 | input-denormal exception and return zero. Otherwise just return the value. 2672 *----------------------------------------------------------------------------*/ 2673 float32 float32_squash_input_denormal(float32 a, float_status *status) 2674 { 2675 if (status->flush_inputs_to_zero) { 2676 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2677 float_raise(float_flag_input_denormal, status); 2678 return make_float32(float32_val(a) & 0x80000000); 2679 } 2680 } 2681 return a; 2682 } 2683 2684 /*---------------------------------------------------------------------------- 2685 | Normalizes the subnormal single-precision floating-point value represented 2686 | by the denormalized significand `aSig'. The normalized exponent and 2687 | significand are stored at the locations pointed to by `zExpPtr' and 2688 | `zSigPtr', respectively. 2689 *----------------------------------------------------------------------------*/ 2690 2691 static void 2692 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2693 { 2694 int8_t shiftCount; 2695 2696 shiftCount = clz32(aSig) - 8; 2697 *zSigPtr = aSig<<shiftCount; 2698 *zExpPtr = 1 - shiftCount; 2699 2700 } 2701 2702 /*---------------------------------------------------------------------------- 2703 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2704 | and significand `zSig', and returns the proper single-precision floating- 2705 | point value corresponding to the abstract input. Ordinarily, the abstract 2706 | value is simply rounded and packed into the single-precision format, with 2707 | the inexact exception raised if the abstract input cannot be represented 2708 | exactly. However, if the abstract value is too large, the overflow and 2709 | inexact exceptions are raised and an infinity or maximal finite value is 2710 | returned. If the abstract value is too small, the input value is rounded to 2711 | a subnormal number, and the underflow and inexact exceptions are raised if 2712 | the abstract input cannot be represented exactly as a subnormal single- 2713 | precision floating-point number. 2714 | The input significand `zSig' has its binary point between bits 30 2715 | and 29, which is 7 bits to the left of the usual location. This shifted 2716 | significand must be normalized or smaller. If `zSig' is not normalized, 2717 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2718 | and it must not require rounding. In the usual case that `zSig' is 2719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2720 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2721 | Binary Floating-Point Arithmetic. 2722 *----------------------------------------------------------------------------*/ 2723 2724 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2725 float_status *status) 2726 { 2727 int8_t roundingMode; 2728 flag roundNearestEven; 2729 int8_t roundIncrement, roundBits; 2730 flag isTiny; 2731 2732 roundingMode = status->float_rounding_mode; 2733 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2734 switch (roundingMode) { 2735 case float_round_nearest_even: 2736 case float_round_ties_away: 2737 roundIncrement = 0x40; 2738 break; 2739 case float_round_to_zero: 2740 roundIncrement = 0; 2741 break; 2742 case float_round_up: 2743 roundIncrement = zSign ? 0 : 0x7f; 2744 break; 2745 case float_round_down: 2746 roundIncrement = zSign ? 0x7f : 0; 2747 break; 2748 default: 2749 abort(); 2750 break; 2751 } 2752 roundBits = zSig & 0x7F; 2753 if ( 0xFD <= (uint16_t) zExp ) { 2754 if ( ( 0xFD < zExp ) 2755 || ( ( zExp == 0xFD ) 2756 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2757 ) { 2758 float_raise(float_flag_overflow | float_flag_inexact, status); 2759 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2760 } 2761 if ( zExp < 0 ) { 2762 if (status->flush_to_zero) { 2763 float_raise(float_flag_output_denormal, status); 2764 return packFloat32(zSign, 0, 0); 2765 } 2766 isTiny = 2767 (status->float_detect_tininess 2768 == float_tininess_before_rounding) 2769 || ( zExp < -1 ) 2770 || ( zSig + roundIncrement < 0x80000000 ); 2771 shift32RightJamming( zSig, - zExp, &zSig ); 2772 zExp = 0; 2773 roundBits = zSig & 0x7F; 2774 if (isTiny && roundBits) { 2775 float_raise(float_flag_underflow, status); 2776 } 2777 } 2778 } 2779 if (roundBits) { 2780 status->float_exception_flags |= float_flag_inexact; 2781 } 2782 zSig = ( zSig + roundIncrement )>>7; 2783 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2784 if ( zSig == 0 ) zExp = 0; 2785 return packFloat32( zSign, zExp, zSig ); 2786 2787 } 2788 2789 /*---------------------------------------------------------------------------- 2790 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2791 | and significand `zSig', and returns the proper single-precision floating- 2792 | point value corresponding to the abstract input. This routine is just like 2793 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2794 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2795 | floating-point exponent. 2796 *----------------------------------------------------------------------------*/ 2797 2798 static float32 2799 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2800 float_status *status) 2801 { 2802 int8_t shiftCount; 2803 2804 shiftCount = clz32(zSig) - 1; 2805 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2806 status); 2807 2808 } 2809 2810 /*---------------------------------------------------------------------------- 2811 | If `a' is denormal and we are in flush-to-zero mode then set the 2812 | input-denormal exception and return zero. Otherwise just return the value. 2813 *----------------------------------------------------------------------------*/ 2814 float64 float64_squash_input_denormal(float64 a, float_status *status) 2815 { 2816 if (status->flush_inputs_to_zero) { 2817 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2818 float_raise(float_flag_input_denormal, status); 2819 return make_float64(float64_val(a) & (1ULL << 63)); 2820 } 2821 } 2822 return a; 2823 } 2824 2825 /*---------------------------------------------------------------------------- 2826 | Normalizes the subnormal double-precision floating-point value represented 2827 | by the denormalized significand `aSig'. The normalized exponent and 2828 | significand are stored at the locations pointed to by `zExpPtr' and 2829 | `zSigPtr', respectively. 2830 *----------------------------------------------------------------------------*/ 2831 2832 static void 2833 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2834 { 2835 int8_t shiftCount; 2836 2837 shiftCount = clz64(aSig) - 11; 2838 *zSigPtr = aSig<<shiftCount; 2839 *zExpPtr = 1 - shiftCount; 2840 2841 } 2842 2843 /*---------------------------------------------------------------------------- 2844 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2845 | double-precision floating-point value, returning the result. After being 2846 | shifted into the proper positions, the three fields are simply added 2847 | together to form the result. This means that any integer portion of `zSig' 2848 | will be added into the exponent. Since a properly normalized significand 2849 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2850 | than the desired result exponent whenever `zSig' is a complete, normalized 2851 | significand. 2852 *----------------------------------------------------------------------------*/ 2853 2854 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2855 { 2856 2857 return make_float64( 2858 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2859 2860 } 2861 2862 /*---------------------------------------------------------------------------- 2863 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2864 | and significand `zSig', and returns the proper double-precision floating- 2865 | point value corresponding to the abstract input. Ordinarily, the abstract 2866 | value is simply rounded and packed into the double-precision format, with 2867 | the inexact exception raised if the abstract input cannot be represented 2868 | exactly. However, if the abstract value is too large, the overflow and 2869 | inexact exceptions are raised and an infinity or maximal finite value is 2870 | returned. If the abstract value is too small, the input value is rounded to 2871 | a subnormal number, and the underflow and inexact exceptions are raised if 2872 | the abstract input cannot be represented exactly as a subnormal double- 2873 | precision floating-point number. 2874 | The input significand `zSig' has its binary point between bits 62 2875 | and 61, which is 10 bits to the left of the usual location. This shifted 2876 | significand must be normalized or smaller. If `zSig' is not normalized, 2877 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2878 | and it must not require rounding. In the usual case that `zSig' is 2879 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2880 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2881 | Binary Floating-Point Arithmetic. 2882 *----------------------------------------------------------------------------*/ 2883 2884 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2885 float_status *status) 2886 { 2887 int8_t roundingMode; 2888 flag roundNearestEven; 2889 int roundIncrement, roundBits; 2890 flag isTiny; 2891 2892 roundingMode = status->float_rounding_mode; 2893 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2894 switch (roundingMode) { 2895 case float_round_nearest_even: 2896 case float_round_ties_away: 2897 roundIncrement = 0x200; 2898 break; 2899 case float_round_to_zero: 2900 roundIncrement = 0; 2901 break; 2902 case float_round_up: 2903 roundIncrement = zSign ? 0 : 0x3ff; 2904 break; 2905 case float_round_down: 2906 roundIncrement = zSign ? 0x3ff : 0; 2907 break; 2908 case float_round_to_odd: 2909 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2910 break; 2911 default: 2912 abort(); 2913 } 2914 roundBits = zSig & 0x3FF; 2915 if ( 0x7FD <= (uint16_t) zExp ) { 2916 if ( ( 0x7FD < zExp ) 2917 || ( ( zExp == 0x7FD ) 2918 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2919 ) { 2920 bool overflow_to_inf = roundingMode != float_round_to_odd && 2921 roundIncrement != 0; 2922 float_raise(float_flag_overflow | float_flag_inexact, status); 2923 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2924 } 2925 if ( zExp < 0 ) { 2926 if (status->flush_to_zero) { 2927 float_raise(float_flag_output_denormal, status); 2928 return packFloat64(zSign, 0, 0); 2929 } 2930 isTiny = 2931 (status->float_detect_tininess 2932 == float_tininess_before_rounding) 2933 || ( zExp < -1 ) 2934 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2935 shift64RightJamming( zSig, - zExp, &zSig ); 2936 zExp = 0; 2937 roundBits = zSig & 0x3FF; 2938 if (isTiny && roundBits) { 2939 float_raise(float_flag_underflow, status); 2940 } 2941 if (roundingMode == float_round_to_odd) { 2942 /* 2943 * For round-to-odd case, the roundIncrement depends on 2944 * zSig which just changed. 2945 */ 2946 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2947 } 2948 } 2949 } 2950 if (roundBits) { 2951 status->float_exception_flags |= float_flag_inexact; 2952 } 2953 zSig = ( zSig + roundIncrement )>>10; 2954 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2955 if ( zSig == 0 ) zExp = 0; 2956 return packFloat64( zSign, zExp, zSig ); 2957 2958 } 2959 2960 /*---------------------------------------------------------------------------- 2961 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2962 | and significand `zSig', and returns the proper double-precision floating- 2963 | point value corresponding to the abstract input. This routine is just like 2964 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2965 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2966 | floating-point exponent. 2967 *----------------------------------------------------------------------------*/ 2968 2969 static float64 2970 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2971 float_status *status) 2972 { 2973 int8_t shiftCount; 2974 2975 shiftCount = clz64(zSig) - 1; 2976 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2977 status); 2978 2979 } 2980 2981 /*---------------------------------------------------------------------------- 2982 | Normalizes the subnormal extended double-precision floating-point value 2983 | represented by the denormalized significand `aSig'. The normalized exponent 2984 | and significand are stored at the locations pointed to by `zExpPtr' and 2985 | `zSigPtr', respectively. 2986 *----------------------------------------------------------------------------*/ 2987 2988 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2989 uint64_t *zSigPtr) 2990 { 2991 int8_t shiftCount; 2992 2993 shiftCount = clz64(aSig); 2994 *zSigPtr = aSig<<shiftCount; 2995 *zExpPtr = 1 - shiftCount; 2996 } 2997 2998 /*---------------------------------------------------------------------------- 2999 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3000 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3001 | and returns the proper extended double-precision floating-point value 3002 | corresponding to the abstract input. Ordinarily, the abstract value is 3003 | rounded and packed into the extended double-precision format, with the 3004 | inexact exception raised if the abstract input cannot be represented 3005 | exactly. However, if the abstract value is too large, the overflow and 3006 | inexact exceptions are raised and an infinity or maximal finite value is 3007 | returned. If the abstract value is too small, the input value is rounded to 3008 | a subnormal number, and the underflow and inexact exceptions are raised if 3009 | the abstract input cannot be represented exactly as a subnormal extended 3010 | double-precision floating-point number. 3011 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3012 | number of bits as single or double precision, respectively. Otherwise, the 3013 | result is rounded to the full precision of the extended double-precision 3014 | format. 3015 | The input significand must be normalized or smaller. If the input 3016 | significand is not normalized, `zExp' must be 0; in that case, the result 3017 | returned is a subnormal number, and it must not require rounding. The 3018 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3019 | Floating-Point Arithmetic. 3020 *----------------------------------------------------------------------------*/ 3021 3022 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3023 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3024 float_status *status) 3025 { 3026 int8_t roundingMode; 3027 flag roundNearestEven, increment, isTiny; 3028 int64_t roundIncrement, roundMask, roundBits; 3029 3030 roundingMode = status->float_rounding_mode; 3031 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3032 if ( roundingPrecision == 80 ) goto precision80; 3033 if ( roundingPrecision == 64 ) { 3034 roundIncrement = LIT64( 0x0000000000000400 ); 3035 roundMask = LIT64( 0x00000000000007FF ); 3036 } 3037 else if ( roundingPrecision == 32 ) { 3038 roundIncrement = LIT64( 0x0000008000000000 ); 3039 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3040 } 3041 else { 3042 goto precision80; 3043 } 3044 zSig0 |= ( zSig1 != 0 ); 3045 switch (roundingMode) { 3046 case float_round_nearest_even: 3047 case float_round_ties_away: 3048 break; 3049 case float_round_to_zero: 3050 roundIncrement = 0; 3051 break; 3052 case float_round_up: 3053 roundIncrement = zSign ? 0 : roundMask; 3054 break; 3055 case float_round_down: 3056 roundIncrement = zSign ? roundMask : 0; 3057 break; 3058 default: 3059 abort(); 3060 } 3061 roundBits = zSig0 & roundMask; 3062 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3063 if ( ( 0x7FFE < zExp ) 3064 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3065 ) { 3066 goto overflow; 3067 } 3068 if ( zExp <= 0 ) { 3069 if (status->flush_to_zero) { 3070 float_raise(float_flag_output_denormal, status); 3071 return packFloatx80(zSign, 0, 0); 3072 } 3073 isTiny = 3074 (status->float_detect_tininess 3075 == float_tininess_before_rounding) 3076 || ( zExp < 0 ) 3077 || ( zSig0 <= zSig0 + roundIncrement ); 3078 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3079 zExp = 0; 3080 roundBits = zSig0 & roundMask; 3081 if (isTiny && roundBits) { 3082 float_raise(float_flag_underflow, status); 3083 } 3084 if (roundBits) { 3085 status->float_exception_flags |= float_flag_inexact; 3086 } 3087 zSig0 += roundIncrement; 3088 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3089 roundIncrement = roundMask + 1; 3090 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3091 roundMask |= roundIncrement; 3092 } 3093 zSig0 &= ~ roundMask; 3094 return packFloatx80( zSign, zExp, zSig0 ); 3095 } 3096 } 3097 if (roundBits) { 3098 status->float_exception_flags |= float_flag_inexact; 3099 } 3100 zSig0 += roundIncrement; 3101 if ( zSig0 < roundIncrement ) { 3102 ++zExp; 3103 zSig0 = LIT64( 0x8000000000000000 ); 3104 } 3105 roundIncrement = roundMask + 1; 3106 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3107 roundMask |= roundIncrement; 3108 } 3109 zSig0 &= ~ roundMask; 3110 if ( zSig0 == 0 ) zExp = 0; 3111 return packFloatx80( zSign, zExp, zSig0 ); 3112 precision80: 3113 switch (roundingMode) { 3114 case float_round_nearest_even: 3115 case float_round_ties_away: 3116 increment = ((int64_t)zSig1 < 0); 3117 break; 3118 case float_round_to_zero: 3119 increment = 0; 3120 break; 3121 case float_round_up: 3122 increment = !zSign && zSig1; 3123 break; 3124 case float_round_down: 3125 increment = zSign && zSig1; 3126 break; 3127 default: 3128 abort(); 3129 } 3130 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3131 if ( ( 0x7FFE < zExp ) 3132 || ( ( zExp == 0x7FFE ) 3133 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3134 && increment 3135 ) 3136 ) { 3137 roundMask = 0; 3138 overflow: 3139 float_raise(float_flag_overflow | float_flag_inexact, status); 3140 if ( ( roundingMode == float_round_to_zero ) 3141 || ( zSign && ( roundingMode == float_round_up ) ) 3142 || ( ! zSign && ( roundingMode == float_round_down ) ) 3143 ) { 3144 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3145 } 3146 return packFloatx80(zSign, 3147 floatx80_infinity_high, 3148 floatx80_infinity_low); 3149 } 3150 if ( zExp <= 0 ) { 3151 isTiny = 3152 (status->float_detect_tininess 3153 == float_tininess_before_rounding) 3154 || ( zExp < 0 ) 3155 || ! increment 3156 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3157 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3158 zExp = 0; 3159 if (isTiny && zSig1) { 3160 float_raise(float_flag_underflow, status); 3161 } 3162 if (zSig1) { 3163 status->float_exception_flags |= float_flag_inexact; 3164 } 3165 switch (roundingMode) { 3166 case float_round_nearest_even: 3167 case float_round_ties_away: 3168 increment = ((int64_t)zSig1 < 0); 3169 break; 3170 case float_round_to_zero: 3171 increment = 0; 3172 break; 3173 case float_round_up: 3174 increment = !zSign && zSig1; 3175 break; 3176 case float_round_down: 3177 increment = zSign && zSig1; 3178 break; 3179 default: 3180 abort(); 3181 } 3182 if ( increment ) { 3183 ++zSig0; 3184 zSig0 &= 3185 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3186 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3187 } 3188 return packFloatx80( zSign, zExp, zSig0 ); 3189 } 3190 } 3191 if (zSig1) { 3192 status->float_exception_flags |= float_flag_inexact; 3193 } 3194 if ( increment ) { 3195 ++zSig0; 3196 if ( zSig0 == 0 ) { 3197 ++zExp; 3198 zSig0 = LIT64( 0x8000000000000000 ); 3199 } 3200 else { 3201 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3202 } 3203 } 3204 else { 3205 if ( zSig0 == 0 ) zExp = 0; 3206 } 3207 return packFloatx80( zSign, zExp, zSig0 ); 3208 3209 } 3210 3211 /*---------------------------------------------------------------------------- 3212 | Takes an abstract floating-point value having sign `zSign', exponent 3213 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3214 | and returns the proper extended double-precision floating-point value 3215 | corresponding to the abstract input. This routine is just like 3216 | `roundAndPackFloatx80' except that the input significand does not have to be 3217 | normalized. 3218 *----------------------------------------------------------------------------*/ 3219 3220 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 3221 flag zSign, int32_t zExp, 3222 uint64_t zSig0, uint64_t zSig1, 3223 float_status *status) 3224 { 3225 int8_t shiftCount; 3226 3227 if ( zSig0 == 0 ) { 3228 zSig0 = zSig1; 3229 zSig1 = 0; 3230 zExp -= 64; 3231 } 3232 shiftCount = clz64(zSig0); 3233 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3234 zExp -= shiftCount; 3235 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 3236 zSig0, zSig1, status); 3237 3238 } 3239 3240 /*---------------------------------------------------------------------------- 3241 | Returns the least-significant 64 fraction bits of the quadruple-precision 3242 | floating-point value `a'. 3243 *----------------------------------------------------------------------------*/ 3244 3245 static inline uint64_t extractFloat128Frac1( float128 a ) 3246 { 3247 3248 return a.low; 3249 3250 } 3251 3252 /*---------------------------------------------------------------------------- 3253 | Returns the most-significant 48 fraction bits of the quadruple-precision 3254 | floating-point value `a'. 3255 *----------------------------------------------------------------------------*/ 3256 3257 static inline uint64_t extractFloat128Frac0( float128 a ) 3258 { 3259 3260 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3261 3262 } 3263 3264 /*---------------------------------------------------------------------------- 3265 | Returns the exponent bits of the quadruple-precision floating-point value 3266 | `a'. 3267 *----------------------------------------------------------------------------*/ 3268 3269 static inline int32_t extractFloat128Exp( float128 a ) 3270 { 3271 3272 return ( a.high>>48 ) & 0x7FFF; 3273 3274 } 3275 3276 /*---------------------------------------------------------------------------- 3277 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3278 *----------------------------------------------------------------------------*/ 3279 3280 static inline flag extractFloat128Sign( float128 a ) 3281 { 3282 3283 return a.high>>63; 3284 3285 } 3286 3287 /*---------------------------------------------------------------------------- 3288 | Normalizes the subnormal quadruple-precision floating-point value 3289 | represented by the denormalized significand formed by the concatenation of 3290 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3291 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3292 | significand are stored at the location pointed to by `zSig0Ptr', and the 3293 | least significant 64 bits of the normalized significand are stored at the 3294 | location pointed to by `zSig1Ptr'. 3295 *----------------------------------------------------------------------------*/ 3296 3297 static void 3298 normalizeFloat128Subnormal( 3299 uint64_t aSig0, 3300 uint64_t aSig1, 3301 int32_t *zExpPtr, 3302 uint64_t *zSig0Ptr, 3303 uint64_t *zSig1Ptr 3304 ) 3305 { 3306 int8_t shiftCount; 3307 3308 if ( aSig0 == 0 ) { 3309 shiftCount = clz64(aSig1) - 15; 3310 if ( shiftCount < 0 ) { 3311 *zSig0Ptr = aSig1>>( - shiftCount ); 3312 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3313 } 3314 else { 3315 *zSig0Ptr = aSig1<<shiftCount; 3316 *zSig1Ptr = 0; 3317 } 3318 *zExpPtr = - shiftCount - 63; 3319 } 3320 else { 3321 shiftCount = clz64(aSig0) - 15; 3322 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3323 *zExpPtr = 1 - shiftCount; 3324 } 3325 3326 } 3327 3328 /*---------------------------------------------------------------------------- 3329 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3330 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3331 | floating-point value, returning the result. After being shifted into the 3332 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3333 | added together to form the most significant 32 bits of the result. This 3334 | means that any integer portion of `zSig0' will be added into the exponent. 3335 | Since a properly normalized significand will have an integer portion equal 3336 | to 1, the `zExp' input should be 1 less than the desired result exponent 3337 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3338 | significand. 3339 *----------------------------------------------------------------------------*/ 3340 3341 static inline float128 3342 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3343 { 3344 float128 z; 3345 3346 z.low = zSig1; 3347 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3348 return z; 3349 3350 } 3351 3352 /*---------------------------------------------------------------------------- 3353 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3354 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3355 | and `zSig2', and returns the proper quadruple-precision floating-point value 3356 | corresponding to the abstract input. Ordinarily, the abstract value is 3357 | simply rounded and packed into the quadruple-precision format, with the 3358 | inexact exception raised if the abstract input cannot be represented 3359 | exactly. However, if the abstract value is too large, the overflow and 3360 | inexact exceptions are raised and an infinity or maximal finite value is 3361 | returned. If the abstract value is too small, the input value is rounded to 3362 | a subnormal number, and the underflow and inexact exceptions are raised if 3363 | the abstract input cannot be represented exactly as a subnormal quadruple- 3364 | precision floating-point number. 3365 | The input significand must be normalized or smaller. If the input 3366 | significand is not normalized, `zExp' must be 0; in that case, the result 3367 | returned is a subnormal number, and it must not require rounding. In the 3368 | usual case that the input significand is normalized, `zExp' must be 1 less 3369 | than the ``true'' floating-point exponent. The handling of underflow and 3370 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3371 *----------------------------------------------------------------------------*/ 3372 3373 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 3374 uint64_t zSig0, uint64_t zSig1, 3375 uint64_t zSig2, float_status *status) 3376 { 3377 int8_t roundingMode; 3378 flag roundNearestEven, increment, isTiny; 3379 3380 roundingMode = status->float_rounding_mode; 3381 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3382 switch (roundingMode) { 3383 case float_round_nearest_even: 3384 case float_round_ties_away: 3385 increment = ((int64_t)zSig2 < 0); 3386 break; 3387 case float_round_to_zero: 3388 increment = 0; 3389 break; 3390 case float_round_up: 3391 increment = !zSign && zSig2; 3392 break; 3393 case float_round_down: 3394 increment = zSign && zSig2; 3395 break; 3396 case float_round_to_odd: 3397 increment = !(zSig1 & 0x1) && zSig2; 3398 break; 3399 default: 3400 abort(); 3401 } 3402 if ( 0x7FFD <= (uint32_t) zExp ) { 3403 if ( ( 0x7FFD < zExp ) 3404 || ( ( zExp == 0x7FFD ) 3405 && eq128( 3406 LIT64( 0x0001FFFFFFFFFFFF ), 3407 LIT64( 0xFFFFFFFFFFFFFFFF ), 3408 zSig0, 3409 zSig1 3410 ) 3411 && increment 3412 ) 3413 ) { 3414 float_raise(float_flag_overflow | float_flag_inexact, status); 3415 if ( ( roundingMode == float_round_to_zero ) 3416 || ( zSign && ( roundingMode == float_round_up ) ) 3417 || ( ! zSign && ( roundingMode == float_round_down ) ) 3418 || (roundingMode == float_round_to_odd) 3419 ) { 3420 return 3421 packFloat128( 3422 zSign, 3423 0x7FFE, 3424 LIT64( 0x0000FFFFFFFFFFFF ), 3425 LIT64( 0xFFFFFFFFFFFFFFFF ) 3426 ); 3427 } 3428 return packFloat128( zSign, 0x7FFF, 0, 0 ); 3429 } 3430 if ( zExp < 0 ) { 3431 if (status->flush_to_zero) { 3432 float_raise(float_flag_output_denormal, status); 3433 return packFloat128(zSign, 0, 0, 0); 3434 } 3435 isTiny = 3436 (status->float_detect_tininess 3437 == float_tininess_before_rounding) 3438 || ( zExp < -1 ) 3439 || ! increment 3440 || lt128( 3441 zSig0, 3442 zSig1, 3443 LIT64( 0x0001FFFFFFFFFFFF ), 3444 LIT64( 0xFFFFFFFFFFFFFFFF ) 3445 ); 3446 shift128ExtraRightJamming( 3447 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 3448 zExp = 0; 3449 if (isTiny && zSig2) { 3450 float_raise(float_flag_underflow, status); 3451 } 3452 switch (roundingMode) { 3453 case float_round_nearest_even: 3454 case float_round_ties_away: 3455 increment = ((int64_t)zSig2 < 0); 3456 break; 3457 case float_round_to_zero: 3458 increment = 0; 3459 break; 3460 case float_round_up: 3461 increment = !zSign && zSig2; 3462 break; 3463 case float_round_down: 3464 increment = zSign && zSig2; 3465 break; 3466 case float_round_to_odd: 3467 increment = !(zSig1 & 0x1) && zSig2; 3468 break; 3469 default: 3470 abort(); 3471 } 3472 } 3473 } 3474 if (zSig2) { 3475 status->float_exception_flags |= float_flag_inexact; 3476 } 3477 if ( increment ) { 3478 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 3479 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 3480 } 3481 else { 3482 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 3483 } 3484 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3485 3486 } 3487 3488 /*---------------------------------------------------------------------------- 3489 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3490 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3491 | returns the proper quadruple-precision floating-point value corresponding 3492 | to the abstract input. This routine is just like `roundAndPackFloat128' 3493 | except that the input significand has fewer bits and does not have to be 3494 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 3495 | point exponent. 3496 *----------------------------------------------------------------------------*/ 3497 3498 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3499 uint64_t zSig0, uint64_t zSig1, 3500 float_status *status) 3501 { 3502 int8_t shiftCount; 3503 uint64_t zSig2; 3504 3505 if ( zSig0 == 0 ) { 3506 zSig0 = zSig1; 3507 zSig1 = 0; 3508 zExp -= 64; 3509 } 3510 shiftCount = clz64(zSig0) - 15; 3511 if ( 0 <= shiftCount ) { 3512 zSig2 = 0; 3513 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3514 } 3515 else { 3516 shift128ExtraRightJamming( 3517 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3518 } 3519 zExp -= shiftCount; 3520 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3521 3522 } 3523 3524 3525 /*---------------------------------------------------------------------------- 3526 | Returns the result of converting the 32-bit two's complement integer `a' 3527 | to the extended double-precision floating-point format. The conversion 3528 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3529 | Arithmetic. 3530 *----------------------------------------------------------------------------*/ 3531 3532 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3533 { 3534 flag zSign; 3535 uint32_t absA; 3536 int8_t shiftCount; 3537 uint64_t zSig; 3538 3539 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3540 zSign = ( a < 0 ); 3541 absA = zSign ? - a : a; 3542 shiftCount = clz32(absA) + 32; 3543 zSig = absA; 3544 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3545 3546 } 3547 3548 /*---------------------------------------------------------------------------- 3549 | Returns the result of converting the 32-bit two's complement integer `a' to 3550 | the quadruple-precision floating-point format. The conversion is performed 3551 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3552 *----------------------------------------------------------------------------*/ 3553 3554 float128 int32_to_float128(int32_t a, float_status *status) 3555 { 3556 flag zSign; 3557 uint32_t absA; 3558 int8_t shiftCount; 3559 uint64_t zSig0; 3560 3561 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3562 zSign = ( a < 0 ); 3563 absA = zSign ? - a : a; 3564 shiftCount = clz32(absA) + 17; 3565 zSig0 = absA; 3566 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3567 3568 } 3569 3570 /*---------------------------------------------------------------------------- 3571 | Returns the result of converting the 64-bit two's complement integer `a' 3572 | to the extended double-precision floating-point format. The conversion 3573 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3574 | Arithmetic. 3575 *----------------------------------------------------------------------------*/ 3576 3577 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3578 { 3579 flag zSign; 3580 uint64_t absA; 3581 int8_t shiftCount; 3582 3583 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3584 zSign = ( a < 0 ); 3585 absA = zSign ? - a : a; 3586 shiftCount = clz64(absA); 3587 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3588 3589 } 3590 3591 /*---------------------------------------------------------------------------- 3592 | Returns the result of converting the 64-bit two's complement integer `a' to 3593 | the quadruple-precision floating-point format. The conversion is performed 3594 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3595 *----------------------------------------------------------------------------*/ 3596 3597 float128 int64_to_float128(int64_t a, float_status *status) 3598 { 3599 flag zSign; 3600 uint64_t absA; 3601 int8_t shiftCount; 3602 int32_t zExp; 3603 uint64_t zSig0, zSig1; 3604 3605 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3606 zSign = ( a < 0 ); 3607 absA = zSign ? - a : a; 3608 shiftCount = clz64(absA) + 49; 3609 zExp = 0x406E - shiftCount; 3610 if ( 64 <= shiftCount ) { 3611 zSig1 = 0; 3612 zSig0 = absA; 3613 shiftCount -= 64; 3614 } 3615 else { 3616 zSig1 = absA; 3617 zSig0 = 0; 3618 } 3619 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3620 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3621 3622 } 3623 3624 /*---------------------------------------------------------------------------- 3625 | Returns the result of converting the 64-bit unsigned integer `a' 3626 | to the quadruple-precision floating-point format. The conversion is performed 3627 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3628 *----------------------------------------------------------------------------*/ 3629 3630 float128 uint64_to_float128(uint64_t a, float_status *status) 3631 { 3632 if (a == 0) { 3633 return float128_zero; 3634 } 3635 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 3636 } 3637 3638 /*---------------------------------------------------------------------------- 3639 | Returns the result of converting the single-precision floating-point value 3640 | `a' to the extended double-precision floating-point format. The conversion 3641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3642 | Arithmetic. 3643 *----------------------------------------------------------------------------*/ 3644 3645 floatx80 float32_to_floatx80(float32 a, float_status *status) 3646 { 3647 flag aSign; 3648 int aExp; 3649 uint32_t aSig; 3650 3651 a = float32_squash_input_denormal(a, status); 3652 aSig = extractFloat32Frac( a ); 3653 aExp = extractFloat32Exp( a ); 3654 aSign = extractFloat32Sign( a ); 3655 if ( aExp == 0xFF ) { 3656 if (aSig) { 3657 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3658 } 3659 return packFloatx80(aSign, 3660 floatx80_infinity_high, 3661 floatx80_infinity_low); 3662 } 3663 if ( aExp == 0 ) { 3664 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3665 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3666 } 3667 aSig |= 0x00800000; 3668 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3669 3670 } 3671 3672 /*---------------------------------------------------------------------------- 3673 | Returns the result of converting the single-precision floating-point value 3674 | `a' to the double-precision floating-point format. The conversion is 3675 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3676 | Arithmetic. 3677 *----------------------------------------------------------------------------*/ 3678 3679 float128 float32_to_float128(float32 a, float_status *status) 3680 { 3681 flag aSign; 3682 int aExp; 3683 uint32_t aSig; 3684 3685 a = float32_squash_input_denormal(a, status); 3686 aSig = extractFloat32Frac( a ); 3687 aExp = extractFloat32Exp( a ); 3688 aSign = extractFloat32Sign( a ); 3689 if ( aExp == 0xFF ) { 3690 if (aSig) { 3691 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3692 } 3693 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3694 } 3695 if ( aExp == 0 ) { 3696 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3697 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3698 --aExp; 3699 } 3700 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3701 3702 } 3703 3704 /*---------------------------------------------------------------------------- 3705 | Returns the remainder of the single-precision floating-point value `a' 3706 | with respect to the corresponding value `b'. The operation is performed 3707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3708 *----------------------------------------------------------------------------*/ 3709 3710 float32 float32_rem(float32 a, float32 b, float_status *status) 3711 { 3712 flag aSign, zSign; 3713 int aExp, bExp, expDiff; 3714 uint32_t aSig, bSig; 3715 uint32_t q; 3716 uint64_t aSig64, bSig64, q64; 3717 uint32_t alternateASig; 3718 int32_t sigMean; 3719 a = float32_squash_input_denormal(a, status); 3720 b = float32_squash_input_denormal(b, status); 3721 3722 aSig = extractFloat32Frac( a ); 3723 aExp = extractFloat32Exp( a ); 3724 aSign = extractFloat32Sign( a ); 3725 bSig = extractFloat32Frac( b ); 3726 bExp = extractFloat32Exp( b ); 3727 if ( aExp == 0xFF ) { 3728 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3729 return propagateFloat32NaN(a, b, status); 3730 } 3731 float_raise(float_flag_invalid, status); 3732 return float32_default_nan(status); 3733 } 3734 if ( bExp == 0xFF ) { 3735 if (bSig) { 3736 return propagateFloat32NaN(a, b, status); 3737 } 3738 return a; 3739 } 3740 if ( bExp == 0 ) { 3741 if ( bSig == 0 ) { 3742 float_raise(float_flag_invalid, status); 3743 return float32_default_nan(status); 3744 } 3745 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3746 } 3747 if ( aExp == 0 ) { 3748 if ( aSig == 0 ) return a; 3749 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3750 } 3751 expDiff = aExp - bExp; 3752 aSig |= 0x00800000; 3753 bSig |= 0x00800000; 3754 if ( expDiff < 32 ) { 3755 aSig <<= 8; 3756 bSig <<= 8; 3757 if ( expDiff < 0 ) { 3758 if ( expDiff < -1 ) return a; 3759 aSig >>= 1; 3760 } 3761 q = ( bSig <= aSig ); 3762 if ( q ) aSig -= bSig; 3763 if ( 0 < expDiff ) { 3764 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3765 q >>= 32 - expDiff; 3766 bSig >>= 2; 3767 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3768 } 3769 else { 3770 aSig >>= 2; 3771 bSig >>= 2; 3772 } 3773 } 3774 else { 3775 if ( bSig <= aSig ) aSig -= bSig; 3776 aSig64 = ( (uint64_t) aSig )<<40; 3777 bSig64 = ( (uint64_t) bSig )<<40; 3778 expDiff -= 64; 3779 while ( 0 < expDiff ) { 3780 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3781 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3782 aSig64 = - ( ( bSig * q64 )<<38 ); 3783 expDiff -= 62; 3784 } 3785 expDiff += 64; 3786 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3787 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3788 q = q64>>( 64 - expDiff ); 3789 bSig <<= 6; 3790 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3791 } 3792 do { 3793 alternateASig = aSig; 3794 ++q; 3795 aSig -= bSig; 3796 } while ( 0 <= (int32_t) aSig ); 3797 sigMean = aSig + alternateASig; 3798 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3799 aSig = alternateASig; 3800 } 3801 zSign = ( (int32_t) aSig < 0 ); 3802 if ( zSign ) aSig = - aSig; 3803 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3804 } 3805 3806 3807 3808 /*---------------------------------------------------------------------------- 3809 | Returns the binary exponential of the single-precision floating-point value 3810 | `a'. The operation is performed according to the IEC/IEEE Standard for 3811 | Binary Floating-Point Arithmetic. 3812 | 3813 | Uses the following identities: 3814 | 3815 | 1. ------------------------------------------------------------------------- 3816 | x x*ln(2) 3817 | 2 = e 3818 | 3819 | 2. ------------------------------------------------------------------------- 3820 | 2 3 4 5 n 3821 | x x x x x x x 3822 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3823 | 1! 2! 3! 4! 5! n! 3824 *----------------------------------------------------------------------------*/ 3825 3826 static const float64 float32_exp2_coefficients[15] = 3827 { 3828 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3829 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3830 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3831 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3832 const_float64( 0x3f81111111111111ll ), /* 5 */ 3833 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3834 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3835 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3836 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3837 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3838 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3839 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3840 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3841 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3842 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3843 }; 3844 3845 float32 float32_exp2(float32 a, float_status *status) 3846 { 3847 flag aSign; 3848 int aExp; 3849 uint32_t aSig; 3850 float64 r, x, xn; 3851 int i; 3852 a = float32_squash_input_denormal(a, status); 3853 3854 aSig = extractFloat32Frac( a ); 3855 aExp = extractFloat32Exp( a ); 3856 aSign = extractFloat32Sign( a ); 3857 3858 if ( aExp == 0xFF) { 3859 if (aSig) { 3860 return propagateFloat32NaN(a, float32_zero, status); 3861 } 3862 return (aSign) ? float32_zero : a; 3863 } 3864 if (aExp == 0) { 3865 if (aSig == 0) return float32_one; 3866 } 3867 3868 float_raise(float_flag_inexact, status); 3869 3870 /* ******************************* */ 3871 /* using float64 for approximation */ 3872 /* ******************************* */ 3873 x = float32_to_float64(a, status); 3874 x = float64_mul(x, float64_ln2, status); 3875 3876 xn = x; 3877 r = float64_one; 3878 for (i = 0 ; i < 15 ; i++) { 3879 float64 f; 3880 3881 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3882 r = float64_add(r, f, status); 3883 3884 xn = float64_mul(xn, x, status); 3885 } 3886 3887 return float64_to_float32(r, status); 3888 } 3889 3890 /*---------------------------------------------------------------------------- 3891 | Returns the binary log of the single-precision floating-point value `a'. 3892 | The operation is performed according to the IEC/IEEE Standard for Binary 3893 | Floating-Point Arithmetic. 3894 *----------------------------------------------------------------------------*/ 3895 float32 float32_log2(float32 a, float_status *status) 3896 { 3897 flag aSign, zSign; 3898 int aExp; 3899 uint32_t aSig, zSig, i; 3900 3901 a = float32_squash_input_denormal(a, status); 3902 aSig = extractFloat32Frac( a ); 3903 aExp = extractFloat32Exp( a ); 3904 aSign = extractFloat32Sign( a ); 3905 3906 if ( aExp == 0 ) { 3907 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3908 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3909 } 3910 if ( aSign ) { 3911 float_raise(float_flag_invalid, status); 3912 return float32_default_nan(status); 3913 } 3914 if ( aExp == 0xFF ) { 3915 if (aSig) { 3916 return propagateFloat32NaN(a, float32_zero, status); 3917 } 3918 return a; 3919 } 3920 3921 aExp -= 0x7F; 3922 aSig |= 0x00800000; 3923 zSign = aExp < 0; 3924 zSig = aExp << 23; 3925 3926 for (i = 1 << 22; i > 0; i >>= 1) { 3927 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3928 if ( aSig & 0x01000000 ) { 3929 aSig >>= 1; 3930 zSig |= i; 3931 } 3932 } 3933 3934 if ( zSign ) 3935 zSig = -zSig; 3936 3937 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3938 } 3939 3940 /*---------------------------------------------------------------------------- 3941 | Returns 1 if the single-precision floating-point value `a' is equal to 3942 | the corresponding value `b', and 0 otherwise. The invalid exception is 3943 | raised if either operand is a NaN. Otherwise, the comparison is performed 3944 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3945 *----------------------------------------------------------------------------*/ 3946 3947 int float32_eq(float32 a, float32 b, float_status *status) 3948 { 3949 uint32_t av, bv; 3950 a = float32_squash_input_denormal(a, status); 3951 b = float32_squash_input_denormal(b, status); 3952 3953 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3954 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3955 ) { 3956 float_raise(float_flag_invalid, status); 3957 return 0; 3958 } 3959 av = float32_val(a); 3960 bv = float32_val(b); 3961 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3962 } 3963 3964 /*---------------------------------------------------------------------------- 3965 | Returns 1 if the single-precision floating-point value `a' is less than 3966 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3967 | exception is raised if either operand is a NaN. The comparison is performed 3968 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3969 *----------------------------------------------------------------------------*/ 3970 3971 int float32_le(float32 a, float32 b, float_status *status) 3972 { 3973 flag aSign, bSign; 3974 uint32_t av, bv; 3975 a = float32_squash_input_denormal(a, status); 3976 b = float32_squash_input_denormal(b, status); 3977 3978 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3979 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3980 ) { 3981 float_raise(float_flag_invalid, status); 3982 return 0; 3983 } 3984 aSign = extractFloat32Sign( a ); 3985 bSign = extractFloat32Sign( b ); 3986 av = float32_val(a); 3987 bv = float32_val(b); 3988 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3989 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3990 3991 } 3992 3993 /*---------------------------------------------------------------------------- 3994 | Returns 1 if the single-precision floating-point value `a' is less than 3995 | the corresponding value `b', and 0 otherwise. The invalid exception is 3996 | raised if either operand is a NaN. The comparison is performed according 3997 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3998 *----------------------------------------------------------------------------*/ 3999 4000 int float32_lt(float32 a, float32 b, float_status *status) 4001 { 4002 flag aSign, bSign; 4003 uint32_t av, bv; 4004 a = float32_squash_input_denormal(a, status); 4005 b = float32_squash_input_denormal(b, status); 4006 4007 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4008 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4009 ) { 4010 float_raise(float_flag_invalid, status); 4011 return 0; 4012 } 4013 aSign = extractFloat32Sign( a ); 4014 bSign = extractFloat32Sign( b ); 4015 av = float32_val(a); 4016 bv = float32_val(b); 4017 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4018 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4019 4020 } 4021 4022 /*---------------------------------------------------------------------------- 4023 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4024 | be compared, and 0 otherwise. The invalid exception is raised if either 4025 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4026 | Standard for Binary Floating-Point Arithmetic. 4027 *----------------------------------------------------------------------------*/ 4028 4029 int float32_unordered(float32 a, float32 b, float_status *status) 4030 { 4031 a = float32_squash_input_denormal(a, status); 4032 b = float32_squash_input_denormal(b, status); 4033 4034 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4035 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4036 ) { 4037 float_raise(float_flag_invalid, status); 4038 return 1; 4039 } 4040 return 0; 4041 } 4042 4043 /*---------------------------------------------------------------------------- 4044 | Returns 1 if the single-precision floating-point value `a' is equal to 4045 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4046 | exception. The comparison is performed according to the IEC/IEEE Standard 4047 | for Binary Floating-Point Arithmetic. 4048 *----------------------------------------------------------------------------*/ 4049 4050 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4051 { 4052 a = float32_squash_input_denormal(a, status); 4053 b = float32_squash_input_denormal(b, status); 4054 4055 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4056 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4057 ) { 4058 if (float32_is_signaling_nan(a, status) 4059 || float32_is_signaling_nan(b, status)) { 4060 float_raise(float_flag_invalid, status); 4061 } 4062 return 0; 4063 } 4064 return ( float32_val(a) == float32_val(b) ) || 4065 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4066 } 4067 4068 /*---------------------------------------------------------------------------- 4069 | Returns 1 if the single-precision floating-point value `a' is less than or 4070 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4071 | cause an exception. Otherwise, the comparison is performed according to the 4072 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4073 *----------------------------------------------------------------------------*/ 4074 4075 int float32_le_quiet(float32 a, float32 b, float_status *status) 4076 { 4077 flag aSign, bSign; 4078 uint32_t av, bv; 4079 a = float32_squash_input_denormal(a, status); 4080 b = float32_squash_input_denormal(b, status); 4081 4082 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4083 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4084 ) { 4085 if (float32_is_signaling_nan(a, status) 4086 || float32_is_signaling_nan(b, status)) { 4087 float_raise(float_flag_invalid, status); 4088 } 4089 return 0; 4090 } 4091 aSign = extractFloat32Sign( a ); 4092 bSign = extractFloat32Sign( b ); 4093 av = float32_val(a); 4094 bv = float32_val(b); 4095 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4096 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4097 4098 } 4099 4100 /*---------------------------------------------------------------------------- 4101 | Returns 1 if the single-precision floating-point value `a' is less than 4102 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4103 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4104 | Standard for Binary Floating-Point Arithmetic. 4105 *----------------------------------------------------------------------------*/ 4106 4107 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4108 { 4109 flag aSign, bSign; 4110 uint32_t av, bv; 4111 a = float32_squash_input_denormal(a, status); 4112 b = float32_squash_input_denormal(b, status); 4113 4114 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4115 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4116 ) { 4117 if (float32_is_signaling_nan(a, status) 4118 || float32_is_signaling_nan(b, status)) { 4119 float_raise(float_flag_invalid, status); 4120 } 4121 return 0; 4122 } 4123 aSign = extractFloat32Sign( a ); 4124 bSign = extractFloat32Sign( b ); 4125 av = float32_val(a); 4126 bv = float32_val(b); 4127 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4128 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4129 4130 } 4131 4132 /*---------------------------------------------------------------------------- 4133 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4134 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4135 | comparison is performed according to the IEC/IEEE Standard for Binary 4136 | Floating-Point Arithmetic. 4137 *----------------------------------------------------------------------------*/ 4138 4139 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4140 { 4141 a = float32_squash_input_denormal(a, status); 4142 b = float32_squash_input_denormal(b, status); 4143 4144 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4145 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4146 ) { 4147 if (float32_is_signaling_nan(a, status) 4148 || float32_is_signaling_nan(b, status)) { 4149 float_raise(float_flag_invalid, status); 4150 } 4151 return 1; 4152 } 4153 return 0; 4154 } 4155 4156 /*---------------------------------------------------------------------------- 4157 | If `a' is denormal and we are in flush-to-zero mode then set the 4158 | input-denormal exception and return zero. Otherwise just return the value. 4159 *----------------------------------------------------------------------------*/ 4160 float16 float16_squash_input_denormal(float16 a, float_status *status) 4161 { 4162 if (status->flush_inputs_to_zero) { 4163 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4164 float_raise(float_flag_input_denormal, status); 4165 return make_float16(float16_val(a) & 0x8000); 4166 } 4167 } 4168 return a; 4169 } 4170 4171 /*---------------------------------------------------------------------------- 4172 | Returns the result of converting the double-precision floating-point value 4173 | `a' to the extended double-precision floating-point format. The conversion 4174 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4175 | Arithmetic. 4176 *----------------------------------------------------------------------------*/ 4177 4178 floatx80 float64_to_floatx80(float64 a, float_status *status) 4179 { 4180 flag aSign; 4181 int aExp; 4182 uint64_t aSig; 4183 4184 a = float64_squash_input_denormal(a, status); 4185 aSig = extractFloat64Frac( a ); 4186 aExp = extractFloat64Exp( a ); 4187 aSign = extractFloat64Sign( a ); 4188 if ( aExp == 0x7FF ) { 4189 if (aSig) { 4190 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4191 } 4192 return packFloatx80(aSign, 4193 floatx80_infinity_high, 4194 floatx80_infinity_low); 4195 } 4196 if ( aExp == 0 ) { 4197 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4198 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4199 } 4200 return 4201 packFloatx80( 4202 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4203 4204 } 4205 4206 /*---------------------------------------------------------------------------- 4207 | Returns the result of converting the double-precision floating-point value 4208 | `a' to the quadruple-precision floating-point format. The conversion is 4209 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4210 | Arithmetic. 4211 *----------------------------------------------------------------------------*/ 4212 4213 float128 float64_to_float128(float64 a, float_status *status) 4214 { 4215 flag aSign; 4216 int aExp; 4217 uint64_t aSig, zSig0, zSig1; 4218 4219 a = float64_squash_input_denormal(a, status); 4220 aSig = extractFloat64Frac( a ); 4221 aExp = extractFloat64Exp( a ); 4222 aSign = extractFloat64Sign( a ); 4223 if ( aExp == 0x7FF ) { 4224 if (aSig) { 4225 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4226 } 4227 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4228 } 4229 if ( aExp == 0 ) { 4230 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4231 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4232 --aExp; 4233 } 4234 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4235 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4236 4237 } 4238 4239 4240 /*---------------------------------------------------------------------------- 4241 | Returns the remainder of the double-precision floating-point value `a' 4242 | with respect to the corresponding value `b'. The operation is performed 4243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4244 *----------------------------------------------------------------------------*/ 4245 4246 float64 float64_rem(float64 a, float64 b, float_status *status) 4247 { 4248 flag aSign, zSign; 4249 int aExp, bExp, expDiff; 4250 uint64_t aSig, bSig; 4251 uint64_t q, alternateASig; 4252 int64_t sigMean; 4253 4254 a = float64_squash_input_denormal(a, status); 4255 b = float64_squash_input_denormal(b, status); 4256 aSig = extractFloat64Frac( a ); 4257 aExp = extractFloat64Exp( a ); 4258 aSign = extractFloat64Sign( a ); 4259 bSig = extractFloat64Frac( b ); 4260 bExp = extractFloat64Exp( b ); 4261 if ( aExp == 0x7FF ) { 4262 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4263 return propagateFloat64NaN(a, b, status); 4264 } 4265 float_raise(float_flag_invalid, status); 4266 return float64_default_nan(status); 4267 } 4268 if ( bExp == 0x7FF ) { 4269 if (bSig) { 4270 return propagateFloat64NaN(a, b, status); 4271 } 4272 return a; 4273 } 4274 if ( bExp == 0 ) { 4275 if ( bSig == 0 ) { 4276 float_raise(float_flag_invalid, status); 4277 return float64_default_nan(status); 4278 } 4279 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4280 } 4281 if ( aExp == 0 ) { 4282 if ( aSig == 0 ) return a; 4283 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4284 } 4285 expDiff = aExp - bExp; 4286 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4287 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4288 if ( expDiff < 0 ) { 4289 if ( expDiff < -1 ) return a; 4290 aSig >>= 1; 4291 } 4292 q = ( bSig <= aSig ); 4293 if ( q ) aSig -= bSig; 4294 expDiff -= 64; 4295 while ( 0 < expDiff ) { 4296 q = estimateDiv128To64( aSig, 0, bSig ); 4297 q = ( 2 < q ) ? q - 2 : 0; 4298 aSig = - ( ( bSig>>2 ) * q ); 4299 expDiff -= 62; 4300 } 4301 expDiff += 64; 4302 if ( 0 < expDiff ) { 4303 q = estimateDiv128To64( aSig, 0, bSig ); 4304 q = ( 2 < q ) ? q - 2 : 0; 4305 q >>= 64 - expDiff; 4306 bSig >>= 2; 4307 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4308 } 4309 else { 4310 aSig >>= 2; 4311 bSig >>= 2; 4312 } 4313 do { 4314 alternateASig = aSig; 4315 ++q; 4316 aSig -= bSig; 4317 } while ( 0 <= (int64_t) aSig ); 4318 sigMean = aSig + alternateASig; 4319 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4320 aSig = alternateASig; 4321 } 4322 zSign = ( (int64_t) aSig < 0 ); 4323 if ( zSign ) aSig = - aSig; 4324 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4325 4326 } 4327 4328 /*---------------------------------------------------------------------------- 4329 | Returns the binary log of the double-precision floating-point value `a'. 4330 | The operation is performed according to the IEC/IEEE Standard for Binary 4331 | Floating-Point Arithmetic. 4332 *----------------------------------------------------------------------------*/ 4333 float64 float64_log2(float64 a, float_status *status) 4334 { 4335 flag aSign, zSign; 4336 int aExp; 4337 uint64_t aSig, aSig0, aSig1, zSig, i; 4338 a = float64_squash_input_denormal(a, status); 4339 4340 aSig = extractFloat64Frac( a ); 4341 aExp = extractFloat64Exp( a ); 4342 aSign = extractFloat64Sign( a ); 4343 4344 if ( aExp == 0 ) { 4345 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4346 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4347 } 4348 if ( aSign ) { 4349 float_raise(float_flag_invalid, status); 4350 return float64_default_nan(status); 4351 } 4352 if ( aExp == 0x7FF ) { 4353 if (aSig) { 4354 return propagateFloat64NaN(a, float64_zero, status); 4355 } 4356 return a; 4357 } 4358 4359 aExp -= 0x3FF; 4360 aSig |= LIT64( 0x0010000000000000 ); 4361 zSign = aExp < 0; 4362 zSig = (uint64_t)aExp << 52; 4363 for (i = 1LL << 51; i > 0; i >>= 1) { 4364 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4365 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4366 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4367 aSig >>= 1; 4368 zSig |= i; 4369 } 4370 } 4371 4372 if ( zSign ) 4373 zSig = -zSig; 4374 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4375 } 4376 4377 /*---------------------------------------------------------------------------- 4378 | Returns 1 if the double-precision floating-point value `a' is equal to the 4379 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4380 | if either operand is a NaN. Otherwise, the comparison is performed 4381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4382 *----------------------------------------------------------------------------*/ 4383 4384 int float64_eq(float64 a, float64 b, float_status *status) 4385 { 4386 uint64_t av, bv; 4387 a = float64_squash_input_denormal(a, status); 4388 b = float64_squash_input_denormal(b, status); 4389 4390 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4391 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4392 ) { 4393 float_raise(float_flag_invalid, status); 4394 return 0; 4395 } 4396 av = float64_val(a); 4397 bv = float64_val(b); 4398 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4399 4400 } 4401 4402 /*---------------------------------------------------------------------------- 4403 | Returns 1 if the double-precision floating-point value `a' is less than or 4404 | equal to the corresponding value `b', and 0 otherwise. The invalid 4405 | exception is raised if either operand is a NaN. The comparison is performed 4406 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4407 *----------------------------------------------------------------------------*/ 4408 4409 int float64_le(float64 a, float64 b, float_status *status) 4410 { 4411 flag aSign, bSign; 4412 uint64_t av, bv; 4413 a = float64_squash_input_denormal(a, status); 4414 b = float64_squash_input_denormal(b, status); 4415 4416 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4417 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4418 ) { 4419 float_raise(float_flag_invalid, status); 4420 return 0; 4421 } 4422 aSign = extractFloat64Sign( a ); 4423 bSign = extractFloat64Sign( b ); 4424 av = float64_val(a); 4425 bv = float64_val(b); 4426 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4427 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4428 4429 } 4430 4431 /*---------------------------------------------------------------------------- 4432 | Returns 1 if the double-precision floating-point value `a' is less than 4433 | the corresponding value `b', and 0 otherwise. The invalid exception is 4434 | raised if either operand is a NaN. The comparison is performed according 4435 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4436 *----------------------------------------------------------------------------*/ 4437 4438 int float64_lt(float64 a, float64 b, float_status *status) 4439 { 4440 flag aSign, bSign; 4441 uint64_t av, bv; 4442 4443 a = float64_squash_input_denormal(a, status); 4444 b = float64_squash_input_denormal(b, status); 4445 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4446 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4447 ) { 4448 float_raise(float_flag_invalid, status); 4449 return 0; 4450 } 4451 aSign = extractFloat64Sign( a ); 4452 bSign = extractFloat64Sign( b ); 4453 av = float64_val(a); 4454 bv = float64_val(b); 4455 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4456 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4457 4458 } 4459 4460 /*---------------------------------------------------------------------------- 4461 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4462 | be compared, and 0 otherwise. The invalid exception is raised if either 4463 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4464 | Standard for Binary Floating-Point Arithmetic. 4465 *----------------------------------------------------------------------------*/ 4466 4467 int float64_unordered(float64 a, float64 b, float_status *status) 4468 { 4469 a = float64_squash_input_denormal(a, status); 4470 b = float64_squash_input_denormal(b, status); 4471 4472 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4473 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4474 ) { 4475 float_raise(float_flag_invalid, status); 4476 return 1; 4477 } 4478 return 0; 4479 } 4480 4481 /*---------------------------------------------------------------------------- 4482 | Returns 1 if the double-precision floating-point value `a' is equal to the 4483 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4484 | exception.The comparison is performed according to the IEC/IEEE Standard 4485 | for Binary Floating-Point Arithmetic. 4486 *----------------------------------------------------------------------------*/ 4487 4488 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4489 { 4490 uint64_t av, bv; 4491 a = float64_squash_input_denormal(a, status); 4492 b = float64_squash_input_denormal(b, status); 4493 4494 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4495 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4496 ) { 4497 if (float64_is_signaling_nan(a, status) 4498 || float64_is_signaling_nan(b, status)) { 4499 float_raise(float_flag_invalid, status); 4500 } 4501 return 0; 4502 } 4503 av = float64_val(a); 4504 bv = float64_val(b); 4505 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4506 4507 } 4508 4509 /*---------------------------------------------------------------------------- 4510 | Returns 1 if the double-precision floating-point value `a' is less than or 4511 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4512 | cause an exception. Otherwise, the comparison is performed according to the 4513 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4514 *----------------------------------------------------------------------------*/ 4515 4516 int float64_le_quiet(float64 a, float64 b, float_status *status) 4517 { 4518 flag aSign, bSign; 4519 uint64_t av, bv; 4520 a = float64_squash_input_denormal(a, status); 4521 b = float64_squash_input_denormal(b, status); 4522 4523 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4524 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4525 ) { 4526 if (float64_is_signaling_nan(a, status) 4527 || float64_is_signaling_nan(b, status)) { 4528 float_raise(float_flag_invalid, status); 4529 } 4530 return 0; 4531 } 4532 aSign = extractFloat64Sign( a ); 4533 bSign = extractFloat64Sign( b ); 4534 av = float64_val(a); 4535 bv = float64_val(b); 4536 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4537 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4538 4539 } 4540 4541 /*---------------------------------------------------------------------------- 4542 | Returns 1 if the double-precision floating-point value `a' is less than 4543 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4544 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4545 | Standard for Binary Floating-Point Arithmetic. 4546 *----------------------------------------------------------------------------*/ 4547 4548 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4549 { 4550 flag aSign, bSign; 4551 uint64_t av, bv; 4552 a = float64_squash_input_denormal(a, status); 4553 b = float64_squash_input_denormal(b, status); 4554 4555 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4556 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4557 ) { 4558 if (float64_is_signaling_nan(a, status) 4559 || float64_is_signaling_nan(b, status)) { 4560 float_raise(float_flag_invalid, status); 4561 } 4562 return 0; 4563 } 4564 aSign = extractFloat64Sign( a ); 4565 bSign = extractFloat64Sign( b ); 4566 av = float64_val(a); 4567 bv = float64_val(b); 4568 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4569 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4570 4571 } 4572 4573 /*---------------------------------------------------------------------------- 4574 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4575 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4576 | comparison is performed according to the IEC/IEEE Standard for Binary 4577 | Floating-Point Arithmetic. 4578 *----------------------------------------------------------------------------*/ 4579 4580 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4581 { 4582 a = float64_squash_input_denormal(a, status); 4583 b = float64_squash_input_denormal(b, status); 4584 4585 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4586 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4587 ) { 4588 if (float64_is_signaling_nan(a, status) 4589 || float64_is_signaling_nan(b, status)) { 4590 float_raise(float_flag_invalid, status); 4591 } 4592 return 1; 4593 } 4594 return 0; 4595 } 4596 4597 /*---------------------------------------------------------------------------- 4598 | Returns the result of converting the extended double-precision floating- 4599 | point value `a' to the 32-bit two's complement integer format. The 4600 | conversion is performed according to the IEC/IEEE Standard for Binary 4601 | Floating-Point Arithmetic---which means in particular that the conversion 4602 | is rounded according to the current rounding mode. If `a' is a NaN, the 4603 | largest positive integer is returned. Otherwise, if the conversion 4604 | overflows, the largest integer with the same sign as `a' is returned. 4605 *----------------------------------------------------------------------------*/ 4606 4607 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4608 { 4609 flag aSign; 4610 int32_t aExp, shiftCount; 4611 uint64_t aSig; 4612 4613 if (floatx80_invalid_encoding(a)) { 4614 float_raise(float_flag_invalid, status); 4615 return 1 << 31; 4616 } 4617 aSig = extractFloatx80Frac( a ); 4618 aExp = extractFloatx80Exp( a ); 4619 aSign = extractFloatx80Sign( a ); 4620 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4621 shiftCount = 0x4037 - aExp; 4622 if ( shiftCount <= 0 ) shiftCount = 1; 4623 shift64RightJamming( aSig, shiftCount, &aSig ); 4624 return roundAndPackInt32(aSign, aSig, status); 4625 4626 } 4627 4628 /*---------------------------------------------------------------------------- 4629 | Returns the result of converting the extended double-precision floating- 4630 | point value `a' to the 32-bit two's complement integer format. The 4631 | conversion is performed according to the IEC/IEEE Standard for Binary 4632 | Floating-Point Arithmetic, except that the conversion is always rounded 4633 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4634 | Otherwise, if the conversion overflows, the largest integer with the same 4635 | sign as `a' is returned. 4636 *----------------------------------------------------------------------------*/ 4637 4638 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4639 { 4640 flag aSign; 4641 int32_t aExp, shiftCount; 4642 uint64_t aSig, savedASig; 4643 int32_t z; 4644 4645 if (floatx80_invalid_encoding(a)) { 4646 float_raise(float_flag_invalid, status); 4647 return 1 << 31; 4648 } 4649 aSig = extractFloatx80Frac( a ); 4650 aExp = extractFloatx80Exp( a ); 4651 aSign = extractFloatx80Sign( a ); 4652 if ( 0x401E < aExp ) { 4653 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4654 goto invalid; 4655 } 4656 else if ( aExp < 0x3FFF ) { 4657 if (aExp || aSig) { 4658 status->float_exception_flags |= float_flag_inexact; 4659 } 4660 return 0; 4661 } 4662 shiftCount = 0x403E - aExp; 4663 savedASig = aSig; 4664 aSig >>= shiftCount; 4665 z = aSig; 4666 if ( aSign ) z = - z; 4667 if ( ( z < 0 ) ^ aSign ) { 4668 invalid: 4669 float_raise(float_flag_invalid, status); 4670 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4671 } 4672 if ( ( aSig<<shiftCount ) != savedASig ) { 4673 status->float_exception_flags |= float_flag_inexact; 4674 } 4675 return z; 4676 4677 } 4678 4679 /*---------------------------------------------------------------------------- 4680 | Returns the result of converting the extended double-precision floating- 4681 | point value `a' to the 64-bit two's complement integer format. The 4682 | conversion is performed according to the IEC/IEEE Standard for Binary 4683 | Floating-Point Arithmetic---which means in particular that the conversion 4684 | is rounded according to the current rounding mode. If `a' is a NaN, 4685 | the largest positive integer is returned. Otherwise, if the conversion 4686 | overflows, the largest integer with the same sign as `a' is returned. 4687 *----------------------------------------------------------------------------*/ 4688 4689 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4690 { 4691 flag aSign; 4692 int32_t aExp, shiftCount; 4693 uint64_t aSig, aSigExtra; 4694 4695 if (floatx80_invalid_encoding(a)) { 4696 float_raise(float_flag_invalid, status); 4697 return 1ULL << 63; 4698 } 4699 aSig = extractFloatx80Frac( a ); 4700 aExp = extractFloatx80Exp( a ); 4701 aSign = extractFloatx80Sign( a ); 4702 shiftCount = 0x403E - aExp; 4703 if ( shiftCount <= 0 ) { 4704 if ( shiftCount ) { 4705 float_raise(float_flag_invalid, status); 4706 if (!aSign || floatx80_is_any_nan(a)) { 4707 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4708 } 4709 return (int64_t) LIT64( 0x8000000000000000 ); 4710 } 4711 aSigExtra = 0; 4712 } 4713 else { 4714 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4715 } 4716 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4717 4718 } 4719 4720 /*---------------------------------------------------------------------------- 4721 | Returns the result of converting the extended double-precision floating- 4722 | point value `a' to the 64-bit two's complement integer format. The 4723 | conversion is performed according to the IEC/IEEE Standard for Binary 4724 | Floating-Point Arithmetic, except that the conversion is always rounded 4725 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4726 | Otherwise, if the conversion overflows, the largest integer with the same 4727 | sign as `a' is returned. 4728 *----------------------------------------------------------------------------*/ 4729 4730 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4731 { 4732 flag aSign; 4733 int32_t aExp, shiftCount; 4734 uint64_t aSig; 4735 int64_t z; 4736 4737 if (floatx80_invalid_encoding(a)) { 4738 float_raise(float_flag_invalid, status); 4739 return 1ULL << 63; 4740 } 4741 aSig = extractFloatx80Frac( a ); 4742 aExp = extractFloatx80Exp( a ); 4743 aSign = extractFloatx80Sign( a ); 4744 shiftCount = aExp - 0x403E; 4745 if ( 0 <= shiftCount ) { 4746 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4747 if ( ( a.high != 0xC03E ) || aSig ) { 4748 float_raise(float_flag_invalid, status); 4749 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4750 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4751 } 4752 } 4753 return (int64_t) LIT64( 0x8000000000000000 ); 4754 } 4755 else if ( aExp < 0x3FFF ) { 4756 if (aExp | aSig) { 4757 status->float_exception_flags |= float_flag_inexact; 4758 } 4759 return 0; 4760 } 4761 z = aSig>>( - shiftCount ); 4762 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4763 status->float_exception_flags |= float_flag_inexact; 4764 } 4765 if ( aSign ) z = - z; 4766 return z; 4767 4768 } 4769 4770 /*---------------------------------------------------------------------------- 4771 | Returns the result of converting the extended double-precision floating- 4772 | point value `a' to the single-precision floating-point format. The 4773 | conversion is performed according to the IEC/IEEE Standard for Binary 4774 | Floating-Point Arithmetic. 4775 *----------------------------------------------------------------------------*/ 4776 4777 float32 floatx80_to_float32(floatx80 a, float_status *status) 4778 { 4779 flag aSign; 4780 int32_t aExp; 4781 uint64_t aSig; 4782 4783 if (floatx80_invalid_encoding(a)) { 4784 float_raise(float_flag_invalid, status); 4785 return float32_default_nan(status); 4786 } 4787 aSig = extractFloatx80Frac( a ); 4788 aExp = extractFloatx80Exp( a ); 4789 aSign = extractFloatx80Sign( a ); 4790 if ( aExp == 0x7FFF ) { 4791 if ( (uint64_t) ( aSig<<1 ) ) { 4792 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4793 } 4794 return packFloat32( aSign, 0xFF, 0 ); 4795 } 4796 shift64RightJamming( aSig, 33, &aSig ); 4797 if ( aExp || aSig ) aExp -= 0x3F81; 4798 return roundAndPackFloat32(aSign, aExp, aSig, status); 4799 4800 } 4801 4802 /*---------------------------------------------------------------------------- 4803 | Returns the result of converting the extended double-precision floating- 4804 | point value `a' to the double-precision floating-point format. The 4805 | conversion is performed according to the IEC/IEEE Standard for Binary 4806 | Floating-Point Arithmetic. 4807 *----------------------------------------------------------------------------*/ 4808 4809 float64 floatx80_to_float64(floatx80 a, float_status *status) 4810 { 4811 flag aSign; 4812 int32_t aExp; 4813 uint64_t aSig, zSig; 4814 4815 if (floatx80_invalid_encoding(a)) { 4816 float_raise(float_flag_invalid, status); 4817 return float64_default_nan(status); 4818 } 4819 aSig = extractFloatx80Frac( a ); 4820 aExp = extractFloatx80Exp( a ); 4821 aSign = extractFloatx80Sign( a ); 4822 if ( aExp == 0x7FFF ) { 4823 if ( (uint64_t) ( aSig<<1 ) ) { 4824 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4825 } 4826 return packFloat64( aSign, 0x7FF, 0 ); 4827 } 4828 shift64RightJamming( aSig, 1, &zSig ); 4829 if ( aExp || aSig ) aExp -= 0x3C01; 4830 return roundAndPackFloat64(aSign, aExp, zSig, status); 4831 4832 } 4833 4834 /*---------------------------------------------------------------------------- 4835 | Returns the result of converting the extended double-precision floating- 4836 | point value `a' to the quadruple-precision floating-point format. The 4837 | conversion is performed according to the IEC/IEEE Standard for Binary 4838 | Floating-Point Arithmetic. 4839 *----------------------------------------------------------------------------*/ 4840 4841 float128 floatx80_to_float128(floatx80 a, float_status *status) 4842 { 4843 flag aSign; 4844 int aExp; 4845 uint64_t aSig, zSig0, zSig1; 4846 4847 if (floatx80_invalid_encoding(a)) { 4848 float_raise(float_flag_invalid, status); 4849 return float128_default_nan(status); 4850 } 4851 aSig = extractFloatx80Frac( a ); 4852 aExp = extractFloatx80Exp( a ); 4853 aSign = extractFloatx80Sign( a ); 4854 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4855 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4856 } 4857 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4858 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4859 4860 } 4861 4862 /*---------------------------------------------------------------------------- 4863 | Rounds the extended double-precision floating-point value `a' 4864 | to the precision provided by floatx80_rounding_precision and returns the 4865 | result as an extended double-precision floating-point value. 4866 | The operation is performed according to the IEC/IEEE Standard for Binary 4867 | Floating-Point Arithmetic. 4868 *----------------------------------------------------------------------------*/ 4869 4870 floatx80 floatx80_round(floatx80 a, float_status *status) 4871 { 4872 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4873 extractFloatx80Sign(a), 4874 extractFloatx80Exp(a), 4875 extractFloatx80Frac(a), 0, status); 4876 } 4877 4878 /*---------------------------------------------------------------------------- 4879 | Rounds the extended double-precision floating-point value `a' to an integer, 4880 | and returns the result as an extended quadruple-precision floating-point 4881 | value. The operation is performed according to the IEC/IEEE Standard for 4882 | Binary Floating-Point Arithmetic. 4883 *----------------------------------------------------------------------------*/ 4884 4885 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4886 { 4887 flag aSign; 4888 int32_t aExp; 4889 uint64_t lastBitMask, roundBitsMask; 4890 floatx80 z; 4891 4892 if (floatx80_invalid_encoding(a)) { 4893 float_raise(float_flag_invalid, status); 4894 return floatx80_default_nan(status); 4895 } 4896 aExp = extractFloatx80Exp( a ); 4897 if ( 0x403E <= aExp ) { 4898 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4899 return propagateFloatx80NaN(a, a, status); 4900 } 4901 return a; 4902 } 4903 if ( aExp < 0x3FFF ) { 4904 if ( ( aExp == 0 ) 4905 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4906 return a; 4907 } 4908 status->float_exception_flags |= float_flag_inexact; 4909 aSign = extractFloatx80Sign( a ); 4910 switch (status->float_rounding_mode) { 4911 case float_round_nearest_even: 4912 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4913 ) { 4914 return 4915 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4916 } 4917 break; 4918 case float_round_ties_away: 4919 if (aExp == 0x3FFE) { 4920 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4921 } 4922 break; 4923 case float_round_down: 4924 return 4925 aSign ? 4926 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4927 : packFloatx80( 0, 0, 0 ); 4928 case float_round_up: 4929 return 4930 aSign ? packFloatx80( 1, 0, 0 ) 4931 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4932 } 4933 return packFloatx80( aSign, 0, 0 ); 4934 } 4935 lastBitMask = 1; 4936 lastBitMask <<= 0x403E - aExp; 4937 roundBitsMask = lastBitMask - 1; 4938 z = a; 4939 switch (status->float_rounding_mode) { 4940 case float_round_nearest_even: 4941 z.low += lastBitMask>>1; 4942 if ((z.low & roundBitsMask) == 0) { 4943 z.low &= ~lastBitMask; 4944 } 4945 break; 4946 case float_round_ties_away: 4947 z.low += lastBitMask >> 1; 4948 break; 4949 case float_round_to_zero: 4950 break; 4951 case float_round_up: 4952 if (!extractFloatx80Sign(z)) { 4953 z.low += roundBitsMask; 4954 } 4955 break; 4956 case float_round_down: 4957 if (extractFloatx80Sign(z)) { 4958 z.low += roundBitsMask; 4959 } 4960 break; 4961 default: 4962 abort(); 4963 } 4964 z.low &= ~ roundBitsMask; 4965 if ( z.low == 0 ) { 4966 ++z.high; 4967 z.low = LIT64( 0x8000000000000000 ); 4968 } 4969 if (z.low != a.low) { 4970 status->float_exception_flags |= float_flag_inexact; 4971 } 4972 return z; 4973 4974 } 4975 4976 /*---------------------------------------------------------------------------- 4977 | Returns the result of adding the absolute values of the extended double- 4978 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4979 | negated before being returned. `zSign' is ignored if the result is a NaN. 4980 | The addition is performed according to the IEC/IEEE Standard for Binary 4981 | Floating-Point Arithmetic. 4982 *----------------------------------------------------------------------------*/ 4983 4984 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4985 float_status *status) 4986 { 4987 int32_t aExp, bExp, zExp; 4988 uint64_t aSig, bSig, zSig0, zSig1; 4989 int32_t expDiff; 4990 4991 aSig = extractFloatx80Frac( a ); 4992 aExp = extractFloatx80Exp( a ); 4993 bSig = extractFloatx80Frac( b ); 4994 bExp = extractFloatx80Exp( b ); 4995 expDiff = aExp - bExp; 4996 if ( 0 < expDiff ) { 4997 if ( aExp == 0x7FFF ) { 4998 if ((uint64_t)(aSig << 1)) { 4999 return propagateFloatx80NaN(a, b, status); 5000 } 5001 return a; 5002 } 5003 if ( bExp == 0 ) --expDiff; 5004 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5005 zExp = aExp; 5006 } 5007 else if ( expDiff < 0 ) { 5008 if ( bExp == 0x7FFF ) { 5009 if ((uint64_t)(bSig << 1)) { 5010 return propagateFloatx80NaN(a, b, status); 5011 } 5012 return packFloatx80(zSign, 5013 floatx80_infinity_high, 5014 floatx80_infinity_low); 5015 } 5016 if ( aExp == 0 ) ++expDiff; 5017 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5018 zExp = bExp; 5019 } 5020 else { 5021 if ( aExp == 0x7FFF ) { 5022 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5023 return propagateFloatx80NaN(a, b, status); 5024 } 5025 return a; 5026 } 5027 zSig1 = 0; 5028 zSig0 = aSig + bSig; 5029 if ( aExp == 0 ) { 5030 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5031 goto roundAndPack; 5032 } 5033 zExp = aExp; 5034 goto shiftRight1; 5035 } 5036 zSig0 = aSig + bSig; 5037 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5038 shiftRight1: 5039 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5040 zSig0 |= LIT64( 0x8000000000000000 ); 5041 ++zExp; 5042 roundAndPack: 5043 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5044 zSign, zExp, zSig0, zSig1, status); 5045 } 5046 5047 /*---------------------------------------------------------------------------- 5048 | Returns the result of subtracting the absolute values of the extended 5049 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5050 | difference is negated before being returned. `zSign' is ignored if the 5051 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5052 | Standard for Binary Floating-Point Arithmetic. 5053 *----------------------------------------------------------------------------*/ 5054 5055 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5056 float_status *status) 5057 { 5058 int32_t aExp, bExp, zExp; 5059 uint64_t aSig, bSig, zSig0, zSig1; 5060 int32_t expDiff; 5061 5062 aSig = extractFloatx80Frac( a ); 5063 aExp = extractFloatx80Exp( a ); 5064 bSig = extractFloatx80Frac( b ); 5065 bExp = extractFloatx80Exp( b ); 5066 expDiff = aExp - bExp; 5067 if ( 0 < expDiff ) goto aExpBigger; 5068 if ( expDiff < 0 ) goto bExpBigger; 5069 if ( aExp == 0x7FFF ) { 5070 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5071 return propagateFloatx80NaN(a, b, status); 5072 } 5073 float_raise(float_flag_invalid, status); 5074 return floatx80_default_nan(status); 5075 } 5076 if ( aExp == 0 ) { 5077 aExp = 1; 5078 bExp = 1; 5079 } 5080 zSig1 = 0; 5081 if ( bSig < aSig ) goto aBigger; 5082 if ( aSig < bSig ) goto bBigger; 5083 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5084 bExpBigger: 5085 if ( bExp == 0x7FFF ) { 5086 if ((uint64_t)(bSig << 1)) { 5087 return propagateFloatx80NaN(a, b, status); 5088 } 5089 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5090 floatx80_infinity_low); 5091 } 5092 if ( aExp == 0 ) ++expDiff; 5093 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5094 bBigger: 5095 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5096 zExp = bExp; 5097 zSign ^= 1; 5098 goto normalizeRoundAndPack; 5099 aExpBigger: 5100 if ( aExp == 0x7FFF ) { 5101 if ((uint64_t)(aSig << 1)) { 5102 return propagateFloatx80NaN(a, b, status); 5103 } 5104 return a; 5105 } 5106 if ( bExp == 0 ) --expDiff; 5107 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5108 aBigger: 5109 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5110 zExp = aExp; 5111 normalizeRoundAndPack: 5112 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5113 zSign, zExp, zSig0, zSig1, status); 5114 } 5115 5116 /*---------------------------------------------------------------------------- 5117 | Returns the result of adding the extended double-precision floating-point 5118 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5119 | Standard for Binary Floating-Point Arithmetic. 5120 *----------------------------------------------------------------------------*/ 5121 5122 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5123 { 5124 flag aSign, bSign; 5125 5126 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5127 float_raise(float_flag_invalid, status); 5128 return floatx80_default_nan(status); 5129 } 5130 aSign = extractFloatx80Sign( a ); 5131 bSign = extractFloatx80Sign( b ); 5132 if ( aSign == bSign ) { 5133 return addFloatx80Sigs(a, b, aSign, status); 5134 } 5135 else { 5136 return subFloatx80Sigs(a, b, aSign, status); 5137 } 5138 5139 } 5140 5141 /*---------------------------------------------------------------------------- 5142 | Returns the result of subtracting the extended double-precision floating- 5143 | point values `a' and `b'. The operation is performed according to the 5144 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5145 *----------------------------------------------------------------------------*/ 5146 5147 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5148 { 5149 flag aSign, bSign; 5150 5151 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5152 float_raise(float_flag_invalid, status); 5153 return floatx80_default_nan(status); 5154 } 5155 aSign = extractFloatx80Sign( a ); 5156 bSign = extractFloatx80Sign( b ); 5157 if ( aSign == bSign ) { 5158 return subFloatx80Sigs(a, b, aSign, status); 5159 } 5160 else { 5161 return addFloatx80Sigs(a, b, aSign, status); 5162 } 5163 5164 } 5165 5166 /*---------------------------------------------------------------------------- 5167 | Returns the result of multiplying the extended double-precision floating- 5168 | point values `a' and `b'. The operation is performed according to the 5169 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5170 *----------------------------------------------------------------------------*/ 5171 5172 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5173 { 5174 flag aSign, bSign, zSign; 5175 int32_t aExp, bExp, zExp; 5176 uint64_t aSig, bSig, zSig0, zSig1; 5177 5178 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5179 float_raise(float_flag_invalid, status); 5180 return floatx80_default_nan(status); 5181 } 5182 aSig = extractFloatx80Frac( a ); 5183 aExp = extractFloatx80Exp( a ); 5184 aSign = extractFloatx80Sign( a ); 5185 bSig = extractFloatx80Frac( b ); 5186 bExp = extractFloatx80Exp( b ); 5187 bSign = extractFloatx80Sign( b ); 5188 zSign = aSign ^ bSign; 5189 if ( aExp == 0x7FFF ) { 5190 if ( (uint64_t) ( aSig<<1 ) 5191 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5192 return propagateFloatx80NaN(a, b, status); 5193 } 5194 if ( ( bExp | bSig ) == 0 ) goto invalid; 5195 return packFloatx80(zSign, floatx80_infinity_high, 5196 floatx80_infinity_low); 5197 } 5198 if ( bExp == 0x7FFF ) { 5199 if ((uint64_t)(bSig << 1)) { 5200 return propagateFloatx80NaN(a, b, status); 5201 } 5202 if ( ( aExp | aSig ) == 0 ) { 5203 invalid: 5204 float_raise(float_flag_invalid, status); 5205 return floatx80_default_nan(status); 5206 } 5207 return packFloatx80(zSign, floatx80_infinity_high, 5208 floatx80_infinity_low); 5209 } 5210 if ( aExp == 0 ) { 5211 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5212 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5213 } 5214 if ( bExp == 0 ) { 5215 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5216 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5217 } 5218 zExp = aExp + bExp - 0x3FFE; 5219 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5220 if ( 0 < (int64_t) zSig0 ) { 5221 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5222 --zExp; 5223 } 5224 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5225 zSign, zExp, zSig0, zSig1, status); 5226 } 5227 5228 /*---------------------------------------------------------------------------- 5229 | Returns the result of dividing the extended double-precision floating-point 5230 | value `a' by the corresponding value `b'. The operation is performed 5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5232 *----------------------------------------------------------------------------*/ 5233 5234 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5235 { 5236 flag aSign, bSign, zSign; 5237 int32_t aExp, bExp, zExp; 5238 uint64_t aSig, bSig, zSig0, zSig1; 5239 uint64_t rem0, rem1, rem2, term0, term1, term2; 5240 5241 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5242 float_raise(float_flag_invalid, status); 5243 return floatx80_default_nan(status); 5244 } 5245 aSig = extractFloatx80Frac( a ); 5246 aExp = extractFloatx80Exp( a ); 5247 aSign = extractFloatx80Sign( a ); 5248 bSig = extractFloatx80Frac( b ); 5249 bExp = extractFloatx80Exp( b ); 5250 bSign = extractFloatx80Sign( b ); 5251 zSign = aSign ^ bSign; 5252 if ( aExp == 0x7FFF ) { 5253 if ((uint64_t)(aSig << 1)) { 5254 return propagateFloatx80NaN(a, b, status); 5255 } 5256 if ( bExp == 0x7FFF ) { 5257 if ((uint64_t)(bSig << 1)) { 5258 return propagateFloatx80NaN(a, b, status); 5259 } 5260 goto invalid; 5261 } 5262 return packFloatx80(zSign, floatx80_infinity_high, 5263 floatx80_infinity_low); 5264 } 5265 if ( bExp == 0x7FFF ) { 5266 if ((uint64_t)(bSig << 1)) { 5267 return propagateFloatx80NaN(a, b, status); 5268 } 5269 return packFloatx80( zSign, 0, 0 ); 5270 } 5271 if ( bExp == 0 ) { 5272 if ( bSig == 0 ) { 5273 if ( ( aExp | aSig ) == 0 ) { 5274 invalid: 5275 float_raise(float_flag_invalid, status); 5276 return floatx80_default_nan(status); 5277 } 5278 float_raise(float_flag_divbyzero, status); 5279 return packFloatx80(zSign, floatx80_infinity_high, 5280 floatx80_infinity_low); 5281 } 5282 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5283 } 5284 if ( aExp == 0 ) { 5285 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5286 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5287 } 5288 zExp = aExp - bExp + 0x3FFE; 5289 rem1 = 0; 5290 if ( bSig <= aSig ) { 5291 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5292 ++zExp; 5293 } 5294 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5295 mul64To128( bSig, zSig0, &term0, &term1 ); 5296 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5297 while ( (int64_t) rem0 < 0 ) { 5298 --zSig0; 5299 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5300 } 5301 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5302 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5303 mul64To128( bSig, zSig1, &term1, &term2 ); 5304 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5305 while ( (int64_t) rem1 < 0 ) { 5306 --zSig1; 5307 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5308 } 5309 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5310 } 5311 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5312 zSign, zExp, zSig0, zSig1, status); 5313 } 5314 5315 /*---------------------------------------------------------------------------- 5316 | Returns the remainder of the extended double-precision floating-point value 5317 | `a' with respect to the corresponding value `b'. The operation is performed 5318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5319 *----------------------------------------------------------------------------*/ 5320 5321 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5322 { 5323 flag aSign, zSign; 5324 int32_t aExp, bExp, expDiff; 5325 uint64_t aSig0, aSig1, bSig; 5326 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5327 5328 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5329 float_raise(float_flag_invalid, status); 5330 return floatx80_default_nan(status); 5331 } 5332 aSig0 = extractFloatx80Frac( a ); 5333 aExp = extractFloatx80Exp( a ); 5334 aSign = extractFloatx80Sign( a ); 5335 bSig = extractFloatx80Frac( b ); 5336 bExp = extractFloatx80Exp( b ); 5337 if ( aExp == 0x7FFF ) { 5338 if ( (uint64_t) ( aSig0<<1 ) 5339 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5340 return propagateFloatx80NaN(a, b, status); 5341 } 5342 goto invalid; 5343 } 5344 if ( bExp == 0x7FFF ) { 5345 if ((uint64_t)(bSig << 1)) { 5346 return propagateFloatx80NaN(a, b, status); 5347 } 5348 return a; 5349 } 5350 if ( bExp == 0 ) { 5351 if ( bSig == 0 ) { 5352 invalid: 5353 float_raise(float_flag_invalid, status); 5354 return floatx80_default_nan(status); 5355 } 5356 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5357 } 5358 if ( aExp == 0 ) { 5359 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5360 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5361 } 5362 bSig |= LIT64( 0x8000000000000000 ); 5363 zSign = aSign; 5364 expDiff = aExp - bExp; 5365 aSig1 = 0; 5366 if ( expDiff < 0 ) { 5367 if ( expDiff < -1 ) return a; 5368 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5369 expDiff = 0; 5370 } 5371 q = ( bSig <= aSig0 ); 5372 if ( q ) aSig0 -= bSig; 5373 expDiff -= 64; 5374 while ( 0 < expDiff ) { 5375 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5376 q = ( 2 < q ) ? q - 2 : 0; 5377 mul64To128( bSig, q, &term0, &term1 ); 5378 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5379 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5380 expDiff -= 62; 5381 } 5382 expDiff += 64; 5383 if ( 0 < expDiff ) { 5384 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5385 q = ( 2 < q ) ? q - 2 : 0; 5386 q >>= 64 - expDiff; 5387 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5388 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5389 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5390 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5391 ++q; 5392 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5393 } 5394 } 5395 else { 5396 term1 = 0; 5397 term0 = bSig; 5398 } 5399 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5400 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5401 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5402 && ( q & 1 ) ) 5403 ) { 5404 aSig0 = alternateASig0; 5405 aSig1 = alternateASig1; 5406 zSign = ! zSign; 5407 } 5408 return 5409 normalizeRoundAndPackFloatx80( 5410 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5411 5412 } 5413 5414 /*---------------------------------------------------------------------------- 5415 | Returns the square root of the extended double-precision floating-point 5416 | value `a'. The operation is performed according to the IEC/IEEE Standard 5417 | for Binary Floating-Point Arithmetic. 5418 *----------------------------------------------------------------------------*/ 5419 5420 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5421 { 5422 flag aSign; 5423 int32_t aExp, zExp; 5424 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5425 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5426 5427 if (floatx80_invalid_encoding(a)) { 5428 float_raise(float_flag_invalid, status); 5429 return floatx80_default_nan(status); 5430 } 5431 aSig0 = extractFloatx80Frac( a ); 5432 aExp = extractFloatx80Exp( a ); 5433 aSign = extractFloatx80Sign( a ); 5434 if ( aExp == 0x7FFF ) { 5435 if ((uint64_t)(aSig0 << 1)) { 5436 return propagateFloatx80NaN(a, a, status); 5437 } 5438 if ( ! aSign ) return a; 5439 goto invalid; 5440 } 5441 if ( aSign ) { 5442 if ( ( aExp | aSig0 ) == 0 ) return a; 5443 invalid: 5444 float_raise(float_flag_invalid, status); 5445 return floatx80_default_nan(status); 5446 } 5447 if ( aExp == 0 ) { 5448 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5449 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5450 } 5451 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5452 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5453 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5454 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5455 doubleZSig0 = zSig0<<1; 5456 mul64To128( zSig0, zSig0, &term0, &term1 ); 5457 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5458 while ( (int64_t) rem0 < 0 ) { 5459 --zSig0; 5460 doubleZSig0 -= 2; 5461 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5462 } 5463 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5464 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5465 if ( zSig1 == 0 ) zSig1 = 1; 5466 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5467 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5468 mul64To128( zSig1, zSig1, &term2, &term3 ); 5469 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5470 while ( (int64_t) rem1 < 0 ) { 5471 --zSig1; 5472 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5473 term3 |= 1; 5474 term2 |= doubleZSig0; 5475 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5476 } 5477 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5478 } 5479 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5480 zSig0 |= doubleZSig0; 5481 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5482 0, zExp, zSig0, zSig1, status); 5483 } 5484 5485 /*---------------------------------------------------------------------------- 5486 | Returns 1 if the extended double-precision floating-point value `a' is equal 5487 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5488 | raised if either operand is a NaN. Otherwise, the comparison is performed 5489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5490 *----------------------------------------------------------------------------*/ 5491 5492 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5493 { 5494 5495 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5496 || (extractFloatx80Exp(a) == 0x7FFF 5497 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5498 || (extractFloatx80Exp(b) == 0x7FFF 5499 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5500 ) { 5501 float_raise(float_flag_invalid, status); 5502 return 0; 5503 } 5504 return 5505 ( a.low == b.low ) 5506 && ( ( a.high == b.high ) 5507 || ( ( a.low == 0 ) 5508 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5509 ); 5510 5511 } 5512 5513 /*---------------------------------------------------------------------------- 5514 | Returns 1 if the extended double-precision floating-point value `a' is 5515 | less than or equal to the corresponding value `b', and 0 otherwise. The 5516 | invalid exception is raised if either operand is a NaN. The comparison is 5517 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5518 | Arithmetic. 5519 *----------------------------------------------------------------------------*/ 5520 5521 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5522 { 5523 flag aSign, bSign; 5524 5525 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5526 || (extractFloatx80Exp(a) == 0x7FFF 5527 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5528 || (extractFloatx80Exp(b) == 0x7FFF 5529 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5530 ) { 5531 float_raise(float_flag_invalid, status); 5532 return 0; 5533 } 5534 aSign = extractFloatx80Sign( a ); 5535 bSign = extractFloatx80Sign( b ); 5536 if ( aSign != bSign ) { 5537 return 5538 aSign 5539 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5540 == 0 ); 5541 } 5542 return 5543 aSign ? le128( b.high, b.low, a.high, a.low ) 5544 : le128( a.high, a.low, b.high, b.low ); 5545 5546 } 5547 5548 /*---------------------------------------------------------------------------- 5549 | Returns 1 if the extended double-precision floating-point value `a' is 5550 | less than the corresponding value `b', and 0 otherwise. The invalid 5551 | exception is raised if either operand is a NaN. The comparison is performed 5552 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5553 *----------------------------------------------------------------------------*/ 5554 5555 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5556 { 5557 flag aSign, bSign; 5558 5559 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5560 || (extractFloatx80Exp(a) == 0x7FFF 5561 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5562 || (extractFloatx80Exp(b) == 0x7FFF 5563 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5564 ) { 5565 float_raise(float_flag_invalid, status); 5566 return 0; 5567 } 5568 aSign = extractFloatx80Sign( a ); 5569 bSign = extractFloatx80Sign( b ); 5570 if ( aSign != bSign ) { 5571 return 5572 aSign 5573 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5574 != 0 ); 5575 } 5576 return 5577 aSign ? lt128( b.high, b.low, a.high, a.low ) 5578 : lt128( a.high, a.low, b.high, b.low ); 5579 5580 } 5581 5582 /*---------------------------------------------------------------------------- 5583 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5584 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5585 | either operand is a NaN. The comparison is performed according to the 5586 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5587 *----------------------------------------------------------------------------*/ 5588 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5589 { 5590 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5591 || (extractFloatx80Exp(a) == 0x7FFF 5592 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5593 || (extractFloatx80Exp(b) == 0x7FFF 5594 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5595 ) { 5596 float_raise(float_flag_invalid, status); 5597 return 1; 5598 } 5599 return 0; 5600 } 5601 5602 /*---------------------------------------------------------------------------- 5603 | Returns 1 if the extended double-precision floating-point value `a' is 5604 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5605 | cause an exception. The comparison is performed according to the IEC/IEEE 5606 | Standard for Binary Floating-Point Arithmetic. 5607 *----------------------------------------------------------------------------*/ 5608 5609 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5610 { 5611 5612 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5613 float_raise(float_flag_invalid, status); 5614 return 0; 5615 } 5616 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5617 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5618 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5619 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5620 ) { 5621 if (floatx80_is_signaling_nan(a, status) 5622 || floatx80_is_signaling_nan(b, status)) { 5623 float_raise(float_flag_invalid, status); 5624 } 5625 return 0; 5626 } 5627 return 5628 ( a.low == b.low ) 5629 && ( ( a.high == b.high ) 5630 || ( ( a.low == 0 ) 5631 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5632 ); 5633 5634 } 5635 5636 /*---------------------------------------------------------------------------- 5637 | Returns 1 if the extended double-precision floating-point value `a' is less 5638 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5639 | do not cause an exception. Otherwise, the comparison is performed according 5640 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5641 *----------------------------------------------------------------------------*/ 5642 5643 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5644 { 5645 flag aSign, bSign; 5646 5647 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5648 float_raise(float_flag_invalid, status); 5649 return 0; 5650 } 5651 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5652 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5653 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5654 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5655 ) { 5656 if (floatx80_is_signaling_nan(a, status) 5657 || floatx80_is_signaling_nan(b, status)) { 5658 float_raise(float_flag_invalid, status); 5659 } 5660 return 0; 5661 } 5662 aSign = extractFloatx80Sign( a ); 5663 bSign = extractFloatx80Sign( b ); 5664 if ( aSign != bSign ) { 5665 return 5666 aSign 5667 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5668 == 0 ); 5669 } 5670 return 5671 aSign ? le128( b.high, b.low, a.high, a.low ) 5672 : le128( a.high, a.low, b.high, b.low ); 5673 5674 } 5675 5676 /*---------------------------------------------------------------------------- 5677 | Returns 1 if the extended double-precision floating-point value `a' is less 5678 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5679 | an exception. Otherwise, the comparison is performed according to the 5680 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5681 *----------------------------------------------------------------------------*/ 5682 5683 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5684 { 5685 flag aSign, bSign; 5686 5687 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5688 float_raise(float_flag_invalid, status); 5689 return 0; 5690 } 5691 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5692 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5693 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5694 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5695 ) { 5696 if (floatx80_is_signaling_nan(a, status) 5697 || floatx80_is_signaling_nan(b, status)) { 5698 float_raise(float_flag_invalid, status); 5699 } 5700 return 0; 5701 } 5702 aSign = extractFloatx80Sign( a ); 5703 bSign = extractFloatx80Sign( b ); 5704 if ( aSign != bSign ) { 5705 return 5706 aSign 5707 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5708 != 0 ); 5709 } 5710 return 5711 aSign ? lt128( b.high, b.low, a.high, a.low ) 5712 : lt128( a.high, a.low, b.high, b.low ); 5713 5714 } 5715 5716 /*---------------------------------------------------------------------------- 5717 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5718 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5719 | The comparison is performed according to the IEC/IEEE Standard for Binary 5720 | Floating-Point Arithmetic. 5721 *----------------------------------------------------------------------------*/ 5722 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5723 { 5724 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5725 float_raise(float_flag_invalid, status); 5726 return 1; 5727 } 5728 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5729 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5730 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5731 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5732 ) { 5733 if (floatx80_is_signaling_nan(a, status) 5734 || floatx80_is_signaling_nan(b, status)) { 5735 float_raise(float_flag_invalid, status); 5736 } 5737 return 1; 5738 } 5739 return 0; 5740 } 5741 5742 /*---------------------------------------------------------------------------- 5743 | Returns the result of converting the quadruple-precision floating-point 5744 | value `a' to the 32-bit two's complement integer format. The conversion 5745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5746 | Arithmetic---which means in particular that the conversion is rounded 5747 | according to the current rounding mode. If `a' is a NaN, the largest 5748 | positive integer is returned. Otherwise, if the conversion overflows, the 5749 | largest integer with the same sign as `a' is returned. 5750 *----------------------------------------------------------------------------*/ 5751 5752 int32_t float128_to_int32(float128 a, float_status *status) 5753 { 5754 flag aSign; 5755 int32_t aExp, shiftCount; 5756 uint64_t aSig0, aSig1; 5757 5758 aSig1 = extractFloat128Frac1( a ); 5759 aSig0 = extractFloat128Frac0( a ); 5760 aExp = extractFloat128Exp( a ); 5761 aSign = extractFloat128Sign( a ); 5762 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5763 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5764 aSig0 |= ( aSig1 != 0 ); 5765 shiftCount = 0x4028 - aExp; 5766 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5767 return roundAndPackInt32(aSign, aSig0, status); 5768 5769 } 5770 5771 /*---------------------------------------------------------------------------- 5772 | Returns the result of converting the quadruple-precision floating-point 5773 | value `a' to the 32-bit two's complement integer format. The conversion 5774 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5775 | Arithmetic, except that the conversion is always rounded toward zero. If 5776 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5777 | conversion overflows, the largest integer with the same sign as `a' is 5778 | returned. 5779 *----------------------------------------------------------------------------*/ 5780 5781 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5782 { 5783 flag aSign; 5784 int32_t aExp, shiftCount; 5785 uint64_t aSig0, aSig1, savedASig; 5786 int32_t z; 5787 5788 aSig1 = extractFloat128Frac1( a ); 5789 aSig0 = extractFloat128Frac0( a ); 5790 aExp = extractFloat128Exp( a ); 5791 aSign = extractFloat128Sign( a ); 5792 aSig0 |= ( aSig1 != 0 ); 5793 if ( 0x401E < aExp ) { 5794 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5795 goto invalid; 5796 } 5797 else if ( aExp < 0x3FFF ) { 5798 if (aExp || aSig0) { 5799 status->float_exception_flags |= float_flag_inexact; 5800 } 5801 return 0; 5802 } 5803 aSig0 |= LIT64( 0x0001000000000000 ); 5804 shiftCount = 0x402F - aExp; 5805 savedASig = aSig0; 5806 aSig0 >>= shiftCount; 5807 z = aSig0; 5808 if ( aSign ) z = - z; 5809 if ( ( z < 0 ) ^ aSign ) { 5810 invalid: 5811 float_raise(float_flag_invalid, status); 5812 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5813 } 5814 if ( ( aSig0<<shiftCount ) != savedASig ) { 5815 status->float_exception_flags |= float_flag_inexact; 5816 } 5817 return z; 5818 5819 } 5820 5821 /*---------------------------------------------------------------------------- 5822 | Returns the result of converting the quadruple-precision floating-point 5823 | value `a' to the 64-bit two's complement integer format. The conversion 5824 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5825 | Arithmetic---which means in particular that the conversion is rounded 5826 | according to the current rounding mode. If `a' is a NaN, the largest 5827 | positive integer is returned. Otherwise, if the conversion overflows, the 5828 | largest integer with the same sign as `a' is returned. 5829 *----------------------------------------------------------------------------*/ 5830 5831 int64_t float128_to_int64(float128 a, float_status *status) 5832 { 5833 flag aSign; 5834 int32_t aExp, shiftCount; 5835 uint64_t aSig0, aSig1; 5836 5837 aSig1 = extractFloat128Frac1( a ); 5838 aSig0 = extractFloat128Frac0( a ); 5839 aExp = extractFloat128Exp( a ); 5840 aSign = extractFloat128Sign( a ); 5841 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5842 shiftCount = 0x402F - aExp; 5843 if ( shiftCount <= 0 ) { 5844 if ( 0x403E < aExp ) { 5845 float_raise(float_flag_invalid, status); 5846 if ( ! aSign 5847 || ( ( aExp == 0x7FFF ) 5848 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5849 ) 5850 ) { 5851 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5852 } 5853 return (int64_t) LIT64( 0x8000000000000000 ); 5854 } 5855 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5856 } 5857 else { 5858 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5859 } 5860 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5861 5862 } 5863 5864 /*---------------------------------------------------------------------------- 5865 | Returns the result of converting the quadruple-precision floating-point 5866 | value `a' to the 64-bit two's complement integer format. The conversion 5867 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5868 | Arithmetic, except that the conversion is always rounded toward zero. 5869 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5870 | the conversion overflows, the largest integer with the same sign as `a' is 5871 | returned. 5872 *----------------------------------------------------------------------------*/ 5873 5874 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5875 { 5876 flag aSign; 5877 int32_t aExp, shiftCount; 5878 uint64_t aSig0, aSig1; 5879 int64_t z; 5880 5881 aSig1 = extractFloat128Frac1( a ); 5882 aSig0 = extractFloat128Frac0( a ); 5883 aExp = extractFloat128Exp( a ); 5884 aSign = extractFloat128Sign( a ); 5885 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5886 shiftCount = aExp - 0x402F; 5887 if ( 0 < shiftCount ) { 5888 if ( 0x403E <= aExp ) { 5889 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5890 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5891 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5892 if (aSig1) { 5893 status->float_exception_flags |= float_flag_inexact; 5894 } 5895 } 5896 else { 5897 float_raise(float_flag_invalid, status); 5898 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5899 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5900 } 5901 } 5902 return (int64_t) LIT64( 0x8000000000000000 ); 5903 } 5904 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5905 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5906 status->float_exception_flags |= float_flag_inexact; 5907 } 5908 } 5909 else { 5910 if ( aExp < 0x3FFF ) { 5911 if ( aExp | aSig0 | aSig1 ) { 5912 status->float_exception_flags |= float_flag_inexact; 5913 } 5914 return 0; 5915 } 5916 z = aSig0>>( - shiftCount ); 5917 if ( aSig1 5918 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5919 status->float_exception_flags |= float_flag_inexact; 5920 } 5921 } 5922 if ( aSign ) z = - z; 5923 return z; 5924 5925 } 5926 5927 /*---------------------------------------------------------------------------- 5928 | Returns the result of converting the quadruple-precision floating-point value 5929 | `a' to the 64-bit unsigned integer format. The conversion is 5930 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5931 | Arithmetic---which means in particular that the conversion is rounded 5932 | according to the current rounding mode. If `a' is a NaN, the largest 5933 | positive integer is returned. If the conversion overflows, the 5934 | largest unsigned integer is returned. If 'a' is negative, the value is 5935 | rounded and zero is returned; negative values that do not round to zero 5936 | will raise the inexact exception. 5937 *----------------------------------------------------------------------------*/ 5938 5939 uint64_t float128_to_uint64(float128 a, float_status *status) 5940 { 5941 flag aSign; 5942 int aExp; 5943 int shiftCount; 5944 uint64_t aSig0, aSig1; 5945 5946 aSig0 = extractFloat128Frac0(a); 5947 aSig1 = extractFloat128Frac1(a); 5948 aExp = extractFloat128Exp(a); 5949 aSign = extractFloat128Sign(a); 5950 if (aSign && (aExp > 0x3FFE)) { 5951 float_raise(float_flag_invalid, status); 5952 if (float128_is_any_nan(a)) { 5953 return LIT64(0xFFFFFFFFFFFFFFFF); 5954 } else { 5955 return 0; 5956 } 5957 } 5958 if (aExp) { 5959 aSig0 |= LIT64(0x0001000000000000); 5960 } 5961 shiftCount = 0x402F - aExp; 5962 if (shiftCount <= 0) { 5963 if (0x403E < aExp) { 5964 float_raise(float_flag_invalid, status); 5965 return LIT64(0xFFFFFFFFFFFFFFFF); 5966 } 5967 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5968 } else { 5969 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5970 } 5971 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5972 } 5973 5974 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5975 { 5976 uint64_t v; 5977 signed char current_rounding_mode = status->float_rounding_mode; 5978 5979 set_float_rounding_mode(float_round_to_zero, status); 5980 v = float128_to_uint64(a, status); 5981 set_float_rounding_mode(current_rounding_mode, status); 5982 5983 return v; 5984 } 5985 5986 /*---------------------------------------------------------------------------- 5987 | Returns the result of converting the quadruple-precision floating-point 5988 | value `a' to the 32-bit unsigned integer format. The conversion 5989 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5990 | Arithmetic except that the conversion is always rounded toward zero. 5991 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5992 | if the conversion overflows, the largest unsigned integer is returned. 5993 | If 'a' is negative, the value is rounded and zero is returned; negative 5994 | values that do not round to zero will raise the inexact exception. 5995 *----------------------------------------------------------------------------*/ 5996 5997 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5998 { 5999 uint64_t v; 6000 uint32_t res; 6001 int old_exc_flags = get_float_exception_flags(status); 6002 6003 v = float128_to_uint64_round_to_zero(a, status); 6004 if (v > 0xffffffff) { 6005 res = 0xffffffff; 6006 } else { 6007 return v; 6008 } 6009 set_float_exception_flags(old_exc_flags, status); 6010 float_raise(float_flag_invalid, status); 6011 return res; 6012 } 6013 6014 /*---------------------------------------------------------------------------- 6015 | Returns the result of converting the quadruple-precision floating-point 6016 | value `a' to the single-precision floating-point format. The conversion 6017 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6018 | Arithmetic. 6019 *----------------------------------------------------------------------------*/ 6020 6021 float32 float128_to_float32(float128 a, float_status *status) 6022 { 6023 flag aSign; 6024 int32_t aExp; 6025 uint64_t aSig0, aSig1; 6026 uint32_t zSig; 6027 6028 aSig1 = extractFloat128Frac1( a ); 6029 aSig0 = extractFloat128Frac0( a ); 6030 aExp = extractFloat128Exp( a ); 6031 aSign = extractFloat128Sign( a ); 6032 if ( aExp == 0x7FFF ) { 6033 if ( aSig0 | aSig1 ) { 6034 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6035 } 6036 return packFloat32( aSign, 0xFF, 0 ); 6037 } 6038 aSig0 |= ( aSig1 != 0 ); 6039 shift64RightJamming( aSig0, 18, &aSig0 ); 6040 zSig = aSig0; 6041 if ( aExp || zSig ) { 6042 zSig |= 0x40000000; 6043 aExp -= 0x3F81; 6044 } 6045 return roundAndPackFloat32(aSign, aExp, zSig, status); 6046 6047 } 6048 6049 /*---------------------------------------------------------------------------- 6050 | Returns the result of converting the quadruple-precision floating-point 6051 | value `a' to the double-precision floating-point format. The conversion 6052 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6053 | Arithmetic. 6054 *----------------------------------------------------------------------------*/ 6055 6056 float64 float128_to_float64(float128 a, float_status *status) 6057 { 6058 flag aSign; 6059 int32_t aExp; 6060 uint64_t aSig0, aSig1; 6061 6062 aSig1 = extractFloat128Frac1( a ); 6063 aSig0 = extractFloat128Frac0( a ); 6064 aExp = extractFloat128Exp( a ); 6065 aSign = extractFloat128Sign( a ); 6066 if ( aExp == 0x7FFF ) { 6067 if ( aSig0 | aSig1 ) { 6068 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6069 } 6070 return packFloat64( aSign, 0x7FF, 0 ); 6071 } 6072 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6073 aSig0 |= ( aSig1 != 0 ); 6074 if ( aExp || aSig0 ) { 6075 aSig0 |= LIT64( 0x4000000000000000 ); 6076 aExp -= 0x3C01; 6077 } 6078 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6079 6080 } 6081 6082 /*---------------------------------------------------------------------------- 6083 | Returns the result of converting the quadruple-precision floating-point 6084 | value `a' to the extended double-precision floating-point format. The 6085 | conversion is performed according to the IEC/IEEE Standard for Binary 6086 | Floating-Point Arithmetic. 6087 *----------------------------------------------------------------------------*/ 6088 6089 floatx80 float128_to_floatx80(float128 a, float_status *status) 6090 { 6091 flag aSign; 6092 int32_t aExp; 6093 uint64_t aSig0, aSig1; 6094 6095 aSig1 = extractFloat128Frac1( a ); 6096 aSig0 = extractFloat128Frac0( a ); 6097 aExp = extractFloat128Exp( a ); 6098 aSign = extractFloat128Sign( a ); 6099 if ( aExp == 0x7FFF ) { 6100 if ( aSig0 | aSig1 ) { 6101 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6102 } 6103 return packFloatx80(aSign, floatx80_infinity_high, 6104 floatx80_infinity_low); 6105 } 6106 if ( aExp == 0 ) { 6107 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6108 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6109 } 6110 else { 6111 aSig0 |= LIT64( 0x0001000000000000 ); 6112 } 6113 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6114 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6115 6116 } 6117 6118 /*---------------------------------------------------------------------------- 6119 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6120 | returns the result as a quadruple-precision floating-point value. The 6121 | operation is performed according to the IEC/IEEE Standard for Binary 6122 | Floating-Point Arithmetic. 6123 *----------------------------------------------------------------------------*/ 6124 6125 float128 float128_round_to_int(float128 a, float_status *status) 6126 { 6127 flag aSign; 6128 int32_t aExp; 6129 uint64_t lastBitMask, roundBitsMask; 6130 float128 z; 6131 6132 aExp = extractFloat128Exp( a ); 6133 if ( 0x402F <= aExp ) { 6134 if ( 0x406F <= aExp ) { 6135 if ( ( aExp == 0x7FFF ) 6136 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6137 ) { 6138 return propagateFloat128NaN(a, a, status); 6139 } 6140 return a; 6141 } 6142 lastBitMask = 1; 6143 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6144 roundBitsMask = lastBitMask - 1; 6145 z = a; 6146 switch (status->float_rounding_mode) { 6147 case float_round_nearest_even: 6148 if ( lastBitMask ) { 6149 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6150 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6151 } 6152 else { 6153 if ( (int64_t) z.low < 0 ) { 6154 ++z.high; 6155 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6156 } 6157 } 6158 break; 6159 case float_round_ties_away: 6160 if (lastBitMask) { 6161 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6162 } else { 6163 if ((int64_t) z.low < 0) { 6164 ++z.high; 6165 } 6166 } 6167 break; 6168 case float_round_to_zero: 6169 break; 6170 case float_round_up: 6171 if (!extractFloat128Sign(z)) { 6172 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6173 } 6174 break; 6175 case float_round_down: 6176 if (extractFloat128Sign(z)) { 6177 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6178 } 6179 break; 6180 default: 6181 abort(); 6182 } 6183 z.low &= ~ roundBitsMask; 6184 } 6185 else { 6186 if ( aExp < 0x3FFF ) { 6187 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6188 status->float_exception_flags |= float_flag_inexact; 6189 aSign = extractFloat128Sign( a ); 6190 switch (status->float_rounding_mode) { 6191 case float_round_nearest_even: 6192 if ( ( aExp == 0x3FFE ) 6193 && ( extractFloat128Frac0( a ) 6194 | extractFloat128Frac1( a ) ) 6195 ) { 6196 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6197 } 6198 break; 6199 case float_round_ties_away: 6200 if (aExp == 0x3FFE) { 6201 return packFloat128(aSign, 0x3FFF, 0, 0); 6202 } 6203 break; 6204 case float_round_down: 6205 return 6206 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6207 : packFloat128( 0, 0, 0, 0 ); 6208 case float_round_up: 6209 return 6210 aSign ? packFloat128( 1, 0, 0, 0 ) 6211 : packFloat128( 0, 0x3FFF, 0, 0 ); 6212 } 6213 return packFloat128( aSign, 0, 0, 0 ); 6214 } 6215 lastBitMask = 1; 6216 lastBitMask <<= 0x402F - aExp; 6217 roundBitsMask = lastBitMask - 1; 6218 z.low = 0; 6219 z.high = a.high; 6220 switch (status->float_rounding_mode) { 6221 case float_round_nearest_even: 6222 z.high += lastBitMask>>1; 6223 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6224 z.high &= ~ lastBitMask; 6225 } 6226 break; 6227 case float_round_ties_away: 6228 z.high += lastBitMask>>1; 6229 break; 6230 case float_round_to_zero: 6231 break; 6232 case float_round_up: 6233 if (!extractFloat128Sign(z)) { 6234 z.high |= ( a.low != 0 ); 6235 z.high += roundBitsMask; 6236 } 6237 break; 6238 case float_round_down: 6239 if (extractFloat128Sign(z)) { 6240 z.high |= (a.low != 0); 6241 z.high += roundBitsMask; 6242 } 6243 break; 6244 default: 6245 abort(); 6246 } 6247 z.high &= ~ roundBitsMask; 6248 } 6249 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6250 status->float_exception_flags |= float_flag_inexact; 6251 } 6252 return z; 6253 6254 } 6255 6256 /*---------------------------------------------------------------------------- 6257 | Returns the result of adding the absolute values of the quadruple-precision 6258 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6259 | before being returned. `zSign' is ignored if the result is a NaN. 6260 | The addition is performed according to the IEC/IEEE Standard for Binary 6261 | Floating-Point Arithmetic. 6262 *----------------------------------------------------------------------------*/ 6263 6264 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6265 float_status *status) 6266 { 6267 int32_t aExp, bExp, zExp; 6268 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6269 int32_t expDiff; 6270 6271 aSig1 = extractFloat128Frac1( a ); 6272 aSig0 = extractFloat128Frac0( a ); 6273 aExp = extractFloat128Exp( a ); 6274 bSig1 = extractFloat128Frac1( b ); 6275 bSig0 = extractFloat128Frac0( b ); 6276 bExp = extractFloat128Exp( b ); 6277 expDiff = aExp - bExp; 6278 if ( 0 < expDiff ) { 6279 if ( aExp == 0x7FFF ) { 6280 if (aSig0 | aSig1) { 6281 return propagateFloat128NaN(a, b, status); 6282 } 6283 return a; 6284 } 6285 if ( bExp == 0 ) { 6286 --expDiff; 6287 } 6288 else { 6289 bSig0 |= LIT64( 0x0001000000000000 ); 6290 } 6291 shift128ExtraRightJamming( 6292 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6293 zExp = aExp; 6294 } 6295 else if ( expDiff < 0 ) { 6296 if ( bExp == 0x7FFF ) { 6297 if (bSig0 | bSig1) { 6298 return propagateFloat128NaN(a, b, status); 6299 } 6300 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6301 } 6302 if ( aExp == 0 ) { 6303 ++expDiff; 6304 } 6305 else { 6306 aSig0 |= LIT64( 0x0001000000000000 ); 6307 } 6308 shift128ExtraRightJamming( 6309 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6310 zExp = bExp; 6311 } 6312 else { 6313 if ( aExp == 0x7FFF ) { 6314 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6315 return propagateFloat128NaN(a, b, status); 6316 } 6317 return a; 6318 } 6319 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6320 if ( aExp == 0 ) { 6321 if (status->flush_to_zero) { 6322 if (zSig0 | zSig1) { 6323 float_raise(float_flag_output_denormal, status); 6324 } 6325 return packFloat128(zSign, 0, 0, 0); 6326 } 6327 return packFloat128( zSign, 0, zSig0, zSig1 ); 6328 } 6329 zSig2 = 0; 6330 zSig0 |= LIT64( 0x0002000000000000 ); 6331 zExp = aExp; 6332 goto shiftRight1; 6333 } 6334 aSig0 |= LIT64( 0x0001000000000000 ); 6335 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6336 --zExp; 6337 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6338 ++zExp; 6339 shiftRight1: 6340 shift128ExtraRightJamming( 6341 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6342 roundAndPack: 6343 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6344 6345 } 6346 6347 /*---------------------------------------------------------------------------- 6348 | Returns the result of subtracting the absolute values of the quadruple- 6349 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6350 | difference is negated before being returned. `zSign' is ignored if the 6351 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6352 | Standard for Binary Floating-Point Arithmetic. 6353 *----------------------------------------------------------------------------*/ 6354 6355 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6356 float_status *status) 6357 { 6358 int32_t aExp, bExp, zExp; 6359 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6360 int32_t expDiff; 6361 6362 aSig1 = extractFloat128Frac1( a ); 6363 aSig0 = extractFloat128Frac0( a ); 6364 aExp = extractFloat128Exp( a ); 6365 bSig1 = extractFloat128Frac1( b ); 6366 bSig0 = extractFloat128Frac0( b ); 6367 bExp = extractFloat128Exp( b ); 6368 expDiff = aExp - bExp; 6369 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6370 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6371 if ( 0 < expDiff ) goto aExpBigger; 6372 if ( expDiff < 0 ) goto bExpBigger; 6373 if ( aExp == 0x7FFF ) { 6374 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6375 return propagateFloat128NaN(a, b, status); 6376 } 6377 float_raise(float_flag_invalid, status); 6378 return float128_default_nan(status); 6379 } 6380 if ( aExp == 0 ) { 6381 aExp = 1; 6382 bExp = 1; 6383 } 6384 if ( bSig0 < aSig0 ) goto aBigger; 6385 if ( aSig0 < bSig0 ) goto bBigger; 6386 if ( bSig1 < aSig1 ) goto aBigger; 6387 if ( aSig1 < bSig1 ) goto bBigger; 6388 return packFloat128(status->float_rounding_mode == float_round_down, 6389 0, 0, 0); 6390 bExpBigger: 6391 if ( bExp == 0x7FFF ) { 6392 if (bSig0 | bSig1) { 6393 return propagateFloat128NaN(a, b, status); 6394 } 6395 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6396 } 6397 if ( aExp == 0 ) { 6398 ++expDiff; 6399 } 6400 else { 6401 aSig0 |= LIT64( 0x4000000000000000 ); 6402 } 6403 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6404 bSig0 |= LIT64( 0x4000000000000000 ); 6405 bBigger: 6406 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6407 zExp = bExp; 6408 zSign ^= 1; 6409 goto normalizeRoundAndPack; 6410 aExpBigger: 6411 if ( aExp == 0x7FFF ) { 6412 if (aSig0 | aSig1) { 6413 return propagateFloat128NaN(a, b, status); 6414 } 6415 return a; 6416 } 6417 if ( bExp == 0 ) { 6418 --expDiff; 6419 } 6420 else { 6421 bSig0 |= LIT64( 0x4000000000000000 ); 6422 } 6423 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6424 aSig0 |= LIT64( 0x4000000000000000 ); 6425 aBigger: 6426 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6427 zExp = aExp; 6428 normalizeRoundAndPack: 6429 --zExp; 6430 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6431 status); 6432 6433 } 6434 6435 /*---------------------------------------------------------------------------- 6436 | Returns the result of adding the quadruple-precision floating-point values 6437 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6438 | for Binary Floating-Point Arithmetic. 6439 *----------------------------------------------------------------------------*/ 6440 6441 float128 float128_add(float128 a, float128 b, float_status *status) 6442 { 6443 flag aSign, bSign; 6444 6445 aSign = extractFloat128Sign( a ); 6446 bSign = extractFloat128Sign( b ); 6447 if ( aSign == bSign ) { 6448 return addFloat128Sigs(a, b, aSign, status); 6449 } 6450 else { 6451 return subFloat128Sigs(a, b, aSign, status); 6452 } 6453 6454 } 6455 6456 /*---------------------------------------------------------------------------- 6457 | Returns the result of subtracting the quadruple-precision floating-point 6458 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6459 | Standard for Binary Floating-Point Arithmetic. 6460 *----------------------------------------------------------------------------*/ 6461 6462 float128 float128_sub(float128 a, float128 b, float_status *status) 6463 { 6464 flag aSign, bSign; 6465 6466 aSign = extractFloat128Sign( a ); 6467 bSign = extractFloat128Sign( b ); 6468 if ( aSign == bSign ) { 6469 return subFloat128Sigs(a, b, aSign, status); 6470 } 6471 else { 6472 return addFloat128Sigs(a, b, aSign, status); 6473 } 6474 6475 } 6476 6477 /*---------------------------------------------------------------------------- 6478 | Returns the result of multiplying the quadruple-precision floating-point 6479 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6480 | Standard for Binary Floating-Point Arithmetic. 6481 *----------------------------------------------------------------------------*/ 6482 6483 float128 float128_mul(float128 a, float128 b, float_status *status) 6484 { 6485 flag aSign, bSign, zSign; 6486 int32_t aExp, bExp, zExp; 6487 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6488 6489 aSig1 = extractFloat128Frac1( a ); 6490 aSig0 = extractFloat128Frac0( a ); 6491 aExp = extractFloat128Exp( a ); 6492 aSign = extractFloat128Sign( a ); 6493 bSig1 = extractFloat128Frac1( b ); 6494 bSig0 = extractFloat128Frac0( b ); 6495 bExp = extractFloat128Exp( b ); 6496 bSign = extractFloat128Sign( b ); 6497 zSign = aSign ^ bSign; 6498 if ( aExp == 0x7FFF ) { 6499 if ( ( aSig0 | aSig1 ) 6500 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6501 return propagateFloat128NaN(a, b, status); 6502 } 6503 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6504 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6505 } 6506 if ( bExp == 0x7FFF ) { 6507 if (bSig0 | bSig1) { 6508 return propagateFloat128NaN(a, b, status); 6509 } 6510 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6511 invalid: 6512 float_raise(float_flag_invalid, status); 6513 return float128_default_nan(status); 6514 } 6515 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6516 } 6517 if ( aExp == 0 ) { 6518 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6519 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6520 } 6521 if ( bExp == 0 ) { 6522 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6523 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6524 } 6525 zExp = aExp + bExp - 0x4000; 6526 aSig0 |= LIT64( 0x0001000000000000 ); 6527 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6528 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6529 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6530 zSig2 |= ( zSig3 != 0 ); 6531 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6532 shift128ExtraRightJamming( 6533 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6534 ++zExp; 6535 } 6536 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6537 6538 } 6539 6540 /*---------------------------------------------------------------------------- 6541 | Returns the result of dividing the quadruple-precision floating-point value 6542 | `a' by the corresponding value `b'. The operation is performed according to 6543 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6544 *----------------------------------------------------------------------------*/ 6545 6546 float128 float128_div(float128 a, float128 b, float_status *status) 6547 { 6548 flag aSign, bSign, zSign; 6549 int32_t aExp, bExp, zExp; 6550 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6551 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6552 6553 aSig1 = extractFloat128Frac1( a ); 6554 aSig0 = extractFloat128Frac0( a ); 6555 aExp = extractFloat128Exp( a ); 6556 aSign = extractFloat128Sign( a ); 6557 bSig1 = extractFloat128Frac1( b ); 6558 bSig0 = extractFloat128Frac0( b ); 6559 bExp = extractFloat128Exp( b ); 6560 bSign = extractFloat128Sign( b ); 6561 zSign = aSign ^ bSign; 6562 if ( aExp == 0x7FFF ) { 6563 if (aSig0 | aSig1) { 6564 return propagateFloat128NaN(a, b, status); 6565 } 6566 if ( bExp == 0x7FFF ) { 6567 if (bSig0 | bSig1) { 6568 return propagateFloat128NaN(a, b, status); 6569 } 6570 goto invalid; 6571 } 6572 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6573 } 6574 if ( bExp == 0x7FFF ) { 6575 if (bSig0 | bSig1) { 6576 return propagateFloat128NaN(a, b, status); 6577 } 6578 return packFloat128( zSign, 0, 0, 0 ); 6579 } 6580 if ( bExp == 0 ) { 6581 if ( ( bSig0 | bSig1 ) == 0 ) { 6582 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6583 invalid: 6584 float_raise(float_flag_invalid, status); 6585 return float128_default_nan(status); 6586 } 6587 float_raise(float_flag_divbyzero, status); 6588 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6589 } 6590 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6591 } 6592 if ( aExp == 0 ) { 6593 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6594 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6595 } 6596 zExp = aExp - bExp + 0x3FFD; 6597 shortShift128Left( 6598 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6599 shortShift128Left( 6600 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6601 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6602 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6603 ++zExp; 6604 } 6605 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6606 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6607 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6608 while ( (int64_t) rem0 < 0 ) { 6609 --zSig0; 6610 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6611 } 6612 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6613 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6614 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6615 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6616 while ( (int64_t) rem1 < 0 ) { 6617 --zSig1; 6618 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6619 } 6620 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6621 } 6622 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6623 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6624 6625 } 6626 6627 /*---------------------------------------------------------------------------- 6628 | Returns the remainder of the quadruple-precision floating-point value `a' 6629 | with respect to the corresponding value `b'. The operation is performed 6630 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6631 *----------------------------------------------------------------------------*/ 6632 6633 float128 float128_rem(float128 a, float128 b, float_status *status) 6634 { 6635 flag aSign, zSign; 6636 int32_t aExp, bExp, expDiff; 6637 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6638 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6639 int64_t sigMean0; 6640 6641 aSig1 = extractFloat128Frac1( a ); 6642 aSig0 = extractFloat128Frac0( a ); 6643 aExp = extractFloat128Exp( a ); 6644 aSign = extractFloat128Sign( a ); 6645 bSig1 = extractFloat128Frac1( b ); 6646 bSig0 = extractFloat128Frac0( b ); 6647 bExp = extractFloat128Exp( b ); 6648 if ( aExp == 0x7FFF ) { 6649 if ( ( aSig0 | aSig1 ) 6650 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6651 return propagateFloat128NaN(a, b, status); 6652 } 6653 goto invalid; 6654 } 6655 if ( bExp == 0x7FFF ) { 6656 if (bSig0 | bSig1) { 6657 return propagateFloat128NaN(a, b, status); 6658 } 6659 return a; 6660 } 6661 if ( bExp == 0 ) { 6662 if ( ( bSig0 | bSig1 ) == 0 ) { 6663 invalid: 6664 float_raise(float_flag_invalid, status); 6665 return float128_default_nan(status); 6666 } 6667 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6668 } 6669 if ( aExp == 0 ) { 6670 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6671 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6672 } 6673 expDiff = aExp - bExp; 6674 if ( expDiff < -1 ) return a; 6675 shortShift128Left( 6676 aSig0 | LIT64( 0x0001000000000000 ), 6677 aSig1, 6678 15 - ( expDiff < 0 ), 6679 &aSig0, 6680 &aSig1 6681 ); 6682 shortShift128Left( 6683 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6684 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6685 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6686 expDiff -= 64; 6687 while ( 0 < expDiff ) { 6688 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6689 q = ( 4 < q ) ? q - 4 : 0; 6690 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6691 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6692 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6693 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6694 expDiff -= 61; 6695 } 6696 if ( -64 < expDiff ) { 6697 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6698 q = ( 4 < q ) ? q - 4 : 0; 6699 q >>= - expDiff; 6700 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6701 expDiff += 52; 6702 if ( expDiff < 0 ) { 6703 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6704 } 6705 else { 6706 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6707 } 6708 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6709 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6710 } 6711 else { 6712 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6713 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6714 } 6715 do { 6716 alternateASig0 = aSig0; 6717 alternateASig1 = aSig1; 6718 ++q; 6719 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6720 } while ( 0 <= (int64_t) aSig0 ); 6721 add128( 6722 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6723 if ( ( sigMean0 < 0 ) 6724 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6725 aSig0 = alternateASig0; 6726 aSig1 = alternateASig1; 6727 } 6728 zSign = ( (int64_t) aSig0 < 0 ); 6729 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6730 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6731 status); 6732 } 6733 6734 /*---------------------------------------------------------------------------- 6735 | Returns the square root of the quadruple-precision floating-point value `a'. 6736 | The operation is performed according to the IEC/IEEE Standard for Binary 6737 | Floating-Point Arithmetic. 6738 *----------------------------------------------------------------------------*/ 6739 6740 float128 float128_sqrt(float128 a, float_status *status) 6741 { 6742 flag aSign; 6743 int32_t aExp, zExp; 6744 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6745 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6746 6747 aSig1 = extractFloat128Frac1( a ); 6748 aSig0 = extractFloat128Frac0( a ); 6749 aExp = extractFloat128Exp( a ); 6750 aSign = extractFloat128Sign( a ); 6751 if ( aExp == 0x7FFF ) { 6752 if (aSig0 | aSig1) { 6753 return propagateFloat128NaN(a, a, status); 6754 } 6755 if ( ! aSign ) return a; 6756 goto invalid; 6757 } 6758 if ( aSign ) { 6759 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6760 invalid: 6761 float_raise(float_flag_invalid, status); 6762 return float128_default_nan(status); 6763 } 6764 if ( aExp == 0 ) { 6765 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6766 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6767 } 6768 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6769 aSig0 |= LIT64( 0x0001000000000000 ); 6770 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6771 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6772 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6773 doubleZSig0 = zSig0<<1; 6774 mul64To128( zSig0, zSig0, &term0, &term1 ); 6775 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6776 while ( (int64_t) rem0 < 0 ) { 6777 --zSig0; 6778 doubleZSig0 -= 2; 6779 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6780 } 6781 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6782 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6783 if ( zSig1 == 0 ) zSig1 = 1; 6784 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6785 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6786 mul64To128( zSig1, zSig1, &term2, &term3 ); 6787 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6788 while ( (int64_t) rem1 < 0 ) { 6789 --zSig1; 6790 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6791 term3 |= 1; 6792 term2 |= doubleZSig0; 6793 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6794 } 6795 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6796 } 6797 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6798 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6799 6800 } 6801 6802 /*---------------------------------------------------------------------------- 6803 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6804 | the corresponding value `b', and 0 otherwise. The invalid exception is 6805 | raised if either operand is a NaN. Otherwise, the comparison is performed 6806 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6807 *----------------------------------------------------------------------------*/ 6808 6809 int float128_eq(float128 a, float128 b, float_status *status) 6810 { 6811 6812 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6813 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6814 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6815 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6816 ) { 6817 float_raise(float_flag_invalid, status); 6818 return 0; 6819 } 6820 return 6821 ( a.low == b.low ) 6822 && ( ( a.high == b.high ) 6823 || ( ( a.low == 0 ) 6824 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6825 ); 6826 6827 } 6828 6829 /*---------------------------------------------------------------------------- 6830 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6831 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6832 | exception is raised if either operand is a NaN. The comparison is performed 6833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6834 *----------------------------------------------------------------------------*/ 6835 6836 int float128_le(float128 a, float128 b, float_status *status) 6837 { 6838 flag aSign, bSign; 6839 6840 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6841 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6842 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6843 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6844 ) { 6845 float_raise(float_flag_invalid, status); 6846 return 0; 6847 } 6848 aSign = extractFloat128Sign( a ); 6849 bSign = extractFloat128Sign( b ); 6850 if ( aSign != bSign ) { 6851 return 6852 aSign 6853 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6854 == 0 ); 6855 } 6856 return 6857 aSign ? le128( b.high, b.low, a.high, a.low ) 6858 : le128( a.high, a.low, b.high, b.low ); 6859 6860 } 6861 6862 /*---------------------------------------------------------------------------- 6863 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6864 | the corresponding value `b', and 0 otherwise. The invalid exception is 6865 | raised if either operand is a NaN. The comparison is performed according 6866 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6867 *----------------------------------------------------------------------------*/ 6868 6869 int float128_lt(float128 a, float128 b, float_status *status) 6870 { 6871 flag aSign, bSign; 6872 6873 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6874 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6875 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6876 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6877 ) { 6878 float_raise(float_flag_invalid, status); 6879 return 0; 6880 } 6881 aSign = extractFloat128Sign( a ); 6882 bSign = extractFloat128Sign( b ); 6883 if ( aSign != bSign ) { 6884 return 6885 aSign 6886 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6887 != 0 ); 6888 } 6889 return 6890 aSign ? lt128( b.high, b.low, a.high, a.low ) 6891 : lt128( a.high, a.low, b.high, b.low ); 6892 6893 } 6894 6895 /*---------------------------------------------------------------------------- 6896 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6897 | be compared, and 0 otherwise. The invalid exception is raised if either 6898 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6899 | Standard for Binary Floating-Point Arithmetic. 6900 *----------------------------------------------------------------------------*/ 6901 6902 int float128_unordered(float128 a, float128 b, float_status *status) 6903 { 6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6906 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6908 ) { 6909 float_raise(float_flag_invalid, status); 6910 return 1; 6911 } 6912 return 0; 6913 } 6914 6915 /*---------------------------------------------------------------------------- 6916 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6917 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6918 | exception. The comparison is performed according to the IEC/IEEE Standard 6919 | for Binary Floating-Point Arithmetic. 6920 *----------------------------------------------------------------------------*/ 6921 6922 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6923 { 6924 6925 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6926 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6927 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6928 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6929 ) { 6930 if (float128_is_signaling_nan(a, status) 6931 || float128_is_signaling_nan(b, status)) { 6932 float_raise(float_flag_invalid, status); 6933 } 6934 return 0; 6935 } 6936 return 6937 ( a.low == b.low ) 6938 && ( ( a.high == b.high ) 6939 || ( ( a.low == 0 ) 6940 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6941 ); 6942 6943 } 6944 6945 /*---------------------------------------------------------------------------- 6946 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6947 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6948 | cause an exception. Otherwise, the comparison is performed according to the 6949 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6950 *----------------------------------------------------------------------------*/ 6951 6952 int float128_le_quiet(float128 a, float128 b, float_status *status) 6953 { 6954 flag aSign, bSign; 6955 6956 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6957 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6958 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6959 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6960 ) { 6961 if (float128_is_signaling_nan(a, status) 6962 || float128_is_signaling_nan(b, status)) { 6963 float_raise(float_flag_invalid, status); 6964 } 6965 return 0; 6966 } 6967 aSign = extractFloat128Sign( a ); 6968 bSign = extractFloat128Sign( b ); 6969 if ( aSign != bSign ) { 6970 return 6971 aSign 6972 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6973 == 0 ); 6974 } 6975 return 6976 aSign ? le128( b.high, b.low, a.high, a.low ) 6977 : le128( a.high, a.low, b.high, b.low ); 6978 6979 } 6980 6981 /*---------------------------------------------------------------------------- 6982 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6983 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6984 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6985 | Standard for Binary Floating-Point Arithmetic. 6986 *----------------------------------------------------------------------------*/ 6987 6988 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6989 { 6990 flag aSign, bSign; 6991 6992 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6993 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6994 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6995 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6996 ) { 6997 if (float128_is_signaling_nan(a, status) 6998 || float128_is_signaling_nan(b, status)) { 6999 float_raise(float_flag_invalid, status); 7000 } 7001 return 0; 7002 } 7003 aSign = extractFloat128Sign( a ); 7004 bSign = extractFloat128Sign( b ); 7005 if ( aSign != bSign ) { 7006 return 7007 aSign 7008 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7009 != 0 ); 7010 } 7011 return 7012 aSign ? lt128( b.high, b.low, a.high, a.low ) 7013 : lt128( a.high, a.low, b.high, b.low ); 7014 7015 } 7016 7017 /*---------------------------------------------------------------------------- 7018 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7019 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7020 | comparison is performed according to the IEC/IEEE Standard for Binary 7021 | Floating-Point Arithmetic. 7022 *----------------------------------------------------------------------------*/ 7023 7024 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7025 { 7026 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7027 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7028 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7029 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7030 ) { 7031 if (float128_is_signaling_nan(a, status) 7032 || float128_is_signaling_nan(b, status)) { 7033 float_raise(float_flag_invalid, status); 7034 } 7035 return 1; 7036 } 7037 return 0; 7038 } 7039 7040 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7041 int is_quiet, float_status *status) 7042 { 7043 flag aSign, bSign; 7044 7045 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7046 float_raise(float_flag_invalid, status); 7047 return float_relation_unordered; 7048 } 7049 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7050 ( extractFloatx80Frac( a )<<1 ) ) || 7051 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7052 ( extractFloatx80Frac( b )<<1 ) )) { 7053 if (!is_quiet || 7054 floatx80_is_signaling_nan(a, status) || 7055 floatx80_is_signaling_nan(b, status)) { 7056 float_raise(float_flag_invalid, status); 7057 } 7058 return float_relation_unordered; 7059 } 7060 aSign = extractFloatx80Sign( a ); 7061 bSign = extractFloatx80Sign( b ); 7062 if ( aSign != bSign ) { 7063 7064 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7065 ( ( a.low | b.low ) == 0 ) ) { 7066 /* zero case */ 7067 return float_relation_equal; 7068 } else { 7069 return 1 - (2 * aSign); 7070 } 7071 } else { 7072 if (a.low == b.low && a.high == b.high) { 7073 return float_relation_equal; 7074 } else { 7075 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7076 } 7077 } 7078 } 7079 7080 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7081 { 7082 return floatx80_compare_internal(a, b, 0, status); 7083 } 7084 7085 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7086 { 7087 return floatx80_compare_internal(a, b, 1, status); 7088 } 7089 7090 static inline int float128_compare_internal(float128 a, float128 b, 7091 int is_quiet, float_status *status) 7092 { 7093 flag aSign, bSign; 7094 7095 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7096 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7097 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7098 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7099 if (!is_quiet || 7100 float128_is_signaling_nan(a, status) || 7101 float128_is_signaling_nan(b, status)) { 7102 float_raise(float_flag_invalid, status); 7103 } 7104 return float_relation_unordered; 7105 } 7106 aSign = extractFloat128Sign( a ); 7107 bSign = extractFloat128Sign( b ); 7108 if ( aSign != bSign ) { 7109 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7110 /* zero case */ 7111 return float_relation_equal; 7112 } else { 7113 return 1 - (2 * aSign); 7114 } 7115 } else { 7116 if (a.low == b.low && a.high == b.high) { 7117 return float_relation_equal; 7118 } else { 7119 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7120 } 7121 } 7122 } 7123 7124 int float128_compare(float128 a, float128 b, float_status *status) 7125 { 7126 return float128_compare_internal(a, b, 0, status); 7127 } 7128 7129 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7130 { 7131 return float128_compare_internal(a, b, 1, status); 7132 } 7133 7134 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7135 { 7136 flag aSign; 7137 int32_t aExp; 7138 uint64_t aSig; 7139 7140 if (floatx80_invalid_encoding(a)) { 7141 float_raise(float_flag_invalid, status); 7142 return floatx80_default_nan(status); 7143 } 7144 aSig = extractFloatx80Frac( a ); 7145 aExp = extractFloatx80Exp( a ); 7146 aSign = extractFloatx80Sign( a ); 7147 7148 if ( aExp == 0x7FFF ) { 7149 if ( aSig<<1 ) { 7150 return propagateFloatx80NaN(a, a, status); 7151 } 7152 return a; 7153 } 7154 7155 if (aExp == 0) { 7156 if (aSig == 0) { 7157 return a; 7158 } 7159 aExp++; 7160 } 7161 7162 if (n > 0x10000) { 7163 n = 0x10000; 7164 } else if (n < -0x10000) { 7165 n = -0x10000; 7166 } 7167 7168 aExp += n; 7169 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7170 aSign, aExp, aSig, 0, status); 7171 } 7172 7173 float128 float128_scalbn(float128 a, int n, float_status *status) 7174 { 7175 flag aSign; 7176 int32_t aExp; 7177 uint64_t aSig0, aSig1; 7178 7179 aSig1 = extractFloat128Frac1( a ); 7180 aSig0 = extractFloat128Frac0( a ); 7181 aExp = extractFloat128Exp( a ); 7182 aSign = extractFloat128Sign( a ); 7183 if ( aExp == 0x7FFF ) { 7184 if ( aSig0 | aSig1 ) { 7185 return propagateFloat128NaN(a, a, status); 7186 } 7187 return a; 7188 } 7189 if (aExp != 0) { 7190 aSig0 |= LIT64( 0x0001000000000000 ); 7191 } else if (aSig0 == 0 && aSig1 == 0) { 7192 return a; 7193 } else { 7194 aExp++; 7195 } 7196 7197 if (n > 0x10000) { 7198 n = 0x10000; 7199 } else if (n < -0x10000) { 7200 n = -0x10000; 7201 } 7202 7203 aExp += n - 1; 7204 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7205 , status); 7206 7207 } 7208