1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Returns the fraction bits of the half-precision floating-point value `a'. 100 *----------------------------------------------------------------------------*/ 101 102 static inline uint32_t extractFloat16Frac(float16 a) 103 { 104 return float16_val(a) & 0x3ff; 105 } 106 107 /*---------------------------------------------------------------------------- 108 | Returns the exponent bits of the half-precision floating-point value `a'. 
109 *----------------------------------------------------------------------------*/ 110 111 static inline int extractFloat16Exp(float16 a) 112 { 113 return (float16_val(a) >> 10) & 0x1f; 114 } 115 116 /*---------------------------------------------------------------------------- 117 | Returns the fraction bits of the single-precision floating-point value `a'. 118 *----------------------------------------------------------------------------*/ 119 120 static inline uint32_t extractFloat32Frac(float32 a) 121 { 122 return float32_val(a) & 0x007FFFFF; 123 } 124 125 /*---------------------------------------------------------------------------- 126 | Returns the exponent bits of the single-precision floating-point value `a'. 127 *----------------------------------------------------------------------------*/ 128 129 static inline int extractFloat32Exp(float32 a) 130 { 131 return (float32_val(a) >> 23) & 0xFF; 132 } 133 134 /*---------------------------------------------------------------------------- 135 | Returns the sign bit of the single-precision floating-point value `a'. 136 *----------------------------------------------------------------------------*/ 137 138 static inline flag extractFloat32Sign(float32 a) 139 { 140 return float32_val(a) >> 31; 141 } 142 143 /*---------------------------------------------------------------------------- 144 | Returns the fraction bits of the double-precision floating-point value `a'. 145 *----------------------------------------------------------------------------*/ 146 147 static inline uint64_t extractFloat64Frac(float64 a) 148 { 149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 150 } 151 152 /*---------------------------------------------------------------------------- 153 | Returns the exponent bits of the double-precision floating-point value `a'. 
154 *----------------------------------------------------------------------------*/ 155 156 static inline int extractFloat64Exp(float64 a) 157 { 158 return (float64_val(a) >> 52) & 0x7FF; 159 } 160 161 /*---------------------------------------------------------------------------- 162 | Returns the sign bit of the double-precision floating-point value `a'. 163 *----------------------------------------------------------------------------*/ 164 165 static inline flag extractFloat64Sign(float64 a) 166 { 167 return float64_val(a) >> 63; 168 } 169 170 /* 171 * Classify a floating point number. Everything above float_class_qnan 172 * is a NaN so cls >= float_class_qnan is any NaN. 173 */ 174 175 typedef enum __attribute__ ((__packed__)) { 176 float_class_unclassified, 177 float_class_zero, 178 float_class_normal, 179 float_class_inf, 180 float_class_qnan, /* all NaNs from here */ 181 float_class_snan, 182 } FloatClass; 183 184 /* Simple helpers for checking if, or what kind of, NaN we have */ 185 static inline __attribute__((unused)) bool is_nan(FloatClass c) 186 { 187 return unlikely(c >= float_class_qnan); 188 } 189 190 static inline __attribute__((unused)) bool is_snan(FloatClass c) 191 { 192 return c == float_class_snan; 193 } 194 195 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 196 { 197 return c == float_class_qnan; 198 } 199 200 /* 201 * Structure holding all of the decomposed parts of a float. The 202 * exponent is unbiased and the fraction is normalized. All 203 * calculations are done with a 64 bit fraction and then rounded as 204 * appropriate for the final format. 205 * 206 * Thanks to the packed FloatClass a decent compiler should be able to 207 * fit the whole structure into registers and avoid using the stack 208 * for parameter passing. 
209 */ 210 211 typedef struct { 212 uint64_t frac; 213 int32_t exp; 214 FloatClass cls; 215 bool sign; 216 } FloatParts; 217 218 #define DECOMPOSED_BINARY_POINT (64 - 2) 219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 221 222 /* Structure holding all of the relevant parameters for a format. 223 * exp_size: the size of the exponent field 224 * exp_bias: the offset applied to the exponent field 225 * exp_max: the maximum normalised exponent 226 * frac_size: the size of the fraction field 227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 228 * The following are computed based the size of fraction 229 * frac_lsb: least significant bit of fraction 230 * frac_lsbm1: the bit below the least significant bit (for rounding) 231 * round_mask/roundeven_mask: masks used for rounding 232 * The following optional modifiers are available: 233 * arm_althp: handle ARM Alternative Half Precision 234 */ 235 typedef struct { 236 int exp_size; 237 int exp_bias; 238 int exp_max; 239 int frac_size; 240 int frac_shift; 241 uint64_t frac_lsb; 242 uint64_t frac_lsbm1; 243 uint64_t round_mask; 244 uint64_t roundeven_mask; 245 bool arm_althp; 246 } FloatFmt; 247 248 /* Expand fields based on the size of exponent and fraction */ 249 #define FLOAT_PARAMS(E, F) \ 250 .exp_size = E, \ 251 .exp_bias = ((1 << E) - 1) >> 1, \ 252 .exp_max = (1 << E) - 1, \ 253 .frac_size = F, \ 254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 259 260 static const FloatFmt float16_params = { 261 FLOAT_PARAMS(5, 10) 262 }; 263 264 static const FloatFmt float16_params_ahp = { 265 FLOAT_PARAMS(5, 10), 266 .arm_althp = true 267 }; 268 269 static const 
FloatFmt float32_params = { 270 FLOAT_PARAMS(8, 23) 271 }; 272 273 static const FloatFmt float64_params = { 274 FLOAT_PARAMS(11, 52) 275 }; 276 277 /* Unpack a float to parts, but do not canonicalize. */ 278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 279 { 280 const int sign_pos = fmt.frac_size + fmt.exp_size; 281 282 return (FloatParts) { 283 .cls = float_class_unclassified, 284 .sign = extract64(raw, sign_pos, 1), 285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 286 .frac = extract64(raw, 0, fmt.frac_size), 287 }; 288 } 289 290 static inline FloatParts float16_unpack_raw(float16 f) 291 { 292 return unpack_raw(float16_params, f); 293 } 294 295 static inline FloatParts float32_unpack_raw(float32 f) 296 { 297 return unpack_raw(float32_params, f); 298 } 299 300 static inline FloatParts float64_unpack_raw(float64 f) 301 { 302 return unpack_raw(float64_params, f); 303 } 304 305 /* Pack a float from parts, but do not canonicalize. */ 306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 307 { 308 const int sign_pos = fmt.frac_size + fmt.exp_size; 309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 310 return deposit64(ret, sign_pos, 1, p.sign); 311 } 312 313 static inline float16 float16_pack_raw(FloatParts p) 314 { 315 return make_float16(pack_raw(float16_params, p)); 316 } 317 318 static inline float32 float32_pack_raw(FloatParts p) 319 { 320 return make_float32(pack_raw(float32_params, p)); 321 } 322 323 static inline float64 float64_pack_raw(FloatParts p) 324 { 325 return make_float64(pack_raw(float64_params, p)); 326 } 327 328 /*---------------------------------------------------------------------------- 329 | Functions and definitions to determine: (1) whether tininess for underflow 330 | is detected before or after rounding by default, (2) what (if anything) 331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 332 | from quiet NaNs, (4) the default generated quiet NaNs, and 
(5) how NaNs 333 | are propagated from function inputs to output. These details are target- 334 | specific. 335 *----------------------------------------------------------------------------*/ 336 #include "softfloat-specialize.h" 337 338 /* Canonicalize EXP and FRAC, setting CLS. */ 339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 340 float_status *status) 341 { 342 if (part.exp == parm->exp_max && !parm->arm_althp) { 343 if (part.frac == 0) { 344 part.cls = float_class_inf; 345 } else { 346 part.frac <<= parm->frac_shift; 347 part.cls = (parts_is_snan_frac(part.frac, status) 348 ? float_class_snan : float_class_qnan); 349 } 350 } else if (part.exp == 0) { 351 if (likely(part.frac == 0)) { 352 part.cls = float_class_zero; 353 } else if (status->flush_inputs_to_zero) { 354 float_raise(float_flag_input_denormal, status); 355 part.cls = float_class_zero; 356 part.frac = 0; 357 } else { 358 int shift = clz64(part.frac) - 1; 359 part.cls = float_class_normal; 360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 361 part.frac <<= shift; 362 } 363 } else { 364 part.cls = float_class_normal; 365 part.exp -= parm->exp_bias; 366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 367 } 368 return part; 369 } 370 371 /* Round and uncanonicalize a floating-point number by parts. There 372 * are FRAC_SHIFT bits that may require rounding at the bottom of the 373 * fraction; these bits will be removed. The exponent will be biased 374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
 */

/*
 * round_canonical() notes:
 *   p:    canonical parts to encode; cls, sign, exp and frac are read.
 *   s:    supplies float_rounding_mode, flush_to_zero and
 *         float_detect_tininess. Exception flags are accumulated in a
 *         local and reported once through float_raise() before returning.
 *   parm: the destination format.
 * Returns p with exp replaced by the biased exponent field and frac
 * shifted down by frac_shift, ready for pack_raw(). p.cls may be changed
 * to float_class_zero or float_class_inf when rounding underflows or
 * overflows.
 */
static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, choose:
         *   inc:           value added to frac before the low bits are
         *                  discarded;
         *   overflow_norm: on overflow, produce the largest finite value
         *                  (true) rather than infinity (false).
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * No increment when the discarded bits are exactly one half
             * and the kept LSB is already 0 (a tie that is already even).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is a normal number. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the top: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    /* Note: this replaces (not ORs into) earlier flags. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Result is denormal. Decide tininess before frac is
             * modified: always tiny when detection is before rounding or
             * the biased exponent is negative; otherwise tiny unless
             * adding inc carries into DECOMPOSED_OVERFLOW_BIT.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * Shift right by (1 - exp) bits.
             * NOTE(review): shift64RightJamming is defined elsewhere;
             * presumably it ORs shifted-out bits into the LSB so that
             * inexactness stays visible -- confirm.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Exponent field is 1 if rounding reached the implicit bit. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        /* Also the target of the flush-to-zero path above. */
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* Also the target of the overflow-to-infinity path above. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* NaN: maximum exponent, fraction moved back to field position. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/*
 * Explicit FloatFmt version: unpack a float16 and canonicalize it using
 * the caller-supplied format parameters.
 */
static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
                                            const FloatFmt *params)
{
    return canonicalize(float16_unpack_raw(f), params, s);
}

/* float16 unpack + canonicalize using float16_params. */
static FloatParts float16_unpack_canonical(float16 f, float_status *s)
{
    return float16a_unpack_canonical(f, s, &float16_params);
}

/*
 * Explicit FloatFmt version: round with the caller-supplied format
 * parameters, then pack as a float16.
 */
static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
                                             const FloatFmt *params)
{
    return float16_pack_raw(round_canonical(p, s, params));
}

/* float16 round + pack using float16_params. */
static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* float32 unpack + canonicalize using float32_params. */
static FloatParts float32_unpack_canonical(float32 f, float_status *s)
{
    return canonicalize(float32_unpack_raw(f), &float32_params, s);
}

/* float32 round + pack using float32_params. */
static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
{
    return float32_pack_raw(round_canonical(p, s, &float32_params));
}

/* float64 unpack + canonicalize using float64_params. */
static FloatParts float64_unpack_canonical(float64 f, float_status *s)
{
    return canonicalize(float64_unpack_raw(f), &float64_params, s);
}

/* float64 round + pack using float64_params. */
static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
{
    return float64_pack_raw(round_canonical(p, s, &float64_params));
}

/*
 * Produce the NaN result for a single NaN operand `a'.
 *
 * A signaling NaN sets float_flag_invalid and is silenced. In either
 * case, if default_nan_mode is set the default NaN is returned instead
 * of `a'. Must only be called with a NaN class; anything else asserts.
 */
static FloatParts return_nan(FloatParts a, float_status *s)
{
    switch (a.cls) {
    case float_class_snan:
        s->float_exception_flags |= float_flag_invalid;
        a = parts_silence_nan(a, s);
        /* fall through */
    case float_class_qnan:
        if (s->default_nan_mode) {
            return parts_default_nan(s);
        }
        break;

    default:
        g_assert_not_reached();
    }
    return a;
}

/*
 * Choose the NaN result of a two-operand operation where at least one of
 * `a' and `b' is a NaN.
 *
 * float_flag_invalid is set if either operand is a signaling NaN. In
 * default_nan_mode the default NaN is returned; otherwise pickNaN()
 * decides between the operands (a true return selects `b') and the
 * chosen value is silenced if it is signaling.
 * NOTE(review): pickNaN is defined elsewhere; its third argument looks
 * like an "a is the larger operand" hint -- confirm against its
 * definition.
 */
static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    if (s->default_nan_mode) {
        return parts_default_nan(s);
    } else {
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign))) {
            a = b;
        }
        if (is_snan(a.cls)) {
            return parts_silence_nan(a, s);
        }
    }
    return a;
}

/*
 * Choose the NaN result of a fused multiply-add where at least one of
 * `a', `b' and `c' is a NaN. inf_zero tells the target routine whether
 * the multiplicands are an (inf, zero) pair.
 *
 * pickNaNMulAdd() returns a selector: 0 picks `a', 1 picks `b', 2 picks
 * `c' and 3 requests the default NaN. default_nan_mode forces selector 3.
 * A selected signaling NaN is silenced before being returned.
 */
static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
                                  bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        return parts_default_nan(s);
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        return parts_silence_nan(a, s);
    }
    return a;
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * `a' and `b' are canonical parts; `subtract' selects a - b instead of
 * a + b. The result is returned as unrounded canonical parts; rounding
 * is left to the caller. Whether magnitudes are added or subtracted is
 * decided by comparing a.sign with (b.sign ^ subtract).
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /*
             * Align the smaller magnitude to the larger exponent and
             * subtract smaller from larger; the sign flips when `b' is
             * the larger magnitude.
             */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact zero: negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize after cancellation of leading bits. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Infinities of opposite effective sign: invalid. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* In this branch a_sign != b_sign, so this is b_sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent, then add. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            /* Carry out of the implicit bit: shift down, bump exponent. */
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/*
 * Each entry point below follows the same pattern: unpack both operands
 * to canonical parts, run addsub_floats() (subtract == false for *_add,
 * true for *_sub), then round and pack the result for the format.
 */
float16 __attribute__((flatten)) float16_add(float16 a, float16 b,
                                             float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(pr, status);
}

float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
                                             float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float32_round_pack_canonical(pr, status);
}

float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
                                             float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, false, status);

    return float64_round_pack_canonical(pr, status);
}

float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
                                             float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(pr, status);
}

float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
                                             float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float32_round_pack_canonical(pr, status);
}

float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
                                             float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = addsub_floats(pa, pb, true, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 *
 * Operands and result are canonical parts; the result is not rounded
 * here. The result sign is always a.sign ^ b.sign, except for NaN
 * results, which come from pick_nan() or parts_default_nan().
 */

static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 128-bit product, then move the binary point back down. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
        /* Product reached the overflow bit: shift down, bump exponent. */
        if (lo & DECOMPOSED_OVERFLOW_BIT) {
            shift64RightJamming(lo, 1, &lo);
            exp += 1;
        }

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = lo;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

/*
 * Multiply entry points: unpack both operands, run mul_floats(), then
 * round and pack the result for the format.
 */
float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
                                             float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(pr, status);
}

float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
                                             float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(pr, status);
}

float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
                                             float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * Flag bits tested here:
 *   float_muladd_negate_c:       flip the sign of `c' before use
 *   float_muladd_negate_product: flip the sign of the product a * b
 *   float_muladd_negate_result:  flip the sign of the non-NaN result
 *   float_muladd_halve_result:   decrement the result exponent by one
 * Operands and result are canonical parts; the result is not rounded
 * here.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True when {a, b} is exactly one infinity and one zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Class of the product: inf wins over zero (inf * zero is gone). */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Infinite product plus opposite-signed infinity: invalid. */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        /* Zero product: the result is `c', with zero-sign fix-ups. */
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /*
                 * Product exponent is not larger than c's: bring the
                 * product down to c's scale and binary point 62 in one
                 * shift, then add in 64 bits.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /*
                 * Subtract c from the product only when the exponents
                 * are equal and the product is at least as large;
                 * otherwise subtract the product from c and take c's
                 * sign and exponent.
                 */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, signed per rounding mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Leading-zero count across the 128-bit hi:lo value. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent. However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it. Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits. */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold remaining low bits into a sticky LSB. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}

/*
 * Fused multiply-add entry points: unpack the three operands, run
 * muladd_floats() with the caller's flags, then round and pack.
 */
float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
                                                int flags, float_status *status)
{
    FloatParts pa = float16_unpack_canonical(a, status);
    FloatParts pb = float16_unpack_canonical(b, status);
    FloatParts pc = float16_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
                                                int flags, float_status *status)
{
    FloatParts pa = float32_unpack_canonical(a, status);
    FloatParts pb = float32_unpack_canonical(b, status);
    FloatParts pc = float32_unpack_canonical(c, status);
    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
                                                int flags, float_status *status)
{
    FloatParts pa = float64_unpack_canonical(a, status);
    FloatParts pb = float64_unpack_canonical(b, status);
    FloatParts pc
= float64_unpack_canonical(c, status); 1099 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1100 1101 return float64_round_pack_canonical(pr, status); 1102 } 1103 1104 /* 1105 * Returns the result of dividing the floating-point value `a' by the 1106 * corresponding value `b'. The operation is performed according to 1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1108 */ 1109 1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1111 { 1112 bool sign = a.sign ^ b.sign; 1113 1114 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1115 uint64_t temp_lo, temp_hi; 1116 int exp = a.exp - b.exp; 1117 if (a.frac < b.frac) { 1118 exp -= 1; 1119 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1120 &temp_hi, &temp_lo); 1121 } else { 1122 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1123 &temp_hi, &temp_lo); 1124 } 1125 /* LSB of quot is set if inexact which roundandpack will use 1126 * to set flags. Yet again we re-use a for the result */ 1127 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1128 a.sign = sign; 1129 a.exp = exp; 1130 return a; 1131 } 1132 /* handle all the NaN cases */ 1133 if (is_nan(a.cls) || is_nan(b.cls)) { 1134 return pick_nan(a, b, s); 1135 } 1136 /* 0/0 or Inf/Inf */ 1137 if (a.cls == b.cls 1138 && 1139 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1140 s->float_exception_flags |= float_flag_invalid; 1141 return parts_default_nan(s); 1142 } 1143 /* Inf / x or 0 / x */ 1144 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1145 a.sign = sign; 1146 return a; 1147 } 1148 /* Div 0 => Inf */ 1149 if (b.cls == float_class_zero) { 1150 s->float_exception_flags |= float_flag_divbyzero; 1151 a.cls = float_class_inf; 1152 a.sign = sign; 1153 return a; 1154 } 1155 /* Div by Inf */ 1156 if (b.cls == float_class_inf) { 1157 a.cls = float_class_zero; 1158 a.sign = sign; 1159 return a; 1160 } 1161 g_assert_not_reached(); 1162 } 1163 1164 float16 
float16_div(float16 a, float16 b, float_status *status) 1165 { 1166 FloatParts pa = float16_unpack_canonical(a, status); 1167 FloatParts pb = float16_unpack_canonical(b, status); 1168 FloatParts pr = div_floats(pa, pb, status); 1169 1170 return float16_round_pack_canonical(pr, status); 1171 } 1172 1173 float32 float32_div(float32 a, float32 b, float_status *status) 1174 { 1175 FloatParts pa = float32_unpack_canonical(a, status); 1176 FloatParts pb = float32_unpack_canonical(b, status); 1177 FloatParts pr = div_floats(pa, pb, status); 1178 1179 return float32_round_pack_canonical(pr, status); 1180 } 1181 1182 float64 float64_div(float64 a, float64 b, float_status *status) 1183 { 1184 FloatParts pa = float64_unpack_canonical(a, status); 1185 FloatParts pb = float64_unpack_canonical(b, status); 1186 FloatParts pr = div_floats(pa, pb, status); 1187 1188 return float64_round_pack_canonical(pr, status); 1189 } 1190 1191 /* 1192 * Float to Float conversions 1193 * 1194 * Returns the result of converting one float format to another. The 1195 * conversion is performed according to the IEC/IEEE Standard for 1196 * Binary Floating-Point Arithmetic. 1197 * 1198 * The float_to_float helper only needs to take care of raising 1199 * invalid exceptions and handling the conversion on NaNs. 1200 */ 1201 1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1203 float_status *s) 1204 { 1205 if (dstf->arm_althp) { 1206 switch (a.cls) { 1207 case float_class_qnan: 1208 case float_class_snan: 1209 /* There is no NaN in the destination format. Raise Invalid 1210 * and return a zero with the sign of the input NaN. 1211 */ 1212 s->float_exception_flags |= float_flag_invalid; 1213 a.cls = float_class_zero; 1214 a.frac = 0; 1215 a.exp = 0; 1216 break; 1217 1218 case float_class_inf: 1219 /* There is no Inf in the destination format. Raise Invalid 1220 * and return the maximum normal with the correct sign. 
1221 */ 1222 s->float_exception_flags |= float_flag_invalid; 1223 a.cls = float_class_normal; 1224 a.exp = dstf->exp_max; 1225 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1226 break; 1227 1228 default: 1229 break; 1230 } 1231 } else if (is_nan(a.cls)) { 1232 if (is_snan(a.cls)) { 1233 s->float_exception_flags |= float_flag_invalid; 1234 a = parts_silence_nan(a, s); 1235 } 1236 if (s->default_nan_mode) { 1237 return parts_default_nan(s); 1238 } 1239 } 1240 return a; 1241 } 1242 1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1244 { 1245 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1246 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1247 FloatParts pr = float_to_float(p, &float32_params, s); 1248 return float32_round_pack_canonical(pr, s); 1249 } 1250 1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1252 { 1253 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1254 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1255 FloatParts pr = float_to_float(p, &float64_params, s); 1256 return float64_round_pack_canonical(pr, s); 1257 } 1258 1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1260 { 1261 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1262 FloatParts p = float32_unpack_canonical(a, s); 1263 FloatParts pr = float_to_float(p, fmt16, s); 1264 return float16a_round_pack_canonical(pr, s, fmt16); 1265 } 1266 1267 float64 float32_to_float64(float32 a, float_status *s) 1268 { 1269 FloatParts p = float32_unpack_canonical(a, s); 1270 FloatParts pr = float_to_float(p, &float64_params, s); 1271 return float64_round_pack_canonical(pr, s); 1272 } 1273 1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1275 { 1276 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1277 FloatParts p = float64_unpack_canonical(a, s); 1278 FloatParts pr = float_to_float(p, fmt16, s); 1279 return float16a_round_pack_canonical(pr, s, fmt16); 1280 } 1281 1282 float32 float64_to_float32(float64 a, float_status *s) 1283 { 1284 FloatParts p = float64_unpack_canonical(a, s); 1285 FloatParts pr = float_to_float(p, &float32_params, s); 1286 return float32_round_pack_canonical(pr, s); 1287 } 1288 1289 /* 1290 * Rounds the floating-point value `a' to an integer, and returns the 1291 * result as a floating-point value. The operation is performed 1292 * according to the IEC/IEEE Standard for Binary Floating-Point 1293 * Arithmetic. 1294 */ 1295 1296 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1297 { 1298 if (is_nan(a.cls)) { 1299 return return_nan(a, s); 1300 } 1301 1302 switch (a.cls) { 1303 case float_class_zero: 1304 case float_class_inf: 1305 case float_class_qnan: 1306 /* already "integral" */ 1307 break; 1308 case float_class_normal: 1309 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1310 /* already integral */ 1311 break; 1312 } 1313 if (a.exp < 0) { 1314 bool one; 1315 /* all fractional */ 1316 s->float_exception_flags |= float_flag_inexact; 1317 switch (rounding_mode) { 1318 case float_round_nearest_even: 1319 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1320 break; 1321 case float_round_ties_away: 1322 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1323 break; 1324 case float_round_to_zero: 1325 one = false; 1326 break; 1327 case float_round_up: 1328 one = !a.sign; 1329 break; 1330 case float_round_down: 1331 one = a.sign; 1332 break; 1333 default: 1334 g_assert_not_reached(); 1335 } 1336 1337 if (one) { 1338 a.frac = DECOMPOSED_IMPLICIT_BIT; 1339 a.exp = 0; 1340 } else { 1341 a.cls = float_class_zero; 1342 } 1343 } else { 1344 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1345 uint64_t frac_lsbm1 = frac_lsb >> 1; 1346 uint64_t rnd_even_mask = 
(frac_lsb - 1) | frac_lsb; 1347 uint64_t rnd_mask = rnd_even_mask >> 1; 1348 uint64_t inc; 1349 1350 switch (rounding_mode) { 1351 case float_round_nearest_even: 1352 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1353 break; 1354 case float_round_ties_away: 1355 inc = frac_lsbm1; 1356 break; 1357 case float_round_to_zero: 1358 inc = 0; 1359 break; 1360 case float_round_up: 1361 inc = a.sign ? 0 : rnd_mask; 1362 break; 1363 case float_round_down: 1364 inc = a.sign ? rnd_mask : 0; 1365 break; 1366 default: 1367 g_assert_not_reached(); 1368 } 1369 1370 if (a.frac & rnd_mask) { 1371 s->float_exception_flags |= float_flag_inexact; 1372 a.frac += inc; 1373 a.frac &= ~rnd_mask; 1374 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1375 a.frac >>= 1; 1376 a.exp++; 1377 } 1378 } 1379 } 1380 break; 1381 default: 1382 g_assert_not_reached(); 1383 } 1384 return a; 1385 } 1386 1387 float16 float16_round_to_int(float16 a, float_status *s) 1388 { 1389 FloatParts pa = float16_unpack_canonical(a, s); 1390 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1391 return float16_round_pack_canonical(pr, s); 1392 } 1393 1394 float32 float32_round_to_int(float32 a, float_status *s) 1395 { 1396 FloatParts pa = float32_unpack_canonical(a, s); 1397 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1398 return float32_round_pack_canonical(pr, s); 1399 } 1400 1401 float64 float64_round_to_int(float64 a, float_status *s) 1402 { 1403 FloatParts pa = float64_unpack_canonical(a, s); 1404 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1405 return float64_round_pack_canonical(pr, s); 1406 } 1407 1408 float64 float64_trunc_to_int(float64 a, float_status *s) 1409 { 1410 FloatParts pa = float64_unpack_canonical(a, s); 1411 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1412 return float64_round_pack_canonical(pr, s); 1413 } 1414 1415 /* 1416 * Returns the result of converting the floating-point value `a' to 1417 * the two's complement integer 
format. The conversion is performed 1418 * according to the IEC/IEEE Standard for Binary Floating-Point 1419 * Arithmetic---which means in particular that the conversion is 1420 * rounded according to the current rounding mode. If `a' is a NaN, 1421 * the largest positive integer is returned. Otherwise, if the 1422 * conversion overflows, the largest integer with the same sign as `a' 1423 * is returned. 1424 */ 1425 1426 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1427 int64_t min, int64_t max, 1428 float_status *s) 1429 { 1430 uint64_t r; 1431 int orig_flags = get_float_exception_flags(s); 1432 FloatParts p = round_to_int(in, rmode, s); 1433 1434 switch (p.cls) { 1435 case float_class_snan: 1436 case float_class_qnan: 1437 s->float_exception_flags = orig_flags | float_flag_invalid; 1438 return max; 1439 case float_class_inf: 1440 s->float_exception_flags = orig_flags | float_flag_invalid; 1441 return p.sign ? min : max; 1442 case float_class_zero: 1443 return 0; 1444 case float_class_normal: 1445 if (p.exp < DECOMPOSED_BINARY_POINT) { 1446 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1447 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1448 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1449 } else { 1450 r = UINT64_MAX; 1451 } 1452 if (p.sign) { 1453 if (r <= -(uint64_t) min) { 1454 return -r; 1455 } else { 1456 s->float_exception_flags = orig_flags | float_flag_invalid; 1457 return min; 1458 } 1459 } else { 1460 if (r <= max) { 1461 return r; 1462 } else { 1463 s->float_exception_flags = orig_flags | float_flag_invalid; 1464 return max; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_INT(fsz, isz) \ 1473 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1478 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1479 s); \ 1480 
} \ 1481 \ 1482 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1483 (float ## fsz a, float_status *s) \ 1484 { \ 1485 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1486 return round_to_int_and_pack(p, float_round_to_zero, \ 1487 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1488 s); \ 1489 } 1490 1491 FLOAT_TO_INT(16, 16) 1492 FLOAT_TO_INT(16, 32) 1493 FLOAT_TO_INT(16, 64) 1494 1495 FLOAT_TO_INT(32, 16) 1496 FLOAT_TO_INT(32, 32) 1497 FLOAT_TO_INT(32, 64) 1498 1499 FLOAT_TO_INT(64, 16) 1500 FLOAT_TO_INT(64, 32) 1501 FLOAT_TO_INT(64, 64) 1502 1503 #undef FLOAT_TO_INT 1504 1505 /* 1506 * Returns the result of converting the floating-point value `a' to 1507 * the unsigned integer format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point 1509 * Arithmetic---which means in particular that the conversion is 1510 * rounded according to the current rounding mode. If `a' is a NaN, 1511 * the largest unsigned integer is returned. Otherwise, if the 1512 * conversion overflows, the largest unsigned integer is returned. If 1513 * the 'a' is negative, the result is rounded and zero is returned; 1514 * values that do not round to zero will raise the inexact exception 1515 * flag. 1516 */ 1517 1518 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1519 float_status *s) 1520 { 1521 int orig_flags = get_float_exception_flags(s); 1522 FloatParts p = round_to_int(in, rmode, s); 1523 1524 switch (p.cls) { 1525 case float_class_snan: 1526 case float_class_qnan: 1527 s->float_exception_flags = orig_flags | float_flag_invalid; 1528 return max; 1529 case float_class_inf: 1530 s->float_exception_flags = orig_flags | float_flag_invalid; 1531 return p.sign ? 
0 : max; 1532 case float_class_zero: 1533 return 0; 1534 case float_class_normal: 1535 { 1536 uint64_t r; 1537 if (p.sign) { 1538 s->float_exception_flags = orig_flags | float_flag_invalid; 1539 return 0; 1540 } 1541 1542 if (p.exp < DECOMPOSED_BINARY_POINT) { 1543 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1544 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1545 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1546 } else { 1547 s->float_exception_flags = orig_flags | float_flag_invalid; 1548 return max; 1549 } 1550 1551 /* For uint64 this will never trip, but if p.exp is too large 1552 * to shift a decomposed fraction we shall have exited via the 1553 * 3rd leg above. 1554 */ 1555 if (r > max) { 1556 s->float_exception_flags = orig_flags | float_flag_invalid; 1557 return max; 1558 } else { 1559 return r; 1560 } 1561 } 1562 default: 1563 g_assert_not_reached(); 1564 } 1565 } 1566 1567 #define FLOAT_TO_UINT(fsz, isz) \ 1568 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1569 float_status *s) \ 1570 { \ 1571 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1572 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1573 UINT ## isz ## _MAX, s); \ 1574 } \ 1575 \ 1576 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1577 (float ## fsz a, float_status *s) \ 1578 { \ 1579 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1580 return round_to_uint_and_pack(p, float_round_to_zero, \ 1581 UINT ## isz ## _MAX, s); \ 1582 } 1583 1584 FLOAT_TO_UINT(16, 16) 1585 FLOAT_TO_UINT(16, 32) 1586 FLOAT_TO_UINT(16, 64) 1587 1588 FLOAT_TO_UINT(32, 16) 1589 FLOAT_TO_UINT(32, 32) 1590 FLOAT_TO_UINT(32, 64) 1591 1592 FLOAT_TO_UINT(64, 16) 1593 FLOAT_TO_UINT(64, 32) 1594 FLOAT_TO_UINT(64, 64) 1595 1596 #undef FLOAT_TO_UINT 1597 1598 /* 1599 * Integer to float conversions 1600 * 1601 * Returns the result of converting the two's complement integer `a' 1602 * to the floating-point format. 
The conversion is performed according 1603 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1604 */ 1605 1606 static FloatParts int_to_float(int64_t a, float_status *status) 1607 { 1608 FloatParts r = {}; 1609 if (a == 0) { 1610 r.cls = float_class_zero; 1611 r.sign = false; 1612 } else if (a == (1ULL << 63)) { 1613 r.cls = float_class_normal; 1614 r.sign = true; 1615 r.frac = DECOMPOSED_IMPLICIT_BIT; 1616 r.exp = 63; 1617 } else { 1618 uint64_t f; 1619 if (a < 0) { 1620 f = -a; 1621 r.sign = true; 1622 } else { 1623 f = a; 1624 r.sign = false; 1625 } 1626 int shift = clz64(f) - 1; 1627 r.cls = float_class_normal; 1628 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1629 r.frac = f << shift; 1630 } 1631 1632 return r; 1633 } 1634 1635 float16 int64_to_float16(int64_t a, float_status *status) 1636 { 1637 FloatParts pa = int_to_float(a, status); 1638 return float16_round_pack_canonical(pa, status); 1639 } 1640 1641 float16 int32_to_float16(int32_t a, float_status *status) 1642 { 1643 return int64_to_float16(a, status); 1644 } 1645 1646 float16 int16_to_float16(int16_t a, float_status *status) 1647 { 1648 return int64_to_float16(a, status); 1649 } 1650 1651 float32 int64_to_float32(int64_t a, float_status *status) 1652 { 1653 FloatParts pa = int_to_float(a, status); 1654 return float32_round_pack_canonical(pa, status); 1655 } 1656 1657 float32 int32_to_float32(int32_t a, float_status *status) 1658 { 1659 return int64_to_float32(a, status); 1660 } 1661 1662 float32 int16_to_float32(int16_t a, float_status *status) 1663 { 1664 return int64_to_float32(a, status); 1665 } 1666 1667 float64 int64_to_float64(int64_t a, float_status *status) 1668 { 1669 FloatParts pa = int_to_float(a, status); 1670 return float64_round_pack_canonical(pa, status); 1671 } 1672 1673 float64 int32_to_float64(int32_t a, float_status *status) 1674 { 1675 return int64_to_float64(a, status); 1676 } 1677 1678 float64 int16_to_float64(int16_t a, float_status *status) 1679 { 1680 return 
int64_to_float64(a, status); 1681 } 1682 1683 1684 /* 1685 * Unsigned Integer to float conversions 1686 * 1687 * Returns the result of converting the unsigned integer `a' to the 1688 * floating-point format. The conversion is performed according to the 1689 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1690 */ 1691 1692 static FloatParts uint_to_float(uint64_t a, float_status *status) 1693 { 1694 FloatParts r = { .sign = false}; 1695 1696 if (a == 0) { 1697 r.cls = float_class_zero; 1698 } else { 1699 int spare_bits = clz64(a) - 1; 1700 r.cls = float_class_normal; 1701 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1702 if (spare_bits < 0) { 1703 shift64RightJamming(a, -spare_bits, &a); 1704 r.frac = a; 1705 } else { 1706 r.frac = a << spare_bits; 1707 } 1708 } 1709 1710 return r; 1711 } 1712 1713 float16 uint64_to_float16(uint64_t a, float_status *status) 1714 { 1715 FloatParts pa = uint_to_float(a, status); 1716 return float16_round_pack_canonical(pa, status); 1717 } 1718 1719 float16 uint32_to_float16(uint32_t a, float_status *status) 1720 { 1721 return uint64_to_float16(a, status); 1722 } 1723 1724 float16 uint16_to_float16(uint16_t a, float_status *status) 1725 { 1726 return uint64_to_float16(a, status); 1727 } 1728 1729 float32 uint64_to_float32(uint64_t a, float_status *status) 1730 { 1731 FloatParts pa = uint_to_float(a, status); 1732 return float32_round_pack_canonical(pa, status); 1733 } 1734 1735 float32 uint32_to_float32(uint32_t a, float_status *status) 1736 { 1737 return uint64_to_float32(a, status); 1738 } 1739 1740 float32 uint16_to_float32(uint16_t a, float_status *status) 1741 { 1742 return uint64_to_float32(a, status); 1743 } 1744 1745 float64 uint64_to_float64(uint64_t a, float_status *status) 1746 { 1747 FloatParts pa = uint_to_float(a, status); 1748 return float64_round_pack_canonical(pa, status); 1749 } 1750 1751 float64 uint32_to_float64(uint32_t a, float_status *status) 1752 { 1753 return uint64_to_float64(a, status); 1754 } 1755 
1756 float64 uint16_to_float64(uint16_t a, float_status *status) 1757 { 1758 return uint64_to_float64(a, status); 1759 } 1760 1761 /* Float Min/Max */ 1762 /* min() and max() functions. These can't be implemented as 1763 * 'compare and pick one input' because that would mishandle 1764 * NaNs and +0 vs -0. 1765 * 1766 * minnum() and maxnum() functions. These are similar to the min() 1767 * and max() functions but if one of the arguments is a QNaN and 1768 * the other is numerical then the numerical argument is returned. 1769 * SNaNs will get quietened before being returned. 1770 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1771 * and maxNum() operations. min() and max() are the typical min/max 1772 * semantics provided by many CPUs which predate that specification. 1773 * 1774 * minnummag() and maxnummag() functions correspond to minNumMag() 1775 * and minNumMag() from the IEEE-754 2008. 1776 */ 1777 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1778 bool ieee, bool ismag, float_status *s) 1779 { 1780 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1781 if (ieee) { 1782 /* Takes two floating-point values `a' and `b', one of 1783 * which is a NaN, and returns the appropriate NaN 1784 * result. If either `a' or `b' is a signaling NaN, 1785 * the invalid exception is raised. 
1786 */ 1787 if (is_snan(a.cls) || is_snan(b.cls)) { 1788 return pick_nan(a, b, s); 1789 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1790 return b; 1791 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1792 return a; 1793 } 1794 } 1795 return pick_nan(a, b, s); 1796 } else { 1797 int a_exp, b_exp; 1798 1799 switch (a.cls) { 1800 case float_class_normal: 1801 a_exp = a.exp; 1802 break; 1803 case float_class_inf: 1804 a_exp = INT_MAX; 1805 break; 1806 case float_class_zero: 1807 a_exp = INT_MIN; 1808 break; 1809 default: 1810 g_assert_not_reached(); 1811 break; 1812 } 1813 switch (b.cls) { 1814 case float_class_normal: 1815 b_exp = b.exp; 1816 break; 1817 case float_class_inf: 1818 b_exp = INT_MAX; 1819 break; 1820 case float_class_zero: 1821 b_exp = INT_MIN; 1822 break; 1823 default: 1824 g_assert_not_reached(); 1825 break; 1826 } 1827 1828 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 1829 bool a_less = a_exp < b_exp; 1830 if (a_exp == b_exp) { 1831 a_less = a.frac < b.frac; 1832 } 1833 return a_less ^ ismin ? b : a; 1834 } 1835 1836 if (a.sign == b.sign) { 1837 bool a_less = a_exp < b_exp; 1838 if (a_exp == b_exp) { 1839 a_less = a.frac < b.frac; 1840 } 1841 return a.sign ^ a_less ^ ismin ? b : a; 1842 } else { 1843 return a.sign ^ ismin ? 
b : a; 1844 } 1845 } 1846 } 1847 1848 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1849 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1850 float_status *s) \ 1851 { \ 1852 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1853 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1854 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1855 \ 1856 return float ## sz ## _round_pack_canonical(pr, s); \ 1857 } 1858 1859 MINMAX(16, min, true, false, false) 1860 MINMAX(16, minnum, true, true, false) 1861 MINMAX(16, minnummag, true, true, true) 1862 MINMAX(16, max, false, false, false) 1863 MINMAX(16, maxnum, false, true, false) 1864 MINMAX(16, maxnummag, false, true, true) 1865 1866 MINMAX(32, min, true, false, false) 1867 MINMAX(32, minnum, true, true, false) 1868 MINMAX(32, minnummag, true, true, true) 1869 MINMAX(32, max, false, false, false) 1870 MINMAX(32, maxnum, false, true, false) 1871 MINMAX(32, maxnummag, false, true, true) 1872 1873 MINMAX(64, min, true, false, false) 1874 MINMAX(64, minnum, true, true, false) 1875 MINMAX(64, minnummag, true, true, true) 1876 MINMAX(64, max, false, false, false) 1877 MINMAX(64, maxnum, false, true, false) 1878 MINMAX(64, maxnummag, false, true, true) 1879 1880 #undef MINMAX 1881 1882 /* Floating point compare */ 1883 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1884 float_status *s) 1885 { 1886 if (is_nan(a.cls) || is_nan(b.cls)) { 1887 if (!is_quiet || 1888 a.cls == float_class_snan || 1889 b.cls == float_class_snan) { 1890 s->float_exception_flags |= float_flag_invalid; 1891 } 1892 return float_relation_unordered; 1893 } 1894 1895 if (a.cls == float_class_zero) { 1896 if (b.cls == float_class_zero) { 1897 return float_relation_equal; 1898 } 1899 return b.sign ? float_relation_greater : float_relation_less; 1900 } else if (b.cls == float_class_zero) { 1901 return a.sign ? 
float_relation_less : float_relation_greater; 1902 } 1903 1904 /* The only really important thing about infinity is its sign. If 1905 * both are infinities the sign marks the smallest of the two. 1906 */ 1907 if (a.cls == float_class_inf) { 1908 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1909 return float_relation_equal; 1910 } 1911 return a.sign ? float_relation_less : float_relation_greater; 1912 } else if (b.cls == float_class_inf) { 1913 return b.sign ? float_relation_greater : float_relation_less; 1914 } 1915 1916 if (a.sign != b.sign) { 1917 return a.sign ? float_relation_less : float_relation_greater; 1918 } 1919 1920 if (a.exp == b.exp) { 1921 if (a.frac == b.frac) { 1922 return float_relation_equal; 1923 } 1924 if (a.sign) { 1925 return a.frac > b.frac ? 1926 float_relation_less : float_relation_greater; 1927 } else { 1928 return a.frac > b.frac ? 1929 float_relation_greater : float_relation_less; 1930 } 1931 } else { 1932 if (a.sign) { 1933 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1934 } else { 1935 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1936 } 1937 } 1938 } 1939 1940 #define COMPARE(sz) \ 1941 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1942 float_status *s) \ 1943 { \ 1944 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1945 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1946 return compare_floats(pa, pb, false, s); \ 1947 } \ 1948 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1949 float_status *s) \ 1950 { \ 1951 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1952 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1953 return compare_floats(pa, pb, true, s); \ 1954 } 1955 1956 COMPARE(16) 1957 COMPARE(32) 1958 COMPARE(64) 1959 1960 #undef COMPARE 1961 1962 /* Multiply A by 2 raised to the power N. 
*/ 1963 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1964 { 1965 if (unlikely(is_nan(a.cls))) { 1966 return return_nan(a, s); 1967 } 1968 if (a.cls == float_class_normal) { 1969 /* The largest float type (even though not supported by FloatParts) 1970 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 1971 * still allows rounding to infinity, without allowing overflow 1972 * within the int32_t that backs FloatParts.exp. 1973 */ 1974 n = MIN(MAX(n, -0x10000), 0x10000); 1975 a.exp += n; 1976 } 1977 return a; 1978 } 1979 1980 float16 float16_scalbn(float16 a, int n, float_status *status) 1981 { 1982 FloatParts pa = float16_unpack_canonical(a, status); 1983 FloatParts pr = scalbn_decomposed(pa, n, status); 1984 return float16_round_pack_canonical(pr, status); 1985 } 1986 1987 float32 float32_scalbn(float32 a, int n, float_status *status) 1988 { 1989 FloatParts pa = float32_unpack_canonical(a, status); 1990 FloatParts pr = scalbn_decomposed(pa, n, status); 1991 return float32_round_pack_canonical(pr, status); 1992 } 1993 1994 float64 float64_scalbn(float64 a, int n, float_status *status) 1995 { 1996 FloatParts pa = float64_unpack_canonical(a, status); 1997 FloatParts pr = scalbn_decomposed(pa, n, status); 1998 return float64_round_pack_canonical(pr, status); 1999 } 2000 2001 /* 2002 * Square Root 2003 * 2004 * The old softfloat code did an approximation step before zeroing in 2005 * on the final result. However for simpleness we just compute the 2006 * square root by iterating down from the implicit bit to enough extra 2007 * bits to ensure we get a correctly rounded result. 2008 * 2009 * This does mean however the calculation is slower than before, 2010 * especially for 64 bit floats. 
2011 */ 2012 2013 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2014 { 2015 uint64_t a_frac, r_frac, s_frac; 2016 int bit, last_bit; 2017 2018 if (is_nan(a.cls)) { 2019 return return_nan(a, s); 2020 } 2021 if (a.cls == float_class_zero) { 2022 return a; /* sqrt(+-0) = +-0 */ 2023 } 2024 if (a.sign) { 2025 s->float_exception_flags |= float_flag_invalid; 2026 return parts_default_nan(s); 2027 } 2028 if (a.cls == float_class_inf) { 2029 return a; /* sqrt(+inf) = +inf */ 2030 } 2031 2032 assert(a.cls == float_class_normal); 2033 2034 /* We need two overflow bits at the top. Adding room for that is a 2035 * right shift. If the exponent is odd, we can discard the low bit 2036 * by multiplying the fraction by 2; that's a left shift. Combine 2037 * those and we shift right if the exponent is even. 2038 */ 2039 a_frac = a.frac; 2040 if (!(a.exp & 1)) { 2041 a_frac >>= 1; 2042 } 2043 a.exp >>= 1; 2044 2045 /* Bit-by-bit computation of sqrt. */ 2046 r_frac = 0; 2047 s_frac = 0; 2048 2049 /* Iterate from implicit bit down to the 3 extra bits to compute a 2050 * properly rounded result. Remember we've inserted one more bit 2051 * at the top, so these positions are one less. 2052 */ 2053 bit = DECOMPOSED_BINARY_POINT - 1; 2054 last_bit = MAX(p->frac_shift - 4, 0); 2055 do { 2056 uint64_t q = 1ULL << bit; 2057 uint64_t t_frac = s_frac + q; 2058 if (t_frac <= a_frac) { 2059 s_frac = t_frac + q; 2060 a_frac -= t_frac; 2061 r_frac += q; 2062 } 2063 a_frac <<= 1; 2064 } while (--bit >= last_bit); 2065 2066 /* Undo the right shift done above. If there is any remaining 2067 * fraction, the result is inexact. Set the sticky bit. 
2068 */ 2069 a.frac = (r_frac << 1) + (a_frac != 0); 2070 2071 return a; 2072 } 2073 2074 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 2075 { 2076 FloatParts pa = float16_unpack_canonical(a, status); 2077 FloatParts pr = sqrt_float(pa, status, &float16_params); 2078 return float16_round_pack_canonical(pr, status); 2079 } 2080 2081 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 2082 { 2083 FloatParts pa = float32_unpack_canonical(a, status); 2084 FloatParts pr = sqrt_float(pa, status, &float32_params); 2085 return float32_round_pack_canonical(pr, status); 2086 } 2087 2088 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 2089 { 2090 FloatParts pa = float64_unpack_canonical(a, status); 2091 FloatParts pr = sqrt_float(pa, status, &float64_params); 2092 return float64_round_pack_canonical(pr, status); 2093 } 2094 2095 /*---------------------------------------------------------------------------- 2096 | The pattern for a default generated NaN. 2097 *----------------------------------------------------------------------------*/ 2098 2099 float16 float16_default_nan(float_status *status) 2100 { 2101 FloatParts p = parts_default_nan(status); 2102 p.frac >>= float16_params.frac_shift; 2103 return float16_pack_raw(p); 2104 } 2105 2106 float32 float32_default_nan(float_status *status) 2107 { 2108 FloatParts p = parts_default_nan(status); 2109 p.frac >>= float32_params.frac_shift; 2110 return float32_pack_raw(p); 2111 } 2112 2113 float64 float64_default_nan(float_status *status) 2114 { 2115 FloatParts p = parts_default_nan(status); 2116 p.frac >>= float64_params.frac_shift; 2117 return float64_pack_raw(p); 2118 } 2119 2120 float128 float128_default_nan(float_status *status) 2121 { 2122 FloatParts p = parts_default_nan(status); 2123 float128 r; 2124 2125 /* Extrapolate from the choices made by parts_default_nan to fill 2126 * in the quad-floating format. 
If the low bit is set, assume we 2127 * want to set all non-snan bits. 2128 */ 2129 r.low = -(p.frac & 1); 2130 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2131 r.high |= LIT64(0x7FFF000000000000); 2132 r.high |= (uint64_t)p.sign << 63; 2133 2134 return r; 2135 } 2136 2137 /*---------------------------------------------------------------------------- 2138 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2139 *----------------------------------------------------------------------------*/ 2140 2141 float16 float16_silence_nan(float16 a, float_status *status) 2142 { 2143 FloatParts p = float16_unpack_raw(a); 2144 p.frac <<= float16_params.frac_shift; 2145 p = parts_silence_nan(p, status); 2146 p.frac >>= float16_params.frac_shift; 2147 return float16_pack_raw(p); 2148 } 2149 2150 float32 float32_silence_nan(float32 a, float_status *status) 2151 { 2152 FloatParts p = float32_unpack_raw(a); 2153 p.frac <<= float32_params.frac_shift; 2154 p = parts_silence_nan(p, status); 2155 p.frac >>= float32_params.frac_shift; 2156 return float32_pack_raw(p); 2157 } 2158 2159 float64 float64_silence_nan(float64 a, float_status *status) 2160 { 2161 FloatParts p = float64_unpack_raw(a); 2162 p.frac <<= float64_params.frac_shift; 2163 p = parts_silence_nan(p, status); 2164 p.frac >>= float64_params.frac_shift; 2165 return float64_pack_raw(p); 2166 } 2167 2168 /*---------------------------------------------------------------------------- 2169 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2170 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2171 | input. If `zSign' is 1, the input is negated before being converted to an 2172 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2173 | is simply rounded to an integer, with the inexact exception raised if the 2174 | input cannot be represented exactly as an integer. 
However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    const int8_t mode = status->float_rounding_mode;
    const flag nearest_even = (mode == float_round_nearest_even);
    int8_t bump, discarded;
    int32_t result;

    /* Pick the value added below the binary point before truncating. */
    switch (mode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        bump = 0x40;
        break;
    case float_round_to_zero:
        bump = 0;
        break;
    case float_round_up:
        bump = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        bump = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }

    /* The low 7 bits are the fraction that is about to be dropped. */
    discarded = absZ & 0x7F;
    absZ = (absZ + bump) >> 7;
    if (nearest_even && discarded == 0x40) {
        /* Exact tie in nearest-even mode: force the result even. */
        absZ &= ~(uint64_t)1;
    }

    result = absZ;
    if (zSign) {
        result = -result;
    }

    /* Out of range if bits above 31 remain or the sign came out wrong. */
    if ((absZ >> 32) || (result && ((result < 0) ^ zSign))) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (discarded) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return result;
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
2226 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2227 | the inexact exception raised if the input cannot be represented exactly as 2228 | an integer. However, if the fixed-point input is too large, the invalid 2229 | exception is raised and the largest positive or negative integer is 2230 | returned. 2231 *----------------------------------------------------------------------------*/ 2232 2233 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2234 float_status *status) 2235 { 2236 int8_t roundingMode; 2237 flag roundNearestEven, increment; 2238 int64_t z; 2239 2240 roundingMode = status->float_rounding_mode; 2241 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2242 switch (roundingMode) { 2243 case float_round_nearest_even: 2244 case float_round_ties_away: 2245 increment = ((int64_t) absZ1 < 0); 2246 break; 2247 case float_round_to_zero: 2248 increment = 0; 2249 break; 2250 case float_round_up: 2251 increment = !zSign && absZ1; 2252 break; 2253 case float_round_down: 2254 increment = zSign && absZ1; 2255 break; 2256 default: 2257 abort(); 2258 } 2259 if ( increment ) { 2260 ++absZ0; 2261 if ( absZ0 == 0 ) goto overflow; 2262 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2263 } 2264 z = absZ0; 2265 if ( zSign ) z = - z; 2266 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2267 overflow: 2268 float_raise(float_flag_invalid, status); 2269 return 2270 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 2271 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2272 } 2273 if (absZ1) { 2274 status->float_exception_flags |= float_flag_inexact; 2275 } 2276 return z; 2277 2278 } 2279 2280 /*---------------------------------------------------------------------------- 2281 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2282 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2283 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2284 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2285 | with the inexact exception raised if the input cannot be represented exactly 2286 | as an integer. However, if the fixed-point input is too large, the invalid 2287 | exception is raised and the largest unsigned integer is returned. 2288 *----------------------------------------------------------------------------*/ 2289 2290 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2291 uint64_t absZ1, float_status *status) 2292 { 2293 int8_t roundingMode; 2294 flag roundNearestEven, increment; 2295 2296 roundingMode = status->float_rounding_mode; 2297 roundNearestEven = (roundingMode == float_round_nearest_even); 2298 switch (roundingMode) { 2299 case float_round_nearest_even: 2300 case float_round_ties_away: 2301 increment = ((int64_t)absZ1 < 0); 2302 break; 2303 case float_round_to_zero: 2304 increment = 0; 2305 break; 2306 case float_round_up: 2307 increment = !zSign && absZ1; 2308 break; 2309 case float_round_down: 2310 increment = zSign && absZ1; 2311 break; 2312 default: 2313 abort(); 2314 } 2315 if (increment) { 2316 ++absZ0; 2317 if (absZ0 == 0) { 2318 float_raise(float_flag_invalid, status); 2319 return LIT64(0xFFFFFFFFFFFFFFFF); 2320 } 2321 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2322 } 2323 2324 if (zSign && absZ0) { 2325 float_raise(float_flag_invalid, status); 2326 return 0; 2327 } 2328 2329 if (absZ1) { 2330 
status->float_exception_flags |= float_flag_inexact; 2331 } 2332 return absZ0; 2333 } 2334 2335 /*---------------------------------------------------------------------------- 2336 | If `a' is denormal and we are in flush-to-zero mode then set the 2337 | input-denormal exception and return zero. Otherwise just return the value. 2338 *----------------------------------------------------------------------------*/ 2339 float32 float32_squash_input_denormal(float32 a, float_status *status) 2340 { 2341 if (status->flush_inputs_to_zero) { 2342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2343 float_raise(float_flag_input_denormal, status); 2344 return make_float32(float32_val(a) & 0x80000000); 2345 } 2346 } 2347 return a; 2348 } 2349 2350 /*---------------------------------------------------------------------------- 2351 | Normalizes the subnormal single-precision floating-point value represented 2352 | by the denormalized significand `aSig'. The normalized exponent and 2353 | significand are stored at the locations pointed to by `zExpPtr' and 2354 | `zSigPtr', respectively. 2355 *----------------------------------------------------------------------------*/ 2356 2357 static void 2358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2359 { 2360 int8_t shiftCount; 2361 2362 shiftCount = countLeadingZeros32( aSig ) - 8; 2363 *zSigPtr = aSig<<shiftCount; 2364 *zExpPtr = 1 - shiftCount; 2365 2366 } 2367 2368 /*---------------------------------------------------------------------------- 2369 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2370 | and significand `zSig', and returns the proper single-precision floating- 2371 | point value corresponding to the abstract input. Ordinarily, the abstract 2372 | value is simply rounded and packed into the single-precision format, with 2373 | the inexact exception raised if the abstract input cannot be represented 2374 | exactly. 
However, if the abstract value is too large, the overflow and 2375 | inexact exceptions are raised and an infinity or maximal finite value is 2376 | returned. If the abstract value is too small, the input value is rounded to 2377 | a subnormal number, and the underflow and inexact exceptions are raised if 2378 | the abstract input cannot be represented exactly as a subnormal single- 2379 | precision floating-point number. 2380 | The input significand `zSig' has its binary point between bits 30 2381 | and 29, which is 7 bits to the left of the usual location. This shifted 2382 | significand must be normalized or smaller. If `zSig' is not normalized, 2383 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2384 | and it must not require rounding. In the usual case that `zSig' is 2385 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2386 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2387 | Binary Floating-Point Arithmetic. 2388 *----------------------------------------------------------------------------*/ 2389 2390 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2391 float_status *status) 2392 { 2393 int8_t roundingMode; 2394 flag roundNearestEven; 2395 int8_t roundIncrement, roundBits; 2396 flag isTiny; 2397 2398 roundingMode = status->float_rounding_mode; 2399 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2400 switch (roundingMode) { 2401 case float_round_nearest_even: 2402 case float_round_ties_away: 2403 roundIncrement = 0x40; 2404 break; 2405 case float_round_to_zero: 2406 roundIncrement = 0; 2407 break; 2408 case float_round_up: 2409 roundIncrement = zSign ? 0 : 0x7f; 2410 break; 2411 case float_round_down: 2412 roundIncrement = zSign ? 
0x7f : 0; 2413 break; 2414 default: 2415 abort(); 2416 break; 2417 } 2418 roundBits = zSig & 0x7F; 2419 if ( 0xFD <= (uint16_t) zExp ) { 2420 if ( ( 0xFD < zExp ) 2421 || ( ( zExp == 0xFD ) 2422 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2423 ) { 2424 float_raise(float_flag_overflow | float_flag_inexact, status); 2425 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2426 } 2427 if ( zExp < 0 ) { 2428 if (status->flush_to_zero) { 2429 float_raise(float_flag_output_denormal, status); 2430 return packFloat32(zSign, 0, 0); 2431 } 2432 isTiny = 2433 (status->float_detect_tininess 2434 == float_tininess_before_rounding) 2435 || ( zExp < -1 ) 2436 || ( zSig + roundIncrement < 0x80000000 ); 2437 shift32RightJamming( zSig, - zExp, &zSig ); 2438 zExp = 0; 2439 roundBits = zSig & 0x7F; 2440 if (isTiny && roundBits) { 2441 float_raise(float_flag_underflow, status); 2442 } 2443 } 2444 } 2445 if (roundBits) { 2446 status->float_exception_flags |= float_flag_inexact; 2447 } 2448 zSig = ( zSig + roundIncrement )>>7; 2449 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2450 if ( zSig == 0 ) zExp = 0; 2451 return packFloat32( zSign, zExp, zSig ); 2452 2453 } 2454 2455 /*---------------------------------------------------------------------------- 2456 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2457 | and significand `zSig', and returns the proper single-precision floating- 2458 | point value corresponding to the abstract input. This routine is just like 2459 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2460 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2461 | floating-point exponent. 
2462 *----------------------------------------------------------------------------*/ 2463 2464 static float32 2465 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2466 float_status *status) 2467 { 2468 int8_t shiftCount; 2469 2470 shiftCount = countLeadingZeros32( zSig ) - 1; 2471 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2472 status); 2473 2474 } 2475 2476 /*---------------------------------------------------------------------------- 2477 | If `a' is denormal and we are in flush-to-zero mode then set the 2478 | input-denormal exception and return zero. Otherwise just return the value. 2479 *----------------------------------------------------------------------------*/ 2480 float64 float64_squash_input_denormal(float64 a, float_status *status) 2481 { 2482 if (status->flush_inputs_to_zero) { 2483 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2484 float_raise(float_flag_input_denormal, status); 2485 return make_float64(float64_val(a) & (1ULL << 63)); 2486 } 2487 } 2488 return a; 2489 } 2490 2491 /*---------------------------------------------------------------------------- 2492 | Normalizes the subnormal double-precision floating-point value represented 2493 | by the denormalized significand `aSig'. The normalized exponent and 2494 | significand are stored at the locations pointed to by `zExpPtr' and 2495 | `zSigPtr', respectively. 
2496 *----------------------------------------------------------------------------*/ 2497 2498 static void 2499 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2500 { 2501 int8_t shiftCount; 2502 2503 shiftCount = countLeadingZeros64( aSig ) - 11; 2504 *zSigPtr = aSig<<shiftCount; 2505 *zExpPtr = 1 - shiftCount; 2506 2507 } 2508 2509 /*---------------------------------------------------------------------------- 2510 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2511 | double-precision floating-point value, returning the result. After being 2512 | shifted into the proper positions, the three fields are simply added 2513 | together to form the result. This means that any integer portion of `zSig' 2514 | will be added into the exponent. Since a properly normalized significand 2515 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2516 | than the desired result exponent whenever `zSig' is a complete, normalized 2517 | significand. 2518 *----------------------------------------------------------------------------*/ 2519 2520 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2521 { 2522 2523 return make_float64( 2524 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2525 2526 } 2527 2528 /*---------------------------------------------------------------------------- 2529 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2530 | and significand `zSig', and returns the proper double-precision floating- 2531 | point value corresponding to the abstract input. Ordinarily, the abstract 2532 | value is simply rounded and packed into the double-precision format, with 2533 | the inexact exception raised if the abstract input cannot be represented 2534 | exactly. However, if the abstract value is too large, the overflow and 2535 | inexact exceptions are raised and an infinity or maximal finite value is 2536 | returned. 
If the abstract value is too small, the input value is rounded to 2537 | a subnormal number, and the underflow and inexact exceptions are raised if 2538 | the abstract input cannot be represented exactly as a subnormal double- 2539 | precision floating-point number. 2540 | The input significand `zSig' has its binary point between bits 62 2541 | and 61, which is 10 bits to the left of the usual location. This shifted 2542 | significand must be normalized or smaller. If `zSig' is not normalized, 2543 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2544 | and it must not require rounding. In the usual case that `zSig' is 2545 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2546 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2547 | Binary Floating-Point Arithmetic. 2548 *----------------------------------------------------------------------------*/ 2549 2550 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2551 float_status *status) 2552 { 2553 int8_t roundingMode; 2554 flag roundNearestEven; 2555 int roundIncrement, roundBits; 2556 flag isTiny; 2557 2558 roundingMode = status->float_rounding_mode; 2559 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2560 switch (roundingMode) { 2561 case float_round_nearest_even: 2562 case float_round_ties_away: 2563 roundIncrement = 0x200; 2564 break; 2565 case float_round_to_zero: 2566 roundIncrement = 0; 2567 break; 2568 case float_round_up: 2569 roundIncrement = zSign ? 0 : 0x3ff; 2570 break; 2571 case float_round_down: 2572 roundIncrement = zSign ? 0x3ff : 0; 2573 break; 2574 case float_round_to_odd: 2575 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 2576 break; 2577 default: 2578 abort(); 2579 } 2580 roundBits = zSig & 0x3FF; 2581 if ( 0x7FD <= (uint16_t) zExp ) { 2582 if ( ( 0x7FD < zExp ) 2583 || ( ( zExp == 0x7FD ) 2584 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2585 ) { 2586 bool overflow_to_inf = roundingMode != float_round_to_odd && 2587 roundIncrement != 0; 2588 float_raise(float_flag_overflow | float_flag_inexact, status); 2589 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2590 } 2591 if ( zExp < 0 ) { 2592 if (status->flush_to_zero) { 2593 float_raise(float_flag_output_denormal, status); 2594 return packFloat64(zSign, 0, 0); 2595 } 2596 isTiny = 2597 (status->float_detect_tininess 2598 == float_tininess_before_rounding) 2599 || ( zExp < -1 ) 2600 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2601 shift64RightJamming( zSig, - zExp, &zSig ); 2602 zExp = 0; 2603 roundBits = zSig & 0x3FF; 2604 if (isTiny && roundBits) { 2605 float_raise(float_flag_underflow, status); 2606 } 2607 if (roundingMode == float_round_to_odd) { 2608 /* 2609 * For round-to-odd case, the roundIncrement depends on 2610 * zSig which just changed. 2611 */ 2612 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2613 } 2614 } 2615 } 2616 if (roundBits) { 2617 status->float_exception_flags |= float_flag_inexact; 2618 } 2619 zSig = ( zSig + roundIncrement )>>10; 2620 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2621 if ( zSig == 0 ) zExp = 0; 2622 return packFloat64( zSign, zExp, zSig ); 2623 2624 } 2625 2626 /*---------------------------------------------------------------------------- 2627 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2628 | and significand `zSig', and returns the proper double-precision floating- 2629 | point value corresponding to the abstract input. This routine is just like 2630 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
2631 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2632 | floating-point exponent. 2633 *----------------------------------------------------------------------------*/ 2634 2635 static float64 2636 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2637 float_status *status) 2638 { 2639 int8_t shiftCount; 2640 2641 shiftCount = countLeadingZeros64( zSig ) - 1; 2642 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2643 status); 2644 2645 } 2646 2647 /*---------------------------------------------------------------------------- 2648 | Normalizes the subnormal extended double-precision floating-point value 2649 | represented by the denormalized significand `aSig'. The normalized exponent 2650 | and significand are stored at the locations pointed to by `zExpPtr' and 2651 | `zSigPtr', respectively. 2652 *----------------------------------------------------------------------------*/ 2653 2654 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2655 uint64_t *zSigPtr) 2656 { 2657 int8_t shiftCount; 2658 2659 shiftCount = countLeadingZeros64( aSig ); 2660 *zSigPtr = aSig<<shiftCount; 2661 *zExpPtr = 1 - shiftCount; 2662 } 2663 2664 /*---------------------------------------------------------------------------- 2665 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2666 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2667 | and returns the proper extended double-precision floating-point value 2668 | corresponding to the abstract input. Ordinarily, the abstract value is 2669 | rounded and packed into the extended double-precision format, with the 2670 | inexact exception raised if the abstract input cannot be represented 2671 | exactly. However, if the abstract value is too large, the overflow and 2672 | inexact exceptions are raised and an infinity or maximal finite value is 2673 | returned. 
If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively. Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
| The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    /*
     * roundIncrement and roundBits are used only on the reduced-precision
     * path; roundMask is additionally read by the shared overflow exit.
     */
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Anything other than 32 or 64 is handled as full 80-bit precision. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold a non-zero low word into bit 0 of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-way increment selected above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * Unsigned compare: true when zExp >= 0x7FFE, and also when
     * zExp <= 0 because zExp - 1 then converts to a large unsigned value.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* At zExp == 0x7FFE it overflows only if rounding carries out. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /* Shift right into subnormal position, keeping a sticky bit. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63, so the exponent becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundMask + 1 is the lowest retained bit; on an exact tie
             * in nearest-even mode it joins the mask and is cleared too.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: bump the exponent, keep only the top bit. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie handling as in the subnormal branch above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the 64-bit significand. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same unsigned range test as on the reduced-precision path. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With a zero mask, ~roundMask below is an all-ones significand. */
            roundMask = 0;
 overflow:
            /*
             * Shared overflow exit, also entered by goto from the
             * reduced-precision path with its precision mask in roundMask.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so decide the increment again. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: clear the low bit. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63, so the exponent becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped: bump the exponent, keep the top bit. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear the low bit. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       flag zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* An empty high word: promote the low word, worth 64 exponent steps. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift left until the leading set bit of zSig0 is in bit 63. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
2909 *----------------------------------------------------------------------------*/ 2910 2911 static inline uint64_t extractFloat128Frac1( float128 a ) 2912 { 2913 2914 return a.low; 2915 2916 } 2917 2918 /*---------------------------------------------------------------------------- 2919 | Returns the most-significant 48 fraction bits of the quadruple-precision 2920 | floating-point value `a'. 2921 *----------------------------------------------------------------------------*/ 2922 2923 static inline uint64_t extractFloat128Frac0( float128 a ) 2924 { 2925 2926 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2927 2928 } 2929 2930 /*---------------------------------------------------------------------------- 2931 | Returns the exponent bits of the quadruple-precision floating-point value 2932 | `a'. 2933 *----------------------------------------------------------------------------*/ 2934 2935 static inline int32_t extractFloat128Exp( float128 a ) 2936 { 2937 2938 return ( a.high>>48 ) & 0x7FFF; 2939 2940 } 2941 2942 /*---------------------------------------------------------------------------- 2943 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2944 *----------------------------------------------------------------------------*/ 2945 2946 static inline flag extractFloat128Sign( float128 a ) 2947 { 2948 2949 return a.high>>63; 2950 2951 } 2952 2953 /*---------------------------------------------------------------------------- 2954 | Normalizes the subnormal quadruple-precision floating-point value 2955 | represented by the denormalized significand formed by the concatenation of 2956 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2957 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2958 | significand are stored at the location pointed to by `zSig0Ptr', and the 2959 | least significant 64 bits of the normalized significand are stored at the 2960 | location pointed to by `zSig1Ptr'. 
2961 *----------------------------------------------------------------------------*/ 2962 2963 static void 2964 normalizeFloat128Subnormal( 2965 uint64_t aSig0, 2966 uint64_t aSig1, 2967 int32_t *zExpPtr, 2968 uint64_t *zSig0Ptr, 2969 uint64_t *zSig1Ptr 2970 ) 2971 { 2972 int8_t shiftCount; 2973 2974 if ( aSig0 == 0 ) { 2975 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2976 if ( shiftCount < 0 ) { 2977 *zSig0Ptr = aSig1>>( - shiftCount ); 2978 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2979 } 2980 else { 2981 *zSig0Ptr = aSig1<<shiftCount; 2982 *zSig1Ptr = 0; 2983 } 2984 *zExpPtr = - shiftCount - 63; 2985 } 2986 else { 2987 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2988 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2989 *zExpPtr = 1 - shiftCount; 2990 } 2991 2992 } 2993 2994 /*---------------------------------------------------------------------------- 2995 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2996 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2997 | floating-point value, returning the result. After being shifted into the 2998 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2999 | added together to form the most significant 32 bits of the result. This 3000 | means that any integer portion of `zSig0' will be added into the exponent. 3001 | Since a properly normalized significand will have an integer portion equal 3002 | to 1, the `zExp' input should be 1 less than the desired result exponent 3003 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3004 | significand. 
3005 *----------------------------------------------------------------------------*/ 3006 3007 static inline float128 3008 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3009 { 3010 float128 z; 3011 3012 z.low = zSig1; 3013 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3014 return z; 3015 3016 } 3017 3018 /*---------------------------------------------------------------------------- 3019 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3020 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3021 | and `zSig2', and returns the proper quadruple-precision floating-point value 3022 | corresponding to the abstract input. Ordinarily, the abstract value is 3023 | simply rounded and packed into the quadruple-precision format, with the 3024 | inexact exception raised if the abstract input cannot be represented 3025 | exactly. However, if the abstract value is too large, the overflow and 3026 | inexact exceptions are raised and an infinity or maximal finite value is 3027 | returned. If the abstract value is too small, the input value is rounded to 3028 | a subnormal number, and the underflow and inexact exceptions are raised if 3029 | the abstract input cannot be represented exactly as a subnormal quadruple- 3030 | precision floating-point number. 3031 | The input significand must be normalized or smaller. If the input 3032 | significand is not normalized, `zExp' must be 0; in that case, the result 3033 | returned is a subnormal number, and it must not require rounding. In the 3034 | usual case that the input significand is normalized, `zExp' must be 1 less 3035 | than the ``true'' floating-point exponent. The handling of underflow and 3036 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3037 *----------------------------------------------------------------------------*/ 3038 3039 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 3040 uint64_t zSig0, uint64_t zSig1, 3041 uint64_t zSig2, float_status *status) 3042 { 3043 int8_t roundingMode; 3044 flag roundNearestEven, increment, isTiny; 3045 3046 roundingMode = status->float_rounding_mode; 3047 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3048 switch (roundingMode) { 3049 case float_round_nearest_even: 3050 case float_round_ties_away: 3051 increment = ((int64_t)zSig2 < 0); 3052 break; 3053 case float_round_to_zero: 3054 increment = 0; 3055 break; 3056 case float_round_up: 3057 increment = !zSign && zSig2; 3058 break; 3059 case float_round_down: 3060 increment = zSign && zSig2; 3061 break; 3062 case float_round_to_odd: 3063 increment = !(zSig1 & 0x1) && zSig2; 3064 break; 3065 default: 3066 abort(); 3067 } 3068 if ( 0x7FFD <= (uint32_t) zExp ) { 3069 if ( ( 0x7FFD < zExp ) 3070 || ( ( zExp == 0x7FFD ) 3071 && eq128( 3072 LIT64( 0x0001FFFFFFFFFFFF ), 3073 LIT64( 0xFFFFFFFFFFFFFFFF ), 3074 zSig0, 3075 zSig1 3076 ) 3077 && increment 3078 ) 3079 ) { 3080 float_raise(float_flag_overflow | float_flag_inexact, status); 3081 if ( ( roundingMode == float_round_to_zero ) 3082 || ( zSign && ( roundingMode == float_round_up ) ) 3083 || ( ! zSign && ( roundingMode == float_round_down ) ) 3084 || (roundingMode == float_round_to_odd) 3085 ) { 3086 return 3087 packFloat128( 3088 zSign, 3089 0x7FFE, 3090 LIT64( 0x0000FFFFFFFFFFFF ), 3091 LIT64( 0xFFFFFFFFFFFFFFFF ) 3092 ); 3093 } 3094 return packFloat128( zSign, 0x7FFF, 0, 0 ); 3095 } 3096 if ( zExp < 0 ) { 3097 if (status->flush_to_zero) { 3098 float_raise(float_flag_output_denormal, status); 3099 return packFloat128(zSign, 0, 0, 0); 3100 } 3101 isTiny = 3102 (status->float_detect_tininess 3103 == float_tininess_before_rounding) 3104 || ( zExp < -1 ) 3105 || ! 
increment 3106 || lt128( 3107 zSig0, 3108 zSig1, 3109 LIT64( 0x0001FFFFFFFFFFFF ), 3110 LIT64( 0xFFFFFFFFFFFFFFFF ) 3111 ); 3112 shift128ExtraRightJamming( 3113 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 3114 zExp = 0; 3115 if (isTiny && zSig2) { 3116 float_raise(float_flag_underflow, status); 3117 } 3118 switch (roundingMode) { 3119 case float_round_nearest_even: 3120 case float_round_ties_away: 3121 increment = ((int64_t)zSig2 < 0); 3122 break; 3123 case float_round_to_zero: 3124 increment = 0; 3125 break; 3126 case float_round_up: 3127 increment = !zSign && zSig2; 3128 break; 3129 case float_round_down: 3130 increment = zSign && zSig2; 3131 break; 3132 case float_round_to_odd: 3133 increment = !(zSig1 & 0x1) && zSig2; 3134 break; 3135 default: 3136 abort(); 3137 } 3138 } 3139 } 3140 if (zSig2) { 3141 status->float_exception_flags |= float_flag_inexact; 3142 } 3143 if ( increment ) { 3144 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 3145 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 3146 } 3147 else { 3148 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 3149 } 3150 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3151 3152 } 3153 3154 /*---------------------------------------------------------------------------- 3155 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3156 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3157 | returns the proper quadruple-precision floating-point value corresponding 3158 | to the abstract input. This routine is just like `roundAndPackFloat128' 3159 | except that the input significand has fewer bits and does not have to be 3160 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 3161 | point exponent. 
3162 *----------------------------------------------------------------------------*/ 3163 3164 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3165 uint64_t zSig0, uint64_t zSig1, 3166 float_status *status) 3167 { 3168 int8_t shiftCount; 3169 uint64_t zSig2; 3170 3171 if ( zSig0 == 0 ) { 3172 zSig0 = zSig1; 3173 zSig1 = 0; 3174 zExp -= 64; 3175 } 3176 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3177 if ( 0 <= shiftCount ) { 3178 zSig2 = 0; 3179 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3180 } 3181 else { 3182 shift128ExtraRightJamming( 3183 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3184 } 3185 zExp -= shiftCount; 3186 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3187 3188 } 3189 3190 3191 /*---------------------------------------------------------------------------- 3192 | Returns the result of converting the 32-bit two's complement integer `a' 3193 | to the extended double-precision floating-point format. The conversion 3194 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3195 | Arithmetic. 3196 *----------------------------------------------------------------------------*/ 3197 3198 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3199 { 3200 flag zSign; 3201 uint32_t absA; 3202 int8_t shiftCount; 3203 uint64_t zSig; 3204 3205 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3206 zSign = ( a < 0 ); 3207 absA = zSign ? - a : a; 3208 shiftCount = countLeadingZeros32( absA ) + 32; 3209 zSig = absA; 3210 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3211 3212 } 3213 3214 /*---------------------------------------------------------------------------- 3215 | Returns the result of converting the 32-bit two's complement integer `a' to 3216 | the quadruple-precision floating-point format. The conversion is performed 3217 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3218 *----------------------------------------------------------------------------*/ 3219 3220 float128 int32_to_float128(int32_t a, float_status *status) 3221 { 3222 flag zSign; 3223 uint32_t absA; 3224 int8_t shiftCount; 3225 uint64_t zSig0; 3226 3227 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3228 zSign = ( a < 0 ); 3229 absA = zSign ? - a : a; 3230 shiftCount = countLeadingZeros32( absA ) + 17; 3231 zSig0 = absA; 3232 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3233 3234 } 3235 3236 /*---------------------------------------------------------------------------- 3237 | Returns the result of converting the 64-bit two's complement integer `a' 3238 | to the extended double-precision floating-point format. The conversion 3239 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3240 | Arithmetic. 3241 *----------------------------------------------------------------------------*/ 3242 3243 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3244 { 3245 flag zSign; 3246 uint64_t absA; 3247 int8_t shiftCount; 3248 3249 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3250 zSign = ( a < 0 ); 3251 absA = zSign ? - a : a; 3252 shiftCount = countLeadingZeros64( absA ); 3253 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3254 3255 } 3256 3257 /*---------------------------------------------------------------------------- 3258 | Returns the result of converting the 64-bit two's complement integer `a' to 3259 | the quadruple-precision floating-point format. The conversion is performed 3260 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3261 *----------------------------------------------------------------------------*/ 3262 3263 float128 int64_to_float128(int64_t a, float_status *status) 3264 { 3265 flag zSign; 3266 uint64_t absA; 3267 int8_t shiftCount; 3268 int32_t zExp; 3269 uint64_t zSig0, zSig1; 3270 3271 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3272 zSign = ( a < 0 ); 3273 absA = zSign ? - a : a; 3274 shiftCount = countLeadingZeros64( absA ) + 49; 3275 zExp = 0x406E - shiftCount; 3276 if ( 64 <= shiftCount ) { 3277 zSig1 = 0; 3278 zSig0 = absA; 3279 shiftCount -= 64; 3280 } 3281 else { 3282 zSig1 = absA; 3283 zSig0 = 0; 3284 } 3285 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3286 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3287 3288 } 3289 3290 /*---------------------------------------------------------------------------- 3291 | Returns the result of converting the 64-bit unsigned integer `a' 3292 | to the quadruple-precision floating-point format. The conversion is performed 3293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3294 *----------------------------------------------------------------------------*/ 3295 3296 float128 uint64_to_float128(uint64_t a, float_status *status) 3297 { 3298 if (a == 0) { 3299 return float128_zero; 3300 } 3301 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 3302 } 3303 3304 /*---------------------------------------------------------------------------- 3305 | Returns the result of converting the single-precision floating-point value 3306 | `a' to the extended double-precision floating-point format. The conversion 3307 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3308 | Arithmetic. 
3309 *----------------------------------------------------------------------------*/ 3310 3311 floatx80 float32_to_floatx80(float32 a, float_status *status) 3312 { 3313 flag aSign; 3314 int aExp; 3315 uint32_t aSig; 3316 3317 a = float32_squash_input_denormal(a, status); 3318 aSig = extractFloat32Frac( a ); 3319 aExp = extractFloat32Exp( a ); 3320 aSign = extractFloat32Sign( a ); 3321 if ( aExp == 0xFF ) { 3322 if (aSig) { 3323 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3324 } 3325 return packFloatx80(aSign, 3326 floatx80_infinity_high, 3327 floatx80_infinity_low); 3328 } 3329 if ( aExp == 0 ) { 3330 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3331 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3332 } 3333 aSig |= 0x00800000; 3334 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3335 3336 } 3337 3338 /*---------------------------------------------------------------------------- 3339 | Returns the result of converting the single-precision floating-point value 3340 | `a' to the double-precision floating-point format. The conversion is 3341 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3342 | Arithmetic. 
3343 *----------------------------------------------------------------------------*/ 3344 3345 float128 float32_to_float128(float32 a, float_status *status) 3346 { 3347 flag aSign; 3348 int aExp; 3349 uint32_t aSig; 3350 3351 a = float32_squash_input_denormal(a, status); 3352 aSig = extractFloat32Frac( a ); 3353 aExp = extractFloat32Exp( a ); 3354 aSign = extractFloat32Sign( a ); 3355 if ( aExp == 0xFF ) { 3356 if (aSig) { 3357 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3358 } 3359 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3360 } 3361 if ( aExp == 0 ) { 3362 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3363 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3364 --aExp; 3365 } 3366 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3367 3368 } 3369 3370 /*---------------------------------------------------------------------------- 3371 | Returns the remainder of the single-precision floating-point value `a' 3372 | with respect to the corresponding value `b'. The operation is performed 3373 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3374 *----------------------------------------------------------------------------*/ 3375 3376 float32 float32_rem(float32 a, float32 b, float_status *status) 3377 { 3378 flag aSign, zSign; 3379 int aExp, bExp, expDiff; 3380 uint32_t aSig, bSig; 3381 uint32_t q; 3382 uint64_t aSig64, bSig64, q64; 3383 uint32_t alternateASig; 3384 int32_t sigMean; 3385 a = float32_squash_input_denormal(a, status); 3386 b = float32_squash_input_denormal(b, status); 3387 3388 aSig = extractFloat32Frac( a ); 3389 aExp = extractFloat32Exp( a ); 3390 aSign = extractFloat32Sign( a ); 3391 bSig = extractFloat32Frac( b ); 3392 bExp = extractFloat32Exp( b ); 3393 if ( aExp == 0xFF ) { 3394 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3395 return propagateFloat32NaN(a, b, status); 3396 } 3397 float_raise(float_flag_invalid, status); 3398 return float32_default_nan(status); 3399 } 3400 if ( bExp == 0xFF ) { 3401 if (bSig) { 3402 return propagateFloat32NaN(a, b, status); 3403 } 3404 return a; 3405 } 3406 if ( bExp == 0 ) { 3407 if ( bSig == 0 ) { 3408 float_raise(float_flag_invalid, status); 3409 return float32_default_nan(status); 3410 } 3411 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3412 } 3413 if ( aExp == 0 ) { 3414 if ( aSig == 0 ) return a; 3415 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3416 } 3417 expDiff = aExp - bExp; 3418 aSig |= 0x00800000; 3419 bSig |= 0x00800000; 3420 if ( expDiff < 32 ) { 3421 aSig <<= 8; 3422 bSig <<= 8; 3423 if ( expDiff < 0 ) { 3424 if ( expDiff < -1 ) return a; 3425 aSig >>= 1; 3426 } 3427 q = ( bSig <= aSig ); 3428 if ( q ) aSig -= bSig; 3429 if ( 0 < expDiff ) { 3430 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3431 q >>= 32 - expDiff; 3432 bSig >>= 2; 3433 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3434 } 3435 else { 3436 aSig >>= 2; 3437 bSig >>= 2; 3438 } 3439 } 3440 else { 3441 if ( bSig <= aSig ) aSig -= bSig; 3442 aSig64 = ( (uint64_t) aSig )<<40; 3443 bSig64 = ( (uint64_t) bSig )<<40; 3444 expDiff -= 64; 3445 while ( 0 < expDiff ) { 
3446 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3447 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3448 aSig64 = - ( ( bSig * q64 )<<38 ); 3449 expDiff -= 62; 3450 } 3451 expDiff += 64; 3452 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3453 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3454 q = q64>>( 64 - expDiff ); 3455 bSig <<= 6; 3456 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3457 } 3458 do { 3459 alternateASig = aSig; 3460 ++q; 3461 aSig -= bSig; 3462 } while ( 0 <= (int32_t) aSig ); 3463 sigMean = aSig + alternateASig; 3464 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3465 aSig = alternateASig; 3466 } 3467 zSign = ( (int32_t) aSig < 0 ); 3468 if ( zSign ) aSig = - aSig; 3469 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3470 } 3471 3472 3473 3474 /*---------------------------------------------------------------------------- 3475 | Returns the binary exponential of the single-precision floating-point value 3476 | `a'. The operation is performed according to the IEC/IEEE Standard for 3477 | Binary Floating-Point Arithmetic. 3478 | 3479 | Uses the following identities: 3480 | 3481 | 1. ------------------------------------------------------------------------- 3482 | x x*ln(2) 3483 | 2 = e 3484 | 3485 | 2. ------------------------------------------------------------------------- 3486 | 2 3 4 5 n 3487 | x x x x x x x 3488 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3489 | 1! 2! 3! 4! 5! n! 
3490 *----------------------------------------------------------------------------*/ 3491 3492 static const float64 float32_exp2_coefficients[15] = 3493 { 3494 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3495 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3496 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3497 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3498 const_float64( 0x3f81111111111111ll ), /* 5 */ 3499 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3500 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3501 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3502 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3503 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3504 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3505 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3506 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3507 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3508 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3509 }; 3510 3511 float32 float32_exp2(float32 a, float_status *status) 3512 { 3513 flag aSign; 3514 int aExp; 3515 uint32_t aSig; 3516 float64 r, x, xn; 3517 int i; 3518 a = float32_squash_input_denormal(a, status); 3519 3520 aSig = extractFloat32Frac( a ); 3521 aExp = extractFloat32Exp( a ); 3522 aSign = extractFloat32Sign( a ); 3523 3524 if ( aExp == 0xFF) { 3525 if (aSig) { 3526 return propagateFloat32NaN(a, float32_zero, status); 3527 } 3528 return (aSign) ? 
float32_zero : a; 3529 } 3530 if (aExp == 0) { 3531 if (aSig == 0) return float32_one; 3532 } 3533 3534 float_raise(float_flag_inexact, status); 3535 3536 /* ******************************* */ 3537 /* using float64 for approximation */ 3538 /* ******************************* */ 3539 x = float32_to_float64(a, status); 3540 x = float64_mul(x, float64_ln2, status); 3541 3542 xn = x; 3543 r = float64_one; 3544 for (i = 0 ; i < 15 ; i++) { 3545 float64 f; 3546 3547 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3548 r = float64_add(r, f, status); 3549 3550 xn = float64_mul(xn, x, status); 3551 } 3552 3553 return float64_to_float32(r, status); 3554 } 3555 3556 /*---------------------------------------------------------------------------- 3557 | Returns the binary log of the single-precision floating-point value `a'. 3558 | The operation is performed according to the IEC/IEEE Standard for Binary 3559 | Floating-Point Arithmetic. 3560 *----------------------------------------------------------------------------*/ 3561 float32 float32_log2(float32 a, float_status *status) 3562 { 3563 flag aSign, zSign; 3564 int aExp; 3565 uint32_t aSig, zSig, i; 3566 3567 a = float32_squash_input_denormal(a, status); 3568 aSig = extractFloat32Frac( a ); 3569 aExp = extractFloat32Exp( a ); 3570 aSign = extractFloat32Sign( a ); 3571 3572 if ( aExp == 0 ) { 3573 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3574 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3575 } 3576 if ( aSign ) { 3577 float_raise(float_flag_invalid, status); 3578 return float32_default_nan(status); 3579 } 3580 if ( aExp == 0xFF ) { 3581 if (aSig) { 3582 return propagateFloat32NaN(a, float32_zero, status); 3583 } 3584 return a; 3585 } 3586 3587 aExp -= 0x7F; 3588 aSig |= 0x00800000; 3589 zSign = aExp < 0; 3590 zSig = aExp << 23; 3591 3592 for (i = 1 << 22; i > 0; i >>= 1) { 3593 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3594 if ( aSig & 0x01000000 ) { 3595 aSig >>= 1; 3596 zSig |= i; 3597 } 3598 } 3599 
3600 if ( zSign ) 3601 zSig = -zSig; 3602 3603 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3604 } 3605 3606 /*---------------------------------------------------------------------------- 3607 | Returns 1 if the single-precision floating-point value `a' is equal to 3608 | the corresponding value `b', and 0 otherwise. The invalid exception is 3609 | raised if either operand is a NaN. Otherwise, the comparison is performed 3610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3611 *----------------------------------------------------------------------------*/ 3612 3613 int float32_eq(float32 a, float32 b, float_status *status) 3614 { 3615 uint32_t av, bv; 3616 a = float32_squash_input_denormal(a, status); 3617 b = float32_squash_input_denormal(b, status); 3618 3619 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3620 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3621 ) { 3622 float_raise(float_flag_invalid, status); 3623 return 0; 3624 } 3625 av = float32_val(a); 3626 bv = float32_val(b); 3627 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3628 } 3629 3630 /*---------------------------------------------------------------------------- 3631 | Returns 1 if the single-precision floating-point value `a' is less than 3632 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3633 | exception is raised if either operand is a NaN. The comparison is performed 3634 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3635 *----------------------------------------------------------------------------*/ 3636 3637 int float32_le(float32 a, float32 b, float_status *status) 3638 { 3639 flag aSign, bSign; 3640 uint32_t av, bv; 3641 a = float32_squash_input_denormal(a, status); 3642 b = float32_squash_input_denormal(b, status); 3643 3644 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3645 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3646 ) { 3647 float_raise(float_flag_invalid, status); 3648 return 0; 3649 } 3650 aSign = extractFloat32Sign( a ); 3651 bSign = extractFloat32Sign( b ); 3652 av = float32_val(a); 3653 bv = float32_val(b); 3654 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3655 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3656 3657 } 3658 3659 /*---------------------------------------------------------------------------- 3660 | Returns 1 if the single-precision floating-point value `a' is less than 3661 | the corresponding value `b', and 0 otherwise. The invalid exception is 3662 | raised if either operand is a NaN. The comparison is performed according 3663 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3664 *----------------------------------------------------------------------------*/ 3665 3666 int float32_lt(float32 a, float32 b, float_status *status) 3667 { 3668 flag aSign, bSign; 3669 uint32_t av, bv; 3670 a = float32_squash_input_denormal(a, status); 3671 b = float32_squash_input_denormal(b, status); 3672 3673 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3674 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3675 ) { 3676 float_raise(float_flag_invalid, status); 3677 return 0; 3678 } 3679 aSign = extractFloat32Sign( a ); 3680 bSign = extractFloat32Sign( b ); 3681 av = float32_val(a); 3682 bv = float32_val(b); 3683 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3684 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3685 3686 } 3687 3688 /*---------------------------------------------------------------------------- 3689 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3690 | be compared, and 0 otherwise. The invalid exception is raised if either 3691 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3692 | Standard for Binary Floating-Point Arithmetic. 3693 *----------------------------------------------------------------------------*/ 3694 3695 int float32_unordered(float32 a, float32 b, float_status *status) 3696 { 3697 a = float32_squash_input_denormal(a, status); 3698 b = float32_squash_input_denormal(b, status); 3699 3700 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3701 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3702 ) { 3703 float_raise(float_flag_invalid, status); 3704 return 1; 3705 } 3706 return 0; 3707 } 3708 3709 /*---------------------------------------------------------------------------- 3710 | Returns 1 if the single-precision floating-point value `a' is equal to 3711 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3712 | exception. 
The comparison is performed according to the IEC/IEEE Standard 3713 | for Binary Floating-Point Arithmetic. 3714 *----------------------------------------------------------------------------*/ 3715 3716 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3717 { 3718 a = float32_squash_input_denormal(a, status); 3719 b = float32_squash_input_denormal(b, status); 3720 3721 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3722 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3723 ) { 3724 if (float32_is_signaling_nan(a, status) 3725 || float32_is_signaling_nan(b, status)) { 3726 float_raise(float_flag_invalid, status); 3727 } 3728 return 0; 3729 } 3730 return ( float32_val(a) == float32_val(b) ) || 3731 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3732 } 3733 3734 /*---------------------------------------------------------------------------- 3735 | Returns 1 if the single-precision floating-point value `a' is less than or 3736 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3737 | cause an exception. Otherwise, the comparison is performed according to the 3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3739 *----------------------------------------------------------------------------*/ 3740 3741 int float32_le_quiet(float32 a, float32 b, float_status *status) 3742 { 3743 flag aSign, bSign; 3744 uint32_t av, bv; 3745 a = float32_squash_input_denormal(a, status); 3746 b = float32_squash_input_denormal(b, status); 3747 3748 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3749 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3750 ) { 3751 if (float32_is_signaling_nan(a, status) 3752 || float32_is_signaling_nan(b, status)) { 3753 float_raise(float_flag_invalid, status); 3754 } 3755 return 0; 3756 } 3757 aSign = extractFloat32Sign( a ); 3758 bSign = extractFloat32Sign( b ); 3759 av = float32_val(a); 3760 bv = float32_val(b); 3761 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3762 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3763 3764 } 3765 3766 /*---------------------------------------------------------------------------- 3767 | Returns 1 if the single-precision floating-point value `a' is less than 3768 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3769 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3770 | Standard for Binary Floating-Point Arithmetic. 
3771 *----------------------------------------------------------------------------*/ 3772 3773 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3774 { 3775 flag aSign, bSign; 3776 uint32_t av, bv; 3777 a = float32_squash_input_denormal(a, status); 3778 b = float32_squash_input_denormal(b, status); 3779 3780 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3781 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3782 ) { 3783 if (float32_is_signaling_nan(a, status) 3784 || float32_is_signaling_nan(b, status)) { 3785 float_raise(float_flag_invalid, status); 3786 } 3787 return 0; 3788 } 3789 aSign = extractFloat32Sign( a ); 3790 bSign = extractFloat32Sign( b ); 3791 av = float32_val(a); 3792 bv = float32_val(b); 3793 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3794 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3795 3796 } 3797 3798 /*---------------------------------------------------------------------------- 3799 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3800 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3801 | comparison is performed according to the IEC/IEEE Standard for Binary 3802 | Floating-Point Arithmetic. 
3803 *----------------------------------------------------------------------------*/ 3804 3805 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3806 { 3807 a = float32_squash_input_denormal(a, status); 3808 b = float32_squash_input_denormal(b, status); 3809 3810 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3811 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3812 ) { 3813 if (float32_is_signaling_nan(a, status) 3814 || float32_is_signaling_nan(b, status)) { 3815 float_raise(float_flag_invalid, status); 3816 } 3817 return 1; 3818 } 3819 return 0; 3820 } 3821 3822 /*---------------------------------------------------------------------------- 3823 | If `a' is denormal and we are in flush-to-zero mode then set the 3824 | input-denormal exception and return zero. Otherwise just return the value. 3825 *----------------------------------------------------------------------------*/ 3826 float16 float16_squash_input_denormal(float16 a, float_status *status) 3827 { 3828 if (status->flush_inputs_to_zero) { 3829 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3830 float_raise(float_flag_input_denormal, status); 3831 return make_float16(float16_val(a) & 0x8000); 3832 } 3833 } 3834 return a; 3835 } 3836 3837 /*---------------------------------------------------------------------------- 3838 | Returns the result of converting the double-precision floating-point value 3839 | `a' to the extended double-precision floating-point format. The conversion 3840 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3841 | Arithmetic. 
3842 *----------------------------------------------------------------------------*/ 3843 3844 floatx80 float64_to_floatx80(float64 a, float_status *status) 3845 { 3846 flag aSign; 3847 int aExp; 3848 uint64_t aSig; 3849 3850 a = float64_squash_input_denormal(a, status); 3851 aSig = extractFloat64Frac( a ); 3852 aExp = extractFloat64Exp( a ); 3853 aSign = extractFloat64Sign( a ); 3854 if ( aExp == 0x7FF ) { 3855 if (aSig) { 3856 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3857 } 3858 return packFloatx80(aSign, 3859 floatx80_infinity_high, 3860 floatx80_infinity_low); 3861 } 3862 if ( aExp == 0 ) { 3863 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3864 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3865 } 3866 return 3867 packFloatx80( 3868 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3869 3870 } 3871 3872 /*---------------------------------------------------------------------------- 3873 | Returns the result of converting the double-precision floating-point value 3874 | `a' to the quadruple-precision floating-point format. The conversion is 3875 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3876 | Arithmetic. 
3877 *----------------------------------------------------------------------------*/ 3878 3879 float128 float64_to_float128(float64 a, float_status *status) 3880 { 3881 flag aSign; 3882 int aExp; 3883 uint64_t aSig, zSig0, zSig1; 3884 3885 a = float64_squash_input_denormal(a, status); 3886 aSig = extractFloat64Frac( a ); 3887 aExp = extractFloat64Exp( a ); 3888 aSign = extractFloat64Sign( a ); 3889 if ( aExp == 0x7FF ) { 3890 if (aSig) { 3891 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3892 } 3893 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3894 } 3895 if ( aExp == 0 ) { 3896 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3897 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3898 --aExp; 3899 } 3900 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3901 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3902 3903 } 3904 3905 3906 /*---------------------------------------------------------------------------- 3907 | Returns the remainder of the double-precision floating-point value `a' 3908 | with respect to the corresponding value `b'. The operation is performed 3909 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3910 *----------------------------------------------------------------------------*/ 3911 3912 float64 float64_rem(float64 a, float64 b, float_status *status) 3913 { 3914 flag aSign, zSign; 3915 int aExp, bExp, expDiff; 3916 uint64_t aSig, bSig; 3917 uint64_t q, alternateASig; 3918 int64_t sigMean; 3919 3920 a = float64_squash_input_denormal(a, status); 3921 b = float64_squash_input_denormal(b, status); 3922 aSig = extractFloat64Frac( a ); 3923 aExp = extractFloat64Exp( a ); 3924 aSign = extractFloat64Sign( a ); 3925 bSig = extractFloat64Frac( b ); 3926 bExp = extractFloat64Exp( b ); 3927 if ( aExp == 0x7FF ) { 3928 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 3929 return propagateFloat64NaN(a, b, status); 3930 } 3931 float_raise(float_flag_invalid, status); 3932 return float64_default_nan(status); 3933 } 3934 if ( bExp == 0x7FF ) { 3935 if (bSig) { 3936 return propagateFloat64NaN(a, b, status); 3937 } 3938 return a; 3939 } 3940 if ( bExp == 0 ) { 3941 if ( bSig == 0 ) { 3942 float_raise(float_flag_invalid, status); 3943 return float64_default_nan(status); 3944 } 3945 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 3946 } 3947 if ( aExp == 0 ) { 3948 if ( aSig == 0 ) return a; 3949 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3950 } 3951 expDiff = aExp - bExp; 3952 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 3953 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 3954 if ( expDiff < 0 ) { 3955 if ( expDiff < -1 ) return a; 3956 aSig >>= 1; 3957 } 3958 q = ( bSig <= aSig ); 3959 if ( q ) aSig -= bSig; 3960 expDiff -= 64; 3961 while ( 0 < expDiff ) { 3962 q = estimateDiv128To64( aSig, 0, bSig ); 3963 q = ( 2 < q ) ? q - 2 : 0; 3964 aSig = - ( ( bSig>>2 ) * q ); 3965 expDiff -= 62; 3966 } 3967 expDiff += 64; 3968 if ( 0 < expDiff ) { 3969 q = estimateDiv128To64( aSig, 0, bSig ); 3970 q = ( 2 < q ) ? 
q - 2 : 0; 3971 q >>= 64 - expDiff; 3972 bSig >>= 2; 3973 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3974 } 3975 else { 3976 aSig >>= 2; 3977 bSig >>= 2; 3978 } 3979 do { 3980 alternateASig = aSig; 3981 ++q; 3982 aSig -= bSig; 3983 } while ( 0 <= (int64_t) aSig ); 3984 sigMean = aSig + alternateASig; 3985 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3986 aSig = alternateASig; 3987 } 3988 zSign = ( (int64_t) aSig < 0 ); 3989 if ( zSign ) aSig = - aSig; 3990 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 3991 3992 } 3993 3994 /*---------------------------------------------------------------------------- 3995 | Returns the binary log of the double-precision floating-point value `a'. 3996 | The operation is performed according to the IEC/IEEE Standard for Binary 3997 | Floating-Point Arithmetic. 3998 *----------------------------------------------------------------------------*/ 3999 float64 float64_log2(float64 a, float_status *status) 4000 { 4001 flag aSign, zSign; 4002 int aExp; 4003 uint64_t aSig, aSig0, aSig1, zSig, i; 4004 a = float64_squash_input_denormal(a, status); 4005 4006 aSig = extractFloat64Frac( a ); 4007 aExp = extractFloat64Exp( a ); 4008 aSign = extractFloat64Sign( a ); 4009 4010 if ( aExp == 0 ) { 4011 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4012 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4013 } 4014 if ( aSign ) { 4015 float_raise(float_flag_invalid, status); 4016 return float64_default_nan(status); 4017 } 4018 if ( aExp == 0x7FF ) { 4019 if (aSig) { 4020 return propagateFloat64NaN(a, float64_zero, status); 4021 } 4022 return a; 4023 } 4024 4025 aExp -= 0x3FF; 4026 aSig |= LIT64( 0x0010000000000000 ); 4027 zSign = aExp < 0; 4028 zSig = (uint64_t)aExp << 52; 4029 for (i = 1LL << 51; i > 0; i >>= 1) { 4030 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4031 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4032 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4033 aSig >>= 1; 4034 zSig |= i; 4035 } 
4036 } 4037 4038 if ( zSign ) 4039 zSig = -zSig; 4040 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4041 } 4042 4043 /*---------------------------------------------------------------------------- 4044 | Returns 1 if the double-precision floating-point value `a' is equal to the 4045 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4046 | if either operand is a NaN. Otherwise, the comparison is performed 4047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4048 *----------------------------------------------------------------------------*/ 4049 4050 int float64_eq(float64 a, float64 b, float_status *status) 4051 { 4052 uint64_t av, bv; 4053 a = float64_squash_input_denormal(a, status); 4054 b = float64_squash_input_denormal(b, status); 4055 4056 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4057 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4058 ) { 4059 float_raise(float_flag_invalid, status); 4060 return 0; 4061 } 4062 av = float64_val(a); 4063 bv = float64_val(b); 4064 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4065 4066 } 4067 4068 /*---------------------------------------------------------------------------- 4069 | Returns 1 if the double-precision floating-point value `a' is less than or 4070 | equal to the corresponding value `b', and 0 otherwise. The invalid 4071 | exception is raised if either operand is a NaN. The comparison is performed 4072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4073 *----------------------------------------------------------------------------*/ 4074 4075 int float64_le(float64 a, float64 b, float_status *status) 4076 { 4077 flag aSign, bSign; 4078 uint64_t av, bv; 4079 a = float64_squash_input_denormal(a, status); 4080 b = float64_squash_input_denormal(b, status); 4081 4082 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4083 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4084 ) { 4085 float_raise(float_flag_invalid, status); 4086 return 0; 4087 } 4088 aSign = extractFloat64Sign( a ); 4089 bSign = extractFloat64Sign( b ); 4090 av = float64_val(a); 4091 bv = float64_val(b); 4092 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4093 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4094 4095 } 4096 4097 /*---------------------------------------------------------------------------- 4098 | Returns 1 if the double-precision floating-point value `a' is less than 4099 | the corresponding value `b', and 0 otherwise. The invalid exception is 4100 | raised if either operand is a NaN. The comparison is performed according 4101 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4102 *----------------------------------------------------------------------------*/ 4103 4104 int float64_lt(float64 a, float64 b, float_status *status) 4105 { 4106 flag aSign, bSign; 4107 uint64_t av, bv; 4108 4109 a = float64_squash_input_denormal(a, status); 4110 b = float64_squash_input_denormal(b, status); 4111 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4112 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4113 ) { 4114 float_raise(float_flag_invalid, status); 4115 return 0; 4116 } 4117 aSign = extractFloat64Sign( a ); 4118 bSign = extractFloat64Sign( b ); 4119 av = float64_val(a); 4120 bv = float64_val(b); 4121 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4122 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4123 4124 } 4125 4126 /*---------------------------------------------------------------------------- 4127 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4128 | be compared, and 0 otherwise. The invalid exception is raised if either 4129 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4130 | Standard for Binary Floating-Point Arithmetic. 4131 *----------------------------------------------------------------------------*/ 4132 4133 int float64_unordered(float64 a, float64 b, float_status *status) 4134 { 4135 a = float64_squash_input_denormal(a, status); 4136 b = float64_squash_input_denormal(b, status); 4137 4138 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4139 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4140 ) { 4141 float_raise(float_flag_invalid, status); 4142 return 1; 4143 } 4144 return 0; 4145 } 4146 4147 /*---------------------------------------------------------------------------- 4148 | Returns 1 if the double-precision floating-point value `a' is equal to the 4149 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an
| exception. The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float64_eq_quiet(float64 a, float64 b, float_status *status)
{
    uint64_t av, bv;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);

    if ((extractFloat64Exp(a) == 0x7FF && extractFloat64Frac(a))
        || (extractFloat64Exp(b) == 0x7FF && extractFloat64Frac(b))) {
        /* A NaN operand compares unequal; only a signaling NaN raises
         * the invalid flag. */
        if (float64_is_signaling_nan(a, status)
            || float64_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    av = float64_val(a);
    bv = float64_val(b);
    if (av == bv) {
        return 1;
    }
    /* Different bit patterns are still equal when every bit below the top
     * one is clear in both operands (zeros of either sign). */
    return (uint64_t)((av | bv) << 1) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
| cause an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4180 *----------------------------------------------------------------------------*/ 4181 4182 int float64_le_quiet(float64 a, float64 b, float_status *status) 4183 { 4184 flag aSign, bSign; 4185 uint64_t av, bv; 4186 a = float64_squash_input_denormal(a, status); 4187 b = float64_squash_input_denormal(b, status); 4188 4189 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4190 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4191 ) { 4192 if (float64_is_signaling_nan(a, status) 4193 || float64_is_signaling_nan(b, status)) { 4194 float_raise(float_flag_invalid, status); 4195 } 4196 return 0; 4197 } 4198 aSign = extractFloat64Sign( a ); 4199 bSign = extractFloat64Sign( b ); 4200 av = float64_val(a); 4201 bv = float64_val(b); 4202 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4203 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4204 4205 } 4206 4207 /*---------------------------------------------------------------------------- 4208 | Returns 1 if the double-precision floating-point value `a' is less than 4209 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4210 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4211 | Standard for Binary Floating-Point Arithmetic. 
4212 *----------------------------------------------------------------------------*/ 4213 4214 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4215 { 4216 flag aSign, bSign; 4217 uint64_t av, bv; 4218 a = float64_squash_input_denormal(a, status); 4219 b = float64_squash_input_denormal(b, status); 4220 4221 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4222 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4223 ) { 4224 if (float64_is_signaling_nan(a, status) 4225 || float64_is_signaling_nan(b, status)) { 4226 float_raise(float_flag_invalid, status); 4227 } 4228 return 0; 4229 } 4230 aSign = extractFloat64Sign( a ); 4231 bSign = extractFloat64Sign( b ); 4232 av = float64_val(a); 4233 bv = float64_val(b); 4234 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4235 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4236 4237 } 4238 4239 /*---------------------------------------------------------------------------- 4240 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4241 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4242 | comparison is performed according to the IEC/IEEE Standard for Binary 4243 | Floating-Point Arithmetic. 
4244 *----------------------------------------------------------------------------*/ 4245 4246 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4247 { 4248 a = float64_squash_input_denormal(a, status); 4249 b = float64_squash_input_denormal(b, status); 4250 4251 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4252 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4253 ) { 4254 if (float64_is_signaling_nan(a, status) 4255 || float64_is_signaling_nan(b, status)) { 4256 float_raise(float_flag_invalid, status); 4257 } 4258 return 1; 4259 } 4260 return 0; 4261 } 4262 4263 /*---------------------------------------------------------------------------- 4264 | Returns the result of converting the extended double-precision floating- 4265 | point value `a' to the 32-bit two's complement integer format. The 4266 | conversion is performed according to the IEC/IEEE Standard for Binary 4267 | Floating-Point Arithmetic---which means in particular that the conversion 4268 | is rounded according to the current rounding mode. If `a' is a NaN, the 4269 | largest positive integer is returned. Otherwise, if the conversion 4270 | overflows, the largest integer with the same sign as `a' is returned. 
4271 *----------------------------------------------------------------------------*/ 4272 4273 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4274 { 4275 flag aSign; 4276 int32_t aExp, shiftCount; 4277 uint64_t aSig; 4278 4279 if (floatx80_invalid_encoding(a)) { 4280 float_raise(float_flag_invalid, status); 4281 return 1 << 31; 4282 } 4283 aSig = extractFloatx80Frac( a ); 4284 aExp = extractFloatx80Exp( a ); 4285 aSign = extractFloatx80Sign( a ); 4286 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4287 shiftCount = 0x4037 - aExp; 4288 if ( shiftCount <= 0 ) shiftCount = 1; 4289 shift64RightJamming( aSig, shiftCount, &aSig ); 4290 return roundAndPackInt32(aSign, aSig, status); 4291 4292 } 4293 4294 /*---------------------------------------------------------------------------- 4295 | Returns the result of converting the extended double-precision floating- 4296 | point value `a' to the 32-bit two's complement integer format. The 4297 | conversion is performed according to the IEC/IEEE Standard for Binary 4298 | Floating-Point Arithmetic, except that the conversion is always rounded 4299 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4300 | Otherwise, if the conversion overflows, the largest integer with the same 4301 | sign as `a' is returned. 
4302 *----------------------------------------------------------------------------*/ 4303 4304 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4305 { 4306 flag aSign; 4307 int32_t aExp, shiftCount; 4308 uint64_t aSig, savedASig; 4309 int32_t z; 4310 4311 if (floatx80_invalid_encoding(a)) { 4312 float_raise(float_flag_invalid, status); 4313 return 1 << 31; 4314 } 4315 aSig = extractFloatx80Frac( a ); 4316 aExp = extractFloatx80Exp( a ); 4317 aSign = extractFloatx80Sign( a ); 4318 if ( 0x401E < aExp ) { 4319 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4320 goto invalid; 4321 } 4322 else if ( aExp < 0x3FFF ) { 4323 if (aExp || aSig) { 4324 status->float_exception_flags |= float_flag_inexact; 4325 } 4326 return 0; 4327 } 4328 shiftCount = 0x403E - aExp; 4329 savedASig = aSig; 4330 aSig >>= shiftCount; 4331 z = aSig; 4332 if ( aSign ) z = - z; 4333 if ( ( z < 0 ) ^ aSign ) { 4334 invalid: 4335 float_raise(float_flag_invalid, status); 4336 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4337 } 4338 if ( ( aSig<<shiftCount ) != savedASig ) { 4339 status->float_exception_flags |= float_flag_inexact; 4340 } 4341 return z; 4342 4343 } 4344 4345 /*---------------------------------------------------------------------------- 4346 | Returns the result of converting the extended double-precision floating- 4347 | point value `a' to the 64-bit two's complement integer format. The 4348 | conversion is performed according to the IEC/IEEE Standard for Binary 4349 | Floating-Point Arithmetic---which means in particular that the conversion 4350 | is rounded according to the current rounding mode. If `a' is a NaN, 4351 | the largest positive integer is returned. Otherwise, if the conversion 4352 | overflows, the largest integer with the same sign as `a' is returned. 
4353 *----------------------------------------------------------------------------*/ 4354 4355 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4356 { 4357 flag aSign; 4358 int32_t aExp, shiftCount; 4359 uint64_t aSig, aSigExtra; 4360 4361 if (floatx80_invalid_encoding(a)) { 4362 float_raise(float_flag_invalid, status); 4363 return 1ULL << 63; 4364 } 4365 aSig = extractFloatx80Frac( a ); 4366 aExp = extractFloatx80Exp( a ); 4367 aSign = extractFloatx80Sign( a ); 4368 shiftCount = 0x403E - aExp; 4369 if ( shiftCount <= 0 ) { 4370 if ( shiftCount ) { 4371 float_raise(float_flag_invalid, status); 4372 if (!aSign || floatx80_is_any_nan(a)) { 4373 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4374 } 4375 return (int64_t) LIT64( 0x8000000000000000 ); 4376 } 4377 aSigExtra = 0; 4378 } 4379 else { 4380 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4381 } 4382 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4383 4384 } 4385 4386 /*---------------------------------------------------------------------------- 4387 | Returns the result of converting the extended double-precision floating- 4388 | point value `a' to the 64-bit two's complement integer format. The 4389 | conversion is performed according to the IEC/IEEE Standard for Binary 4390 | Floating-Point Arithmetic, except that the conversion is always rounded 4391 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4392 | Otherwise, if the conversion overflows, the largest integer with the same 4393 | sign as `a' is returned. 
4394 *----------------------------------------------------------------------------*/ 4395 4396 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4397 { 4398 flag aSign; 4399 int32_t aExp, shiftCount; 4400 uint64_t aSig; 4401 int64_t z; 4402 4403 if (floatx80_invalid_encoding(a)) { 4404 float_raise(float_flag_invalid, status); 4405 return 1ULL << 63; 4406 } 4407 aSig = extractFloatx80Frac( a ); 4408 aExp = extractFloatx80Exp( a ); 4409 aSign = extractFloatx80Sign( a ); 4410 shiftCount = aExp - 0x403E; 4411 if ( 0 <= shiftCount ) { 4412 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4413 if ( ( a.high != 0xC03E ) || aSig ) { 4414 float_raise(float_flag_invalid, status); 4415 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4416 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4417 } 4418 } 4419 return (int64_t) LIT64( 0x8000000000000000 ); 4420 } 4421 else if ( aExp < 0x3FFF ) { 4422 if (aExp | aSig) { 4423 status->float_exception_flags |= float_flag_inexact; 4424 } 4425 return 0; 4426 } 4427 z = aSig>>( - shiftCount ); 4428 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4429 status->float_exception_flags |= float_flag_inexact; 4430 } 4431 if ( aSign ) z = - z; 4432 return z; 4433 4434 } 4435 4436 /*---------------------------------------------------------------------------- 4437 | Returns the result of converting the extended double-precision floating- 4438 | point value `a' to the single-precision floating-point format. The 4439 | conversion is performed according to the IEC/IEEE Standard for Binary 4440 | Floating-Point Arithmetic. 
4441 *----------------------------------------------------------------------------*/ 4442 4443 float32 floatx80_to_float32(floatx80 a, float_status *status) 4444 { 4445 flag aSign; 4446 int32_t aExp; 4447 uint64_t aSig; 4448 4449 if (floatx80_invalid_encoding(a)) { 4450 float_raise(float_flag_invalid, status); 4451 return float32_default_nan(status); 4452 } 4453 aSig = extractFloatx80Frac( a ); 4454 aExp = extractFloatx80Exp( a ); 4455 aSign = extractFloatx80Sign( a ); 4456 if ( aExp == 0x7FFF ) { 4457 if ( (uint64_t) ( aSig<<1 ) ) { 4458 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4459 } 4460 return packFloat32( aSign, 0xFF, 0 ); 4461 } 4462 shift64RightJamming( aSig, 33, &aSig ); 4463 if ( aExp || aSig ) aExp -= 0x3F81; 4464 return roundAndPackFloat32(aSign, aExp, aSig, status); 4465 4466 } 4467 4468 /*---------------------------------------------------------------------------- 4469 | Returns the result of converting the extended double-precision floating- 4470 | point value `a' to the double-precision floating-point format. The 4471 | conversion is performed according to the IEC/IEEE Standard for Binary 4472 | Floating-Point Arithmetic. 
4473 *----------------------------------------------------------------------------*/ 4474 4475 float64 floatx80_to_float64(floatx80 a, float_status *status) 4476 { 4477 flag aSign; 4478 int32_t aExp; 4479 uint64_t aSig, zSig; 4480 4481 if (floatx80_invalid_encoding(a)) { 4482 float_raise(float_flag_invalid, status); 4483 return float64_default_nan(status); 4484 } 4485 aSig = extractFloatx80Frac( a ); 4486 aExp = extractFloatx80Exp( a ); 4487 aSign = extractFloatx80Sign( a ); 4488 if ( aExp == 0x7FFF ) { 4489 if ( (uint64_t) ( aSig<<1 ) ) { 4490 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4491 } 4492 return packFloat64( aSign, 0x7FF, 0 ); 4493 } 4494 shift64RightJamming( aSig, 1, &zSig ); 4495 if ( aExp || aSig ) aExp -= 0x3C01; 4496 return roundAndPackFloat64(aSign, aExp, zSig, status); 4497 4498 } 4499 4500 /*---------------------------------------------------------------------------- 4501 | Returns the result of converting the extended double-precision floating- 4502 | point value `a' to the quadruple-precision floating-point format. The 4503 | conversion is performed according to the IEC/IEEE Standard for Binary 4504 | Floating-Point Arithmetic. 
4505 *----------------------------------------------------------------------------*/ 4506 4507 float128 floatx80_to_float128(floatx80 a, float_status *status) 4508 { 4509 flag aSign; 4510 int aExp; 4511 uint64_t aSig, zSig0, zSig1; 4512 4513 if (floatx80_invalid_encoding(a)) { 4514 float_raise(float_flag_invalid, status); 4515 return float128_default_nan(status); 4516 } 4517 aSig = extractFloatx80Frac( a ); 4518 aExp = extractFloatx80Exp( a ); 4519 aSign = extractFloatx80Sign( a ); 4520 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4521 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4522 } 4523 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4524 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4525 4526 } 4527 4528 /*---------------------------------------------------------------------------- 4529 | Rounds the extended double-precision floating-point value `a' 4530 | to the precision provided by floatx80_rounding_precision and returns the 4531 | result as an extended double-precision floating-point value. 4532 | The operation is performed according to the IEC/IEEE Standard for Binary 4533 | Floating-Point Arithmetic. 4534 *----------------------------------------------------------------------------*/ 4535 4536 floatx80 floatx80_round(floatx80 a, float_status *status) 4537 { 4538 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4539 extractFloatx80Sign(a), 4540 extractFloatx80Exp(a), 4541 extractFloatx80Frac(a), 0, status); 4542 } 4543 4544 /*---------------------------------------------------------------------------- 4545 | Rounds the extended double-precision floating-point value `a' to an integer, 4546 | and returns the result as an extended quadruple-precision floating-point 4547 | value. The operation is performed according to the IEC/IEEE Standard for 4548 | Binary Floating-Point Arithmetic. 
4549 *----------------------------------------------------------------------------*/ 4550 4551 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4552 { 4553 flag aSign; 4554 int32_t aExp; 4555 uint64_t lastBitMask, roundBitsMask; 4556 floatx80 z; 4557 4558 if (floatx80_invalid_encoding(a)) { 4559 float_raise(float_flag_invalid, status); 4560 return floatx80_default_nan(status); 4561 } 4562 aExp = extractFloatx80Exp( a ); 4563 if ( 0x403E <= aExp ) { 4564 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4565 return propagateFloatx80NaN(a, a, status); 4566 } 4567 return a; 4568 } 4569 if ( aExp < 0x3FFF ) { 4570 if ( ( aExp == 0 ) 4571 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4572 return a; 4573 } 4574 status->float_exception_flags |= float_flag_inexact; 4575 aSign = extractFloatx80Sign( a ); 4576 switch (status->float_rounding_mode) { 4577 case float_round_nearest_even: 4578 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4579 ) { 4580 return 4581 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4582 } 4583 break; 4584 case float_round_ties_away: 4585 if (aExp == 0x3FFE) { 4586 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4587 } 4588 break; 4589 case float_round_down: 4590 return 4591 aSign ? 4592 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4593 : packFloatx80( 0, 0, 0 ); 4594 case float_round_up: 4595 return 4596 aSign ? 
packFloatx80( 1, 0, 0 ) 4597 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4598 } 4599 return packFloatx80( aSign, 0, 0 ); 4600 } 4601 lastBitMask = 1; 4602 lastBitMask <<= 0x403E - aExp; 4603 roundBitsMask = lastBitMask - 1; 4604 z = a; 4605 switch (status->float_rounding_mode) { 4606 case float_round_nearest_even: 4607 z.low += lastBitMask>>1; 4608 if ((z.low & roundBitsMask) == 0) { 4609 z.low &= ~lastBitMask; 4610 } 4611 break; 4612 case float_round_ties_away: 4613 z.low += lastBitMask >> 1; 4614 break; 4615 case float_round_to_zero: 4616 break; 4617 case float_round_up: 4618 if (!extractFloatx80Sign(z)) { 4619 z.low += roundBitsMask; 4620 } 4621 break; 4622 case float_round_down: 4623 if (extractFloatx80Sign(z)) { 4624 z.low += roundBitsMask; 4625 } 4626 break; 4627 default: 4628 abort(); 4629 } 4630 z.low &= ~ roundBitsMask; 4631 if ( z.low == 0 ) { 4632 ++z.high; 4633 z.low = LIT64( 0x8000000000000000 ); 4634 } 4635 if (z.low != a.low) { 4636 status->float_exception_flags |= float_flag_inexact; 4637 } 4638 return z; 4639 4640 } 4641 4642 /*---------------------------------------------------------------------------- 4643 | Returns the result of adding the absolute values of the extended double- 4644 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4645 | negated before being returned. `zSign' is ignored if the result is a NaN. 4646 | The addition is performed according to the IEC/IEEE Standard for Binary 4647 | Floating-Point Arithmetic. 
4648 *----------------------------------------------------------------------------*/ 4649 4650 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4651 float_status *status) 4652 { 4653 int32_t aExp, bExp, zExp; 4654 uint64_t aSig, bSig, zSig0, zSig1; 4655 int32_t expDiff; 4656 4657 aSig = extractFloatx80Frac( a ); 4658 aExp = extractFloatx80Exp( a ); 4659 bSig = extractFloatx80Frac( b ); 4660 bExp = extractFloatx80Exp( b ); 4661 expDiff = aExp - bExp; 4662 if ( 0 < expDiff ) { 4663 if ( aExp == 0x7FFF ) { 4664 if ((uint64_t)(aSig << 1)) { 4665 return propagateFloatx80NaN(a, b, status); 4666 } 4667 return a; 4668 } 4669 if ( bExp == 0 ) --expDiff; 4670 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4671 zExp = aExp; 4672 } 4673 else if ( expDiff < 0 ) { 4674 if ( bExp == 0x7FFF ) { 4675 if ((uint64_t)(bSig << 1)) { 4676 return propagateFloatx80NaN(a, b, status); 4677 } 4678 return packFloatx80(zSign, 4679 floatx80_infinity_high, 4680 floatx80_infinity_low); 4681 } 4682 if ( aExp == 0 ) ++expDiff; 4683 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4684 zExp = bExp; 4685 } 4686 else { 4687 if ( aExp == 0x7FFF ) { 4688 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4689 return propagateFloatx80NaN(a, b, status); 4690 } 4691 return a; 4692 } 4693 zSig1 = 0; 4694 zSig0 = aSig + bSig; 4695 if ( aExp == 0 ) { 4696 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4697 goto roundAndPack; 4698 } 4699 zExp = aExp; 4700 goto shiftRight1; 4701 } 4702 zSig0 = aSig + bSig; 4703 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4704 shiftRight1: 4705 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4706 zSig0 |= LIT64( 0x8000000000000000 ); 4707 ++zExp; 4708 roundAndPack: 4709 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4710 zSign, zExp, zSig0, zSig1, status); 4711 } 4712 4713 /*---------------------------------------------------------------------------- 4714 | Returns the result of subtracting the absolute 
values of the extended 4715 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4716 | difference is negated before being returned. `zSign' is ignored if the 4717 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4718 | Standard for Binary Floating-Point Arithmetic. 4719 *----------------------------------------------------------------------------*/ 4720 4721 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4722 float_status *status) 4723 { 4724 int32_t aExp, bExp, zExp; 4725 uint64_t aSig, bSig, zSig0, zSig1; 4726 int32_t expDiff; 4727 4728 aSig = extractFloatx80Frac( a ); 4729 aExp = extractFloatx80Exp( a ); 4730 bSig = extractFloatx80Frac( b ); 4731 bExp = extractFloatx80Exp( b ); 4732 expDiff = aExp - bExp; 4733 if ( 0 < expDiff ) goto aExpBigger; 4734 if ( expDiff < 0 ) goto bExpBigger; 4735 if ( aExp == 0x7FFF ) { 4736 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4737 return propagateFloatx80NaN(a, b, status); 4738 } 4739 float_raise(float_flag_invalid, status); 4740 return floatx80_default_nan(status); 4741 } 4742 if ( aExp == 0 ) { 4743 aExp = 1; 4744 bExp = 1; 4745 } 4746 zSig1 = 0; 4747 if ( bSig < aSig ) goto aBigger; 4748 if ( aSig < bSig ) goto bBigger; 4749 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4750 bExpBigger: 4751 if ( bExp == 0x7FFF ) { 4752 if ((uint64_t)(bSig << 1)) { 4753 return propagateFloatx80NaN(a, b, status); 4754 } 4755 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 4756 floatx80_infinity_low); 4757 } 4758 if ( aExp == 0 ) ++expDiff; 4759 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4760 bBigger: 4761 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4762 zExp = bExp; 4763 zSign ^= 1; 4764 goto normalizeRoundAndPack; 4765 aExpBigger: 4766 if ( aExp == 0x7FFF ) { 4767 if ((uint64_t)(aSig << 1)) { 4768 return propagateFloatx80NaN(a, b, status); 4769 } 4770 return a; 4771 } 4772 if ( bExp == 0 ) --expDiff; 4773 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4774 aBigger: 4775 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4776 zExp = aExp; 4777 normalizeRoundAndPack: 4778 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4779 zSign, zExp, zSig0, zSig1, status); 4780 } 4781 4782 /*---------------------------------------------------------------------------- 4783 | Returns the result of adding the extended double-precision floating-point 4784 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4785 | Standard for Binary Floating-Point Arithmetic. 4786 *----------------------------------------------------------------------------*/ 4787 4788 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4789 { 4790 flag aSign, bSign; 4791 4792 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4793 float_raise(float_flag_invalid, status); 4794 return floatx80_default_nan(status); 4795 } 4796 aSign = extractFloatx80Sign( a ); 4797 bSign = extractFloatx80Sign( b ); 4798 if ( aSign == bSign ) { 4799 return addFloatx80Sigs(a, b, aSign, status); 4800 } 4801 else { 4802 return subFloatx80Sigs(a, b, aSign, status); 4803 } 4804 4805 } 4806 4807 /*---------------------------------------------------------------------------- 4808 | Returns the result of subtracting the extended double-precision floating- 4809 | point values `a' and `b'. The operation is performed according to the 4810 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    /* An invalid encoding in either operand raises the invalid flag and
       yields the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Equal signs go to the significand-subtract helper, differing signs to
       the significand-add helper; a's sign is passed as zSign in both
       cases. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'. The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* Either operand has exponent 0x7FFF with lower significand bits
           set: propagate the NaN. */
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* b is zero (exponent and significand both zero): invalid. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* a is zero (exponent and significand both zero): invalid.  The
           label is also the target of the symmetric check above. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* Zero exponent field: a zero significand gives a signed zero result,
       anything else is normalized first. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Full 64x64 -> 128-bit product in zSig0:zSig1. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* True when zSig0 is nonzero with its top bit clear: shift the product
       up by one bit and compensate in the exponent. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* An invalid encoding in either operand raises the invalid flag and
       yields the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Both exponents are 0x7FFF and neither operand is a NaN:
               invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Divisor has exponent 0x7FFF and is not a NaN: signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Zero divisor: 0/0 is invalid, anything else raises
               divide-by-zero and returns a signed infinity. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* When the dividend significand is not smaller than the divisor's,
       halve it (the shifted-out bit lands in rem1) and bump the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: take an estimate, then decrement it until the
       128-bit remainder rem0:rem1 is no longer negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word: the estimate is corrected only when its value,
       ignoring the top bit, is at most 4. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any nonzero remainder into the lowest bit (sticky bit). */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4985 *----------------------------------------------------------------------------*/ 4986 4987 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 4988 { 4989 flag aSign, zSign; 4990 int32_t aExp, bExp, expDiff; 4991 uint64_t aSig0, aSig1, bSig; 4992 uint64_t q, term0, term1, alternateASig0, alternateASig1; 4993 4994 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4995 float_raise(float_flag_invalid, status); 4996 return floatx80_default_nan(status); 4997 } 4998 aSig0 = extractFloatx80Frac( a ); 4999 aExp = extractFloatx80Exp( a ); 5000 aSign = extractFloatx80Sign( a ); 5001 bSig = extractFloatx80Frac( b ); 5002 bExp = extractFloatx80Exp( b ); 5003 if ( aExp == 0x7FFF ) { 5004 if ( (uint64_t) ( aSig0<<1 ) 5005 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5006 return propagateFloatx80NaN(a, b, status); 5007 } 5008 goto invalid; 5009 } 5010 if ( bExp == 0x7FFF ) { 5011 if ((uint64_t)(bSig << 1)) { 5012 return propagateFloatx80NaN(a, b, status); 5013 } 5014 return a; 5015 } 5016 if ( bExp == 0 ) { 5017 if ( bSig == 0 ) { 5018 invalid: 5019 float_raise(float_flag_invalid, status); 5020 return floatx80_default_nan(status); 5021 } 5022 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5023 } 5024 if ( aExp == 0 ) { 5025 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5026 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5027 } 5028 bSig |= LIT64( 0x8000000000000000 ); 5029 zSign = aSign; 5030 expDiff = aExp - bExp; 5031 aSig1 = 0; 5032 if ( expDiff < 0 ) { 5033 if ( expDiff < -1 ) return a; 5034 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5035 expDiff = 0; 5036 } 5037 q = ( bSig <= aSig0 ); 5038 if ( q ) aSig0 -= bSig; 5039 expDiff -= 64; 5040 while ( 0 < expDiff ) { 5041 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5042 q = ( 2 < q ) ? 
q - 2 : 0; 5043 mul64To128( bSig, q, &term0, &term1 ); 5044 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5045 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5046 expDiff -= 62; 5047 } 5048 expDiff += 64; 5049 if ( 0 < expDiff ) { 5050 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5051 q = ( 2 < q ) ? q - 2 : 0; 5052 q >>= 64 - expDiff; 5053 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5054 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5055 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5056 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5057 ++q; 5058 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5059 } 5060 } 5061 else { 5062 term1 = 0; 5063 term0 = bSig; 5064 } 5065 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5066 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5067 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5068 && ( q & 1 ) ) 5069 ) { 5070 aSig0 = alternateASig0; 5071 aSig1 = alternateASig1; 5072 zSign = ! zSign; 5073 } 5074 return 5075 normalizeRoundAndPackFloatx80( 5076 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5077 5078 } 5079 5080 /*---------------------------------------------------------------------------- 5081 | Returns the square root of the extended double-precision floating-point 5082 | value `a'. The operation is performed according to the IEC/IEEE Standard 5083 | for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* An invalid encoding raises the invalid flag and yields the default
       NaN. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Lower significand bits set: propagate the NaN (a is passed as
           both operands). */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* Otherwise: positive input is returned as is, negative input is
           invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned unchanged; any other negative input is
           invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the exponent's offset from 0x3FFF. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial estimate from the top 32 significand bits, then widened via
       a 128-by-64 division estimate. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* Shift by 2 or 3 depending on the low bit of the exponent. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem0:rem1 = a - zSig0^2; step zSig0 down until it is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word: the estimate is corrected only when its low 62 bits
       are at most 5. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero remainder into the lowest bit (sticky bit). */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Reassemble the 128-bit root from doubleZSig0 and zSig1 shifted left
       by one bit. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is equal
| to the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{

    /* Invalid encodings and NaNs (exponent 0x7FFF with any significand bit
       below the top one set) raise invalid and compare unequal. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
        || (extractFloatx80Exp(a) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t) (extractFloatx80Frac(b) << 1))
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the bit patterns match, or when both low words are zero
       and both high words are zero apart from the sign bit (+0 == -0). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5185 *----------------------------------------------------------------------------*/ 5186 5187 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5188 { 5189 flag aSign, bSign; 5190 5191 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5192 || (extractFloatx80Exp(a) == 0x7FFF 5193 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5194 || (extractFloatx80Exp(b) == 0x7FFF 5195 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5196 ) { 5197 float_raise(float_flag_invalid, status); 5198 return 0; 5199 } 5200 aSign = extractFloatx80Sign( a ); 5201 bSign = extractFloatx80Sign( b ); 5202 if ( aSign != bSign ) { 5203 return 5204 aSign 5205 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5206 == 0 ); 5207 } 5208 return 5209 aSign ? le128( b.high, b.low, a.high, a.low ) 5210 : le128( a.high, a.low, b.high, b.low ); 5211 5212 } 5213 5214 /*---------------------------------------------------------------------------- 5215 | Returns 1 if the extended double-precision floating-point value `a' is 5216 | less than the corresponding value `b', and 0 otherwise. The invalid 5217 | exception is raised if either operand is a NaN. The comparison is performed 5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5219 *----------------------------------------------------------------------------*/ 5220 5221 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5222 { 5223 flag aSign, bSign; 5224 5225 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5226 || (extractFloatx80Exp(a) == 0x7FFF 5227 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5228 || (extractFloatx80Exp(b) == 0x7FFF 5229 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5230 ) { 5231 float_raise(float_flag_invalid, status); 5232 return 0; 5233 } 5234 aSign = extractFloatx80Sign( a ); 5235 bSign = extractFloatx80Sign( b ); 5236 if ( aSign != bSign ) { 5237 return 5238 aSign 5239 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5240 != 0 ); 5241 } 5242 return 5243 aSign ? lt128( b.high, b.low, a.high, a.low ) 5244 : lt128( a.high, a.low, b.high, b.low ); 5245 5246 } 5247 5248 /*---------------------------------------------------------------------------- 5249 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5250 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5251 | either operand is a NaN. The comparison is performed according to the 5252 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5253 *----------------------------------------------------------------------------*/ 5254 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5255 { 5256 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5257 || (extractFloatx80Exp(a) == 0x7FFF 5258 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5259 || (extractFloatx80Exp(b) == 0x7FFF 5260 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5261 ) { 5262 float_raise(float_flag_invalid, status); 5263 return 1; 5264 } 5265 return 0; 5266 } 5267 5268 /*---------------------------------------------------------------------------- 5269 | Returns 1 if the extended double-precision floating-point value `a' is 5270 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5271 | cause an exception. The comparison is performed according to the IEC/IEEE 5272 | Standard for Binary Floating-Point Arithmetic. 5273 *----------------------------------------------------------------------------*/ 5274 5275 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5276 { 5277 5278 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5279 float_raise(float_flag_invalid, status); 5280 return 0; 5281 } 5282 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5283 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5284 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5285 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5286 ) { 5287 if (floatx80_is_signaling_nan(a, status) 5288 || floatx80_is_signaling_nan(b, status)) { 5289 float_raise(float_flag_invalid, status); 5290 } 5291 return 0; 5292 } 5293 return 5294 ( a.low == b.low ) 5295 && ( ( a.high == b.high ) 5296 || ( ( a.low == 0 ) 5297 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5298 ); 5299 5300 } 5301 5302 /*---------------------------------------------------------------------------- 5303 | Returns 1 if the extended double-precision floating-point value `a' is less 5304 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag signA, signB;
    int anyNaN;

    /* Invalid encodings always raise invalid. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    anyNaN = (extractFloatx80Exp(a) == 0x7FFF
              && (uint64_t) (extractFloatx80Frac(a) << 1))
          || (extractFloatx80Exp(b) == 0x7FFF
              && (uint64_t) (extractFloatx80Frac(b) << 1));
    if (anyNaN) {
        /* Only a signaling NaN raises invalid; the answer is 0 either way. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    signA = extractFloatx80Sign(a);
    signB = extractFloatx80Sign(b);
    if (signA != signB) {
        /* Opposite signs: true when a is the negative operand, or when
           every bit other than the sign bits is clear in both operands. */
        if (signA) {
            return 1;
        }
        return (((uint16_t) ((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }

    /* Same sign: 128-bit comparison of the raw words, with the operands
       swapped when both are negative. */
    if (signA) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/ 5348 5349 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5350 { 5351 flag aSign, bSign; 5352 5353 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5354 float_raise(float_flag_invalid, status); 5355 return 0; 5356 } 5357 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5358 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5359 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5360 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5361 ) { 5362 if (floatx80_is_signaling_nan(a, status) 5363 || floatx80_is_signaling_nan(b, status)) { 5364 float_raise(float_flag_invalid, status); 5365 } 5366 return 0; 5367 } 5368 aSign = extractFloatx80Sign( a ); 5369 bSign = extractFloatx80Sign( b ); 5370 if ( aSign != bSign ) { 5371 return 5372 aSign 5373 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5374 != 0 ); 5375 } 5376 return 5377 aSign ? lt128( b.high, b.low, a.high, a.low ) 5378 : lt128( a.high, a.low, b.high, b.low ); 5379 5380 } 5381 5382 /*---------------------------------------------------------------------------- 5383 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5384 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5385 | The comparison is performed according to the IEC/IEEE Standard for Binary 5386 | Floating-Point Arithmetic. 
5387 *----------------------------------------------------------------------------*/ 5388 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5389 { 5390 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5391 float_raise(float_flag_invalid, status); 5392 return 1; 5393 } 5394 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5395 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5396 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5397 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5398 ) { 5399 if (floatx80_is_signaling_nan(a, status) 5400 || floatx80_is_signaling_nan(b, status)) { 5401 float_raise(float_flag_invalid, status); 5402 } 5403 return 1; 5404 } 5405 return 0; 5406 } 5407 5408 /*---------------------------------------------------------------------------- 5409 | Returns the result of converting the quadruple-precision floating-point 5410 | value `a' to the 32-bit two's complement integer format. The conversion 5411 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5412 | Arithmetic---which means in particular that the conversion is rounded 5413 | according to the current rounding mode. If `a' is a NaN, the largest 5414 | positive integer is returned. Otherwise, if the conversion overflows, the 5415 | largest integer with the same sign as `a' is returned. 
5416 *----------------------------------------------------------------------------*/ 5417 5418 int32_t float128_to_int32(float128 a, float_status *status) 5419 { 5420 flag aSign; 5421 int32_t aExp, shiftCount; 5422 uint64_t aSig0, aSig1; 5423 5424 aSig1 = extractFloat128Frac1( a ); 5425 aSig0 = extractFloat128Frac0( a ); 5426 aExp = extractFloat128Exp( a ); 5427 aSign = extractFloat128Sign( a ); 5428 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5429 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5430 aSig0 |= ( aSig1 != 0 ); 5431 shiftCount = 0x4028 - aExp; 5432 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5433 return roundAndPackInt32(aSign, aSig0, status); 5434 5435 } 5436 5437 /*---------------------------------------------------------------------------- 5438 | Returns the result of converting the quadruple-precision floating-point 5439 | value `a' to the 32-bit two's complement integer format. The conversion 5440 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5441 | Arithmetic, except that the conversion is always rounded toward zero. If 5442 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5443 | conversion overflows, the largest integer with the same sign as `a' is 5444 | returned. 
5445 *----------------------------------------------------------------------------*/ 5446 5447 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5448 { 5449 flag aSign; 5450 int32_t aExp, shiftCount; 5451 uint64_t aSig0, aSig1, savedASig; 5452 int32_t z; 5453 5454 aSig1 = extractFloat128Frac1( a ); 5455 aSig0 = extractFloat128Frac0( a ); 5456 aExp = extractFloat128Exp( a ); 5457 aSign = extractFloat128Sign( a ); 5458 aSig0 |= ( aSig1 != 0 ); 5459 if ( 0x401E < aExp ) { 5460 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5461 goto invalid; 5462 } 5463 else if ( aExp < 0x3FFF ) { 5464 if (aExp || aSig0) { 5465 status->float_exception_flags |= float_flag_inexact; 5466 } 5467 return 0; 5468 } 5469 aSig0 |= LIT64( 0x0001000000000000 ); 5470 shiftCount = 0x402F - aExp; 5471 savedASig = aSig0; 5472 aSig0 >>= shiftCount; 5473 z = aSig0; 5474 if ( aSign ) z = - z; 5475 if ( ( z < 0 ) ^ aSign ) { 5476 invalid: 5477 float_raise(float_flag_invalid, status); 5478 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5479 } 5480 if ( ( aSig0<<shiftCount ) != savedASig ) { 5481 status->float_exception_flags |= float_flag_inexact; 5482 } 5483 return z; 5484 5485 } 5486 5487 /*---------------------------------------------------------------------------- 5488 | Returns the result of converting the quadruple-precision floating-point 5489 | value `a' to the 64-bit two's complement integer format. The conversion 5490 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5491 | Arithmetic---which means in particular that the conversion is rounded 5492 | according to the current rounding mode. If `a' is a NaN, the largest 5493 | positive integer is returned. Otherwise, if the conversion overflows, the 5494 | largest integer with the same sign as `a' is returned. 
5495 *----------------------------------------------------------------------------*/ 5496 5497 int64_t float128_to_int64(float128 a, float_status *status) 5498 { 5499 flag aSign; 5500 int32_t aExp, shiftCount; 5501 uint64_t aSig0, aSig1; 5502 5503 aSig1 = extractFloat128Frac1( a ); 5504 aSig0 = extractFloat128Frac0( a ); 5505 aExp = extractFloat128Exp( a ); 5506 aSign = extractFloat128Sign( a ); 5507 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5508 shiftCount = 0x402F - aExp; 5509 if ( shiftCount <= 0 ) { 5510 if ( 0x403E < aExp ) { 5511 float_raise(float_flag_invalid, status); 5512 if ( ! aSign 5513 || ( ( aExp == 0x7FFF ) 5514 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5515 ) 5516 ) { 5517 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5518 } 5519 return (int64_t) LIT64( 0x8000000000000000 ); 5520 } 5521 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5522 } 5523 else { 5524 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5525 } 5526 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5527 5528 } 5529 5530 /*---------------------------------------------------------------------------- 5531 | Returns the result of converting the quadruple-precision floating-point 5532 | value `a' to the 64-bit two's complement integer format. The conversion 5533 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5534 | Arithmetic, except that the conversion is always rounded toward zero. 5535 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5536 | the conversion overflows, the largest integer with the same sign as `a' is 5537 | returned. 
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Nonzero exponent field: set bit 48 of the high word. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive shiftCount: the significand must be shifted left. */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /* The one in-range case at this exponent: high word exactly
               0xC03E000000000000 with a small enough low word.  It yields
               the most negative integer, inexact if any low bits are set. */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if (aSig1) {
                    status->float_exception_flags |= float_flag_inexact;
                }
            }
            else {
                /* Everything else overflows: raise invalid; the positive
                   limit is returned for non-negative inputs and for
                   exponent 0x7FFF with a nonzero fraction. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        /* Combine the two words into the 64-bit integer part. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Low-word bits left over after the shift are discarded fraction
           bits. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Truncates to zero; inexact unless the input was exactly
               zero. */
            if ( aExp | aSig0 | aSig1 ) {
                status->float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Inexact when the low word is nonzero or high-word bits were
           shifted out. */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is 5596 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5597 | Arithmetic---which means in particular that the conversion is rounded 5598 | according to the current rounding mode. If `a' is a NaN, the largest 5599 | positive integer is returned. If the conversion overflows, the 5600 | largest unsigned integer is returned. If 'a' is negative, the value is 5601 | rounded and zero is returned; negative values that do not round to zero 5602 | will raise the inexact exception. 5603 *----------------------------------------------------------------------------*/ 5604 5605 uint64_t float128_to_uint64(float128 a, float_status *status) 5606 { 5607 flag aSign; 5608 int aExp; 5609 int shiftCount; 5610 uint64_t aSig0, aSig1; 5611 5612 aSig0 = extractFloat128Frac0(a); 5613 aSig1 = extractFloat128Frac1(a); 5614 aExp = extractFloat128Exp(a); 5615 aSign = extractFloat128Sign(a); 5616 if (aSign && (aExp > 0x3FFE)) { 5617 float_raise(float_flag_invalid, status); 5618 if (float128_is_any_nan(a)) { 5619 return LIT64(0xFFFFFFFFFFFFFFFF); 5620 } else { 5621 return 0; 5622 } 5623 } 5624 if (aExp) { 5625 aSig0 |= LIT64(0x0001000000000000); 5626 } 5627 shiftCount = 0x402F - aExp; 5628 if (shiftCount <= 0) { 5629 if (0x403E < aExp) { 5630 float_raise(float_flag_invalid, status); 5631 return LIT64(0xFFFFFFFFFFFFFFFF); 5632 } 5633 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5634 } else { 5635 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5636 } 5637 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5638 } 5639 5640 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5641 { 5642 uint64_t v; 5643 signed char current_rounding_mode = status->float_rounding_mode; 5644 5645 set_float_rounding_mode(float_round_to_zero, status); 5646 v = float128_to_uint64(a, status); 5647 set_float_rounding_mode(current_rounding_mode, status); 5648 5649 return v; 5650 } 5651 5652 
/*---------------------------------------------------------------------------- 5653 | Returns the result of converting the quadruple-precision floating-point 5654 | value `a' to the 32-bit unsigned integer format. The conversion 5655 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5656 | Arithmetic except that the conversion is always rounded toward zero. 5657 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5658 | if the conversion overflows, the largest unsigned integer is returned. 5659 | If 'a' is negative, the value is rounded and zero is returned; negative 5660 | values that do not round to zero will raise the inexact exception. 5661 *----------------------------------------------------------------------------*/ 5662 5663 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5664 { 5665 uint64_t v; 5666 uint32_t res; 5667 int old_exc_flags = get_float_exception_flags(status); 5668 5669 v = float128_to_uint64_round_to_zero(a, status); 5670 if (v > 0xffffffff) { 5671 res = 0xffffffff; 5672 } else { 5673 return v; 5674 } 5675 set_float_exception_flags(old_exc_flags, status); 5676 float_raise(float_flag_invalid, status); 5677 return res; 5678 } 5679 5680 /*---------------------------------------------------------------------------- 5681 | Returns the result of converting the quadruple-precision floating-point 5682 | value `a' to the single-precision floating-point format. The conversion 5683 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5684 | Arithmetic. 
5685 *----------------------------------------------------------------------------*/ 5686 5687 float32 float128_to_float32(float128 a, float_status *status) 5688 { 5689 flag aSign; 5690 int32_t aExp; 5691 uint64_t aSig0, aSig1; 5692 uint32_t zSig; 5693 5694 aSig1 = extractFloat128Frac1( a ); 5695 aSig0 = extractFloat128Frac0( a ); 5696 aExp = extractFloat128Exp( a ); 5697 aSign = extractFloat128Sign( a ); 5698 if ( aExp == 0x7FFF ) { 5699 if ( aSig0 | aSig1 ) { 5700 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5701 } 5702 return packFloat32( aSign, 0xFF, 0 ); 5703 } 5704 aSig0 |= ( aSig1 != 0 ); 5705 shift64RightJamming( aSig0, 18, &aSig0 ); 5706 zSig = aSig0; 5707 if ( aExp || zSig ) { 5708 zSig |= 0x40000000; 5709 aExp -= 0x3F81; 5710 } 5711 return roundAndPackFloat32(aSign, aExp, zSig, status); 5712 5713 } 5714 5715 /*---------------------------------------------------------------------------- 5716 | Returns the result of converting the quadruple-precision floating-point 5717 | value `a' to the double-precision floating-point format. The conversion 5718 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5719 | Arithmetic. 
5720 *----------------------------------------------------------------------------*/ 5721 5722 float64 float128_to_float64(float128 a, float_status *status) 5723 { 5724 flag aSign; 5725 int32_t aExp; 5726 uint64_t aSig0, aSig1; 5727 5728 aSig1 = extractFloat128Frac1( a ); 5729 aSig0 = extractFloat128Frac0( a ); 5730 aExp = extractFloat128Exp( a ); 5731 aSign = extractFloat128Sign( a ); 5732 if ( aExp == 0x7FFF ) { 5733 if ( aSig0 | aSig1 ) { 5734 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5735 } 5736 return packFloat64( aSign, 0x7FF, 0 ); 5737 } 5738 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5739 aSig0 |= ( aSig1 != 0 ); 5740 if ( aExp || aSig0 ) { 5741 aSig0 |= LIT64( 0x4000000000000000 ); 5742 aExp -= 0x3C01; 5743 } 5744 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5745 5746 } 5747 5748 /*---------------------------------------------------------------------------- 5749 | Returns the result of converting the quadruple-precision floating-point 5750 | value `a' to the extended double-precision floating-point format. The 5751 | conversion is performed according to the IEC/IEEE Standard for Binary 5752 | Floating-Point Arithmetic. 
5753 *----------------------------------------------------------------------------*/ 5754 5755 floatx80 float128_to_floatx80(float128 a, float_status *status) 5756 { 5757 flag aSign; 5758 int32_t aExp; 5759 uint64_t aSig0, aSig1; 5760 5761 aSig1 = extractFloat128Frac1( a ); 5762 aSig0 = extractFloat128Frac0( a ); 5763 aExp = extractFloat128Exp( a ); 5764 aSign = extractFloat128Sign( a ); 5765 if ( aExp == 0x7FFF ) { 5766 if ( aSig0 | aSig1 ) { 5767 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5768 } 5769 return packFloatx80(aSign, floatx80_infinity_high, 5770 floatx80_infinity_low); 5771 } 5772 if ( aExp == 0 ) { 5773 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5774 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5775 } 5776 else { 5777 aSig0 |= LIT64( 0x0001000000000000 ); 5778 } 5779 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5780 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5781 5782 } 5783 5784 /*---------------------------------------------------------------------------- 5785 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5786 | returns the result as a quadruple-precision floating-point value. The 5787 | operation is performed according to the IEC/IEEE Standard for Binary 5788 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    /*
     * Three exponent ranges are handled:
     *   aExp >= 0x402F : the bits to round off all live in the low word;
     *   aExp <  0x3FFF : the result is 0 or +/-1 (exponent 0x3FFF);
     *   otherwise      : the rounding position is in the high word and the
     *                    low word only contributes to the decision.
     */
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Returned unchanged, except that NaNs go through propagation. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * The shift is split in two steps so that a total shift of 64
         * (aExp == 0x402F) yields lastBitMask == 0 instead of an
         * out-of-range shift; roundBitsMask is then all ones.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last kept bit; on an exact tie clear it. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /*
                 * The last kept bit is bit 0 of z.high; the top bit of
                 * z.low is the half-way bit.
                 */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values move away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values move away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Drop everything below the last kept bit. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* +/-0 (all bits except the sign are clear) is returned as is. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * No default case: any mode that does not return inside the
             * switch (including round-to-zero) falls through to the signed
             * zero returned below.
             */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds to 1. */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Any value with exponent 0x3FFE rounds to 1. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Rounding position is inside a.high; the result's low word is 0. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie only if the discarded high bits and a.low are 0. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A non-zero a.low is folded in as a sticky bit first. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the bit pattern means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b to it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b has a zero exponent field: no leading bit, one less shift. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /*
             * Both exponent fields are zero: the raw fraction sum is packed
             * directly with exponent 0, unless flush_to_zero is set, in
             * which case zero is returned (raising output_denormal when the
             * sum was non-zero).
             */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Both leading bits are accounted for at once (1 + 1 = 2). */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /*
     * Unequal exponents: the operand that was shifted already carries its
     * leading bit (or has none); OR-ing the bit into aSig0 here is harmless
     * if it was already set in the expDiff < 0 path.
     */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry out of the leading bit: round with the decremented exponent. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /*
     * Both fractions are moved up by 14 bits; the final exponent is
     * compensated with "zExp - 14" at normalizeRoundAndPack.
     */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Both fractions zero with maximum exponent: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes, high word first, to pick the subtraction order. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero, negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* b's magnitude dominates, so the sign of the result flips. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6105 *----------------------------------------------------------------------------*/ 6106 6107 float128 float128_add(float128 a, float128 b, float_status *status) 6108 { 6109 flag aSign, bSign; 6110 6111 aSign = extractFloat128Sign( a ); 6112 bSign = extractFloat128Sign( b ); 6113 if ( aSign == bSign ) { 6114 return addFloat128Sigs(a, b, aSign, status); 6115 } 6116 else { 6117 return subFloat128Sigs(a, b, aSign, status); 6118 } 6119 6120 } 6121 6122 /*---------------------------------------------------------------------------- 6123 | Returns the result of subtracting the quadruple-precision floating-point 6124 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6125 | Standard for Binary Floating-Point Arithmetic. 6126 *----------------------------------------------------------------------------*/ 6127 6128 float128 float128_sub(float128 a, float128 b, float_status *status) 6129 { 6130 flag aSign, bSign; 6131 6132 aSign = extractFloat128Sign( a ); 6133 bSign = extractFloat128Sign( b ); 6134 if ( aSign == bSign ) { 6135 return subFloat128Sigs(a, b, aSign, status); 6136 } 6137 else { 6138 return addFloat128Sigs(a, b, aSign, status); 6139 } 6140 6141 } 6142 6143 /*---------------------------------------------------------------------------- 6144 | Returns the result of multiplying the quadruple-precision floating-point 6145 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6146 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is a NaN, or a is infinite and b is a NaN. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a is infinite here; multiplying by a zero b is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b is infinite here; multiplying by a zero a is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero operand gives a signed zero; subnormals are normalized. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= LIT64( 0x0001000000000000 );
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /*
     * NOTE(review): b's leading bit was not OR-ed in before the multiply;
     * this add presumably supplies the missing a * 1 term -- confirm.
     */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* Product overflowed the leading-bit position: renormalize. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'. The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Infinity divided by infinity. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite divided by infinity gives a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero: 0/0 is invalid, otherwise divide-by-zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* Halve the dividend when it is not smaller than the divisor. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First quotient word: estimate, then step down while rem < 0. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second quotient word, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /*
     * The estimate is corrected, and a sticky bit derived from the exact
     * remainder, only when its low 14 bits are at most 4.
     * NOTE(review): presumably the estimate error cannot affect rounding
     * otherwise -- confirm against estimateDiv128To64's error bound.
     */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Remainder of an infinite a is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite a with infinite b: a is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Remainder with respect to zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* a's exponent is at least two below b's: a is returned unchanged. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Reduce the exponent gap 61 bits at a time. Each quotient estimate is
     * lowered by 4 (clamped at 0).
     * NOTE(review): presumably so the partial remainder never goes
     * negative -- confirm against estimateDiv128To64's error bound.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step for the remaining (fewer than 64) bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting b until the remainder turns negative, remembering
     * the last non-negative value in alternateASig*.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * The sum of the two candidates tells which has the smaller magnitude;
     * on an exact tie the parity of q decides.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the result sign. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* +Inf is returned as is; -Inf is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* -0 is returned as is; any other negative input is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent for the result. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* 32-bit estimate, then refined by one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift count depends on the parity of the exponent. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step zSig0 down while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Second result word estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * The estimate is corrected, and a sticky bit derived from the exact
     * remainder, only when its low 13 bits are at most 5.
     * NOTE(review): presumably the estimate error cannot affect rounding
     * otherwise -- confirm against estimateDiv128To64's error bound.
     */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* Any NaN operand: raise invalid and report "not equal". */
    if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || ( ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /*
     * Equal bit patterns, or both operands zero regardless of sign (the
     * shift by one discards the sign bit of the high words).
     */
    return
           ( a.low == b.low )
        && ( ( a.high == b.high )
             || ( ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise. The invalid
| exception is raised if either operand is a NaN. The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/ 6501 6502 int float128_le(float128 a, float128 b, float_status *status) 6503 { 6504 flag aSign, bSign; 6505 6506 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6507 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6508 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6509 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6510 ) { 6511 float_raise(float_flag_invalid, status); 6512 return 0; 6513 } 6514 aSign = extractFloat128Sign( a ); 6515 bSign = extractFloat128Sign( b ); 6516 if ( aSign != bSign ) { 6517 return 6518 aSign 6519 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6520 == 0 ); 6521 } 6522 return 6523 aSign ? le128( b.high, b.low, a.high, a.low ) 6524 : le128( a.high, a.low, b.high, b.low ); 6525 6526 } 6527 6528 /*---------------------------------------------------------------------------- 6529 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6530 | the corresponding value `b', and 0 otherwise. The invalid exception is 6531 | raised if either operand is a NaN. The comparison is performed according 6532 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6533 *----------------------------------------------------------------------------*/ 6534 6535 int float128_lt(float128 a, float128 b, float_status *status) 6536 { 6537 flag aSign, bSign; 6538 6539 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6540 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6541 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6542 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6543 ) { 6544 float_raise(float_flag_invalid, status); 6545 return 0; 6546 } 6547 aSign = extractFloat128Sign( a ); 6548 bSign = extractFloat128Sign( b ); 6549 if ( aSign != bSign ) { 6550 return 6551 aSign 6552 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6553 != 0 ); 6554 } 6555 return 6556 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6557 : lt128( a.high, a.low, b.high, b.low ); 6558 6559 } 6560 6561 /*---------------------------------------------------------------------------- 6562 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6563 | be compared, and 0 otherwise. The invalid exception is raised if either 6564 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6565 | Standard for Binary Floating-Point Arithmetic. 6566 *----------------------------------------------------------------------------*/ 6567 6568 int float128_unordered(float128 a, float128 b, float_status *status) 6569 { 6570 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6571 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6572 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6573 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6574 ) { 6575 float_raise(float_flag_invalid, status); 6576 return 1; 6577 } 6578 return 0; 6579 } 6580 6581 /*---------------------------------------------------------------------------- 6582 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6583 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6584 | exception. The comparison is performed according to the IEC/IEEE Standard 6585 | for Binary Floating-Point Arithmetic. 
6586 *----------------------------------------------------------------------------*/ 6587 6588 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6589 { 6590 6591 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6592 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6593 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6594 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6595 ) { 6596 if (float128_is_signaling_nan(a, status) 6597 || float128_is_signaling_nan(b, status)) { 6598 float_raise(float_flag_invalid, status); 6599 } 6600 return 0; 6601 } 6602 return 6603 ( a.low == b.low ) 6604 && ( ( a.high == b.high ) 6605 || ( ( a.low == 0 ) 6606 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6607 ); 6608 6609 } 6610 6611 /*---------------------------------------------------------------------------- 6612 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6613 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6614 | cause an exception. Otherwise, the comparison is performed according to the 6615 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6616 *----------------------------------------------------------------------------*/ 6617 6618 int float128_le_quiet(float128 a, float128 b, float_status *status) 6619 { 6620 flag aSign, bSign; 6621 6622 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6623 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6624 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6625 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6626 ) { 6627 if (float128_is_signaling_nan(a, status) 6628 || float128_is_signaling_nan(b, status)) { 6629 float_raise(float_flag_invalid, status); 6630 } 6631 return 0; 6632 } 6633 aSign = extractFloat128Sign( a ); 6634 bSign = extractFloat128Sign( b ); 6635 if ( aSign != bSign ) { 6636 return 6637 aSign 6638 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6639 == 0 ); 6640 } 6641 return 6642 aSign ? le128( b.high, b.low, a.high, a.low ) 6643 : le128( a.high, a.low, b.high, b.low ); 6644 6645 } 6646 6647 /*---------------------------------------------------------------------------- 6648 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6649 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6650 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6651 | Standard for Binary Floating-Point Arithmetic. 
6652 *----------------------------------------------------------------------------*/ 6653 6654 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6655 { 6656 flag aSign, bSign; 6657 6658 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6659 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6660 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6661 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6662 ) { 6663 if (float128_is_signaling_nan(a, status) 6664 || float128_is_signaling_nan(b, status)) { 6665 float_raise(float_flag_invalid, status); 6666 } 6667 return 0; 6668 } 6669 aSign = extractFloat128Sign( a ); 6670 bSign = extractFloat128Sign( b ); 6671 if ( aSign != bSign ) { 6672 return 6673 aSign 6674 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6675 != 0 ); 6676 } 6677 return 6678 aSign ? lt128( b.high, b.low, a.high, a.low ) 6679 : lt128( a.high, a.low, b.high, b.low ); 6680 6681 } 6682 6683 /*---------------------------------------------------------------------------- 6684 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6685 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6686 | comparison is performed according to the IEC/IEEE Standard for Binary 6687 | Floating-Point Arithmetic. 
6688 *----------------------------------------------------------------------------*/ 6689 6690 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6691 { 6692 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6693 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6694 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6695 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6696 ) { 6697 if (float128_is_signaling_nan(a, status) 6698 || float128_is_signaling_nan(b, status)) { 6699 float_raise(float_flag_invalid, status); 6700 } 6701 return 1; 6702 } 6703 return 0; 6704 } 6705 6706 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6707 int is_quiet, float_status *status) 6708 { 6709 flag aSign, bSign; 6710 6711 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6712 float_raise(float_flag_invalid, status); 6713 return float_relation_unordered; 6714 } 6715 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6716 ( extractFloatx80Frac( a )<<1 ) ) || 6717 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6718 ( extractFloatx80Frac( b )<<1 ) )) { 6719 if (!is_quiet || 6720 floatx80_is_signaling_nan(a, status) || 6721 floatx80_is_signaling_nan(b, status)) { 6722 float_raise(float_flag_invalid, status); 6723 } 6724 return float_relation_unordered; 6725 } 6726 aSign = extractFloatx80Sign( a ); 6727 bSign = extractFloatx80Sign( b ); 6728 if ( aSign != bSign ) { 6729 6730 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6731 ( ( a.low | b.low ) == 0 ) ) { 6732 /* zero case */ 6733 return float_relation_equal; 6734 } else { 6735 return 1 - (2 * aSign); 6736 } 6737 } else { 6738 if (a.low == b.low && a.high == b.high) { 6739 return float_relation_equal; 6740 } else { 6741 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6742 } 6743 } 6744 } 6745 6746 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6747 { 6748 return floatx80_compare_internal(a, b, 0, status); 6749 } 6750 
6751 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6752 { 6753 return floatx80_compare_internal(a, b, 1, status); 6754 } 6755 6756 static inline int float128_compare_internal(float128 a, float128 b, 6757 int is_quiet, float_status *status) 6758 { 6759 flag aSign, bSign; 6760 6761 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6762 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6763 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6764 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6765 if (!is_quiet || 6766 float128_is_signaling_nan(a, status) || 6767 float128_is_signaling_nan(b, status)) { 6768 float_raise(float_flag_invalid, status); 6769 } 6770 return float_relation_unordered; 6771 } 6772 aSign = extractFloat128Sign( a ); 6773 bSign = extractFloat128Sign( b ); 6774 if ( aSign != bSign ) { 6775 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6776 /* zero case */ 6777 return float_relation_equal; 6778 } else { 6779 return 1 - (2 * aSign); 6780 } 6781 } else { 6782 if (a.low == b.low && a.high == b.high) { 6783 return float_relation_equal; 6784 } else { 6785 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6786 } 6787 } 6788 } 6789 6790 int float128_compare(float128 a, float128 b, float_status *status) 6791 { 6792 return float128_compare_internal(a, b, 0, status); 6793 } 6794 6795 int float128_compare_quiet(float128 a, float128 b, float_status *status) 6796 { 6797 return float128_compare_internal(a, b, 1, status); 6798 } 6799 6800 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 6801 { 6802 flag aSign; 6803 int32_t aExp; 6804 uint64_t aSig; 6805 6806 if (floatx80_invalid_encoding(a)) { 6807 float_raise(float_flag_invalid, status); 6808 return floatx80_default_nan(status); 6809 } 6810 aSig = extractFloatx80Frac( a ); 6811 aExp = extractFloatx80Exp( a ); 6812 aSign = extractFloatx80Sign( a ); 6813 6814 if ( aExp == 0x7FFF ) { 6815 if ( aSig<<1 ) { 6816 return 
propagateFloatx80NaN(a, a, status); 6817 } 6818 return a; 6819 } 6820 6821 if (aExp == 0) { 6822 if (aSig == 0) { 6823 return a; 6824 } 6825 aExp++; 6826 } 6827 6828 if (n > 0x10000) { 6829 n = 0x10000; 6830 } else if (n < -0x10000) { 6831 n = -0x10000; 6832 } 6833 6834 aExp += n; 6835 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6836 aSign, aExp, aSig, 0, status); 6837 } 6838 6839 float128 float128_scalbn(float128 a, int n, float_status *status) 6840 { 6841 flag aSign; 6842 int32_t aExp; 6843 uint64_t aSig0, aSig1; 6844 6845 aSig1 = extractFloat128Frac1( a ); 6846 aSig0 = extractFloat128Frac0( a ); 6847 aExp = extractFloat128Exp( a ); 6848 aSign = extractFloat128Sign( a ); 6849 if ( aExp == 0x7FFF ) { 6850 if ( aSig0 | aSig1 ) { 6851 return propagateFloat128NaN(a, a, status); 6852 } 6853 return a; 6854 } 6855 if (aExp != 0) { 6856 aSig0 |= LIT64( 0x0001000000000000 ); 6857 } else if (aSig0 == 0 && aSig1 == 0) { 6858 return a; 6859 } else { 6860 aExp++; 6861 } 6862 6863 if (n > 0x10000) { 6864 n = 0x10000; 6865 } else if (n < -0x10000) { 6866 n = -0x10000; 6867 } 6868 6869 aExp += n - 1; 6870 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 6871 , status); 6872 6873 } 6874