1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 int which; 606 607 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 608 s->float_exception_flags |= float_flag_invalid; 609 } 610 611 which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 612 is_qnan(b.cls), is_snan(b.cls), 613 is_qnan(c.cls), is_snan(c.cls), 614 inf_zero, s); 615 616 if (s->default_nan_mode) { 617 /* Note that this check is after pickNaNMulAdd so that function 618 * has an opportunity to set the Invalid flag. 619 */ 620 a.cls = float_class_dnan; 621 return a; 622 } 623 624 switch (which) { 625 case 0: 626 break; 627 case 1: 628 a = b; 629 break; 630 case 2: 631 a = c; 632 break; 633 case 3: 634 a.cls = float_class_dnan; 635 return a; 636 default: 637 g_assert_not_reached(); 638 } 639 a.cls = float_class_msnan; 640 641 return a; 642 } 643 644 /* 645 * Returns the result of adding or subtracting the values of the 646 * floating-point values `a' and `b'. The operation is performed 647 * according to the IEC/IEEE Standard for Binary Floating-Point 648 * Arithmetic. 649 */ 650 651 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 652 float_status *s) 653 { 654 bool a_sign = a.sign; 655 bool b_sign = b.sign ^ subtract; 656 657 if (a_sign != b_sign) { 658 /* Subtraction */ 659 660 if (a.cls == float_class_normal && b.cls == float_class_normal) { 661 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 662 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 663 a.frac = a.frac - b.frac; 664 } else { 665 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 666 a.frac = b.frac - a.frac; 667 a.exp = b.exp; 668 a_sign ^= 1; 669 } 670 671 if (a.frac == 0) { 672 a.cls = float_class_zero; 673 a.sign = s->float_rounding_mode == float_round_down; 674 } else { 675 int shift = clz64(a.frac) - 1; 676 a.frac = a.frac << shift; 677 a.exp = a.exp - shift; 678 a.sign = a_sign; 679 } 680 return a; 681 } 682 if (is_nan(a.cls) || is_nan(b.cls)) { 683 return pick_nan(a, b, s); 684 } 685 if (a.cls == float_class_inf) { 686 if (b.cls == float_class_inf) { 687 float_raise(float_flag_invalid, s); 688 a.cls = float_class_dnan; 689 } 690 return a; 691 } 692 if (a.cls == float_class_zero && b.cls == float_class_zero) { 693 a.sign = s->float_rounding_mode == float_round_down; 694 return a; 695 } 696 if (a.cls == float_class_zero || b.cls == float_class_inf) { 697 b.sign = a_sign ^ 1; 698 return b; 699 } 700 if (b.cls == float_class_zero) { 701 return a; 702 } 703 } else { 704 /* Addition */ 705 if (a.cls == float_class_normal && b.cls == float_class_normal) { 706 if (a.exp > b.exp) { 707 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 708 } else if (a.exp < b.exp) { 709 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 710 a.exp = b.exp; 711 } 712 a.frac += b.frac; 713 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 714 a.frac >>= 1; 715 a.exp += 1; 716 } 717 return a; 718 } 719 if (is_nan(a.cls) || is_nan(b.cls)) { 720 return pick_nan(a, b, s); 721 } 722 if (a.cls == float_class_inf || b.cls == float_class_zero) { 723 return a; 724 } 725 if (b.cls == float_class_inf || a.cls == float_class_zero) { 726 b.sign = b_sign; 727 return b; 728 } 729 } 730 g_assert_not_reached(); 731 } 732 733 /* 734 * Returns the result of adding or subtracting the floating-point 735 * values `a' and `b'. The operation is performed according to the 736 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 737 */ 738 739 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 740 float_status *status) 741 { 742 FloatParts pa = float16_unpack_canonical(a, status); 743 FloatParts pb = float16_unpack_canonical(b, status); 744 FloatParts pr = addsub_floats(pa, pb, false, status); 745 746 return float16_round_pack_canonical(pr, status); 747 } 748 749 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 750 float_status *status) 751 { 752 FloatParts pa = float32_unpack_canonical(a, status); 753 FloatParts pb = float32_unpack_canonical(b, status); 754 FloatParts pr = addsub_floats(pa, pb, false, status); 755 756 return float32_round_pack_canonical(pr, status); 757 } 758 759 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 760 float_status *status) 761 { 762 FloatParts pa = float64_unpack_canonical(a, status); 763 FloatParts pb = float64_unpack_canonical(b, status); 764 FloatParts pr = addsub_floats(pa, pb, false, status); 765 766 return float64_round_pack_canonical(pr, status); 767 } 768 769 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 770 float_status *status) 771 { 772 FloatParts pa = float16_unpack_canonical(a, status); 773 FloatParts pb = float16_unpack_canonical(b, status); 774 FloatParts pr = addsub_floats(pa, pb, true, status); 775 776 return float16_round_pack_canonical(pr, status); 777 } 778 779 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 780 float_status *status) 781 { 782 FloatParts pa = float32_unpack_canonical(a, status); 783 FloatParts pb = float32_unpack_canonical(b, status); 784 FloatParts pr = addsub_floats(pa, pb, true, status); 785 786 return float32_round_pack_canonical(pr, status); 787 } 788 789 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 790 float_status *status) 791 { 792 FloatParts pa = float64_unpack_canonical(a, status); 793 FloatParts pb = float64_unpack_canonical(b, status); 794 FloatParts pr = addsub_floats(pa, pb, true, status); 795 796 return float64_round_pack_canonical(pr, status); 797 } 798 799 /* 800 * Returns the result of multiplying the floating-point values `a' and 801 * `b'. The operation is performed according to the IEC/IEEE Standard 802 * for Binary Floating-Point Arithmetic. 803 */ 804 805 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 806 { 807 bool sign = a.sign ^ b.sign; 808 809 if (a.cls == float_class_normal && b.cls == float_class_normal) { 810 uint64_t hi, lo; 811 int exp = a.exp + b.exp; 812 813 mul64To128(a.frac, b.frac, &hi, &lo); 814 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 815 if (lo & DECOMPOSED_OVERFLOW_BIT) { 816 shift64RightJamming(lo, 1, &lo); 817 exp += 1; 818 } 819 820 /* Re-use a */ 821 a.exp = exp; 822 a.sign = sign; 823 a.frac = lo; 824 return a; 825 } 826 /* handle all the NaN cases */ 827 if (is_nan(a.cls) || is_nan(b.cls)) { 828 return pick_nan(a, b, s); 829 } 830 /* Inf * Zero == NaN */ 831 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 832 (a.cls == float_class_zero && b.cls == float_class_inf)) { 833 s->float_exception_flags |= float_flag_invalid; 834 a.cls = float_class_dnan; 835 a.sign = sign; 836 return a; 837 } 838 /* Multiply by 0 or Inf */ 839 if (a.cls == float_class_inf || a.cls == float_class_zero) { 840 a.sign = sign; 841 return a; 842 } 843 if (b.cls == float_class_inf || b.cls == float_class_zero) { 844 b.sign = sign; 845 return b; 846 } 847 g_assert_not_reached(); 848 } 849 850 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 851 float_status *status) 852 { 853 FloatParts pa = float16_unpack_canonical(a, status); 854 FloatParts pb = float16_unpack_canonical(b, status); 855 FloatParts pr = mul_floats(pa, pb, status); 856 857 return float16_round_pack_canonical(pr, status); 858 } 859 860 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 861 float_status *status) 862 { 863 FloatParts pa = float32_unpack_canonical(a, status); 864 FloatParts pb = float32_unpack_canonical(b, status); 865 FloatParts pr = mul_floats(pa, pb, status); 866 867 return float32_round_pack_canonical(pr, status); 868 } 869 870 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 871 float_status *status) 872 { 873 FloatParts pa = float64_unpack_canonical(a, status); 874 FloatParts pb = float64_unpack_canonical(b, status); 875 FloatParts pr = mul_floats(pa, pb, status); 876 877 return float64_round_pack_canonical(pr, status); 878 } 879 880 /* 881 * Returns the result of multiplying the floating-point values `a' and 882 * `b' then adding 'c', with no intermediate rounding step after the 883 * multiplication. The operation is performed according to the 884 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 885 * The flags argument allows the caller to select negation of the 886 * addend, the intermediate product, or the final result. (The 887 * difference between this and having the caller do a separate 888 * negation is that negating externally will flip the sign bit on 889 * NaNs.) 890 */ 891 892 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 893 int flags, float_status *s) 894 { 895 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 896 ((1 << float_class_inf) | (1 << float_class_zero)); 897 bool p_sign; 898 bool sign_flip = flags & float_muladd_negate_result; 899 FloatClass p_class; 900 uint64_t hi, lo; 901 int p_exp; 902 903 /* It is implementation-defined whether the cases of (0,inf,qnan) 904 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 905 * they return if they do), so we have to hand this information 906 * off to the target-specific pick-a-NaN routine. 907 */ 908 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 909 return pick_nan_muladd(a, b, c, inf_zero, s); 910 } 911 912 if (inf_zero) { 913 s->float_exception_flags |= float_flag_invalid; 914 a.cls = float_class_dnan; 915 return a; 916 } 917 918 if (flags & float_muladd_negate_c) { 919 c.sign ^= 1; 920 } 921 922 p_sign = a.sign ^ b.sign; 923 924 if (flags & float_muladd_negate_product) { 925 p_sign ^= 1; 926 } 927 928 if (a.cls == float_class_inf || b.cls == float_class_inf) { 929 p_class = float_class_inf; 930 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 931 p_class = float_class_zero; 932 } else { 933 p_class = float_class_normal; 934 } 935 936 if (c.cls == float_class_inf) { 937 if (p_class == float_class_inf && p_sign != c.sign) { 938 s->float_exception_flags |= float_flag_invalid; 939 a.cls = float_class_dnan; 940 } else { 941 a.cls = float_class_inf; 942 a.sign = c.sign ^ sign_flip; 943 } 944 return a; 945 } 946 947 if (p_class == float_class_inf) { 948 a.cls = float_class_inf; 949 a.sign = p_sign ^ sign_flip; 950 return a; 951 } 952 953 if (p_class == float_class_zero) { 954 if (c.cls == float_class_zero) { 955 if (p_sign != c.sign) { 956 p_sign = s->float_rounding_mode == float_round_down; 957 } 958 c.sign = p_sign; 959 } else if (flags & float_muladd_halve_result) { 960 c.exp -= 1; 961 } 962 c.sign ^= sign_flip; 963 return c; 964 } 965 966 /* a & b should be normals now... */ 967 assert(a.cls == float_class_normal && 968 b.cls == float_class_normal); 969 970 p_exp = a.exp + b.exp; 971 972 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 973 * result. 974 */ 975 mul64To128(a.frac, b.frac, &hi, &lo); 976 /* binary point now at bit 124 */ 977 978 /* check for overflow */ 979 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 980 shift128RightJamming(hi, lo, 1, &hi, &lo); 981 p_exp += 1; 982 } 983 984 /* + add/sub */ 985 if (c.cls == float_class_zero) { 986 /* move binary point back to 62 */ 987 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 988 } else { 989 int exp_diff = p_exp - c.exp; 990 if (p_sign == c.sign) { 991 /* Addition */ 992 if (exp_diff <= 0) { 993 shift128RightJamming(hi, lo, 994 DECOMPOSED_BINARY_POINT - exp_diff, 995 &hi, &lo); 996 lo += c.frac; 997 p_exp = c.exp; 998 } else { 999 uint64_t c_hi, c_lo; 1000 /* shift c to the same binary point as the product (124) */ 1001 c_hi = c.frac >> 2; 1002 c_lo = 0; 1003 shift128RightJamming(c_hi, c_lo, 1004 exp_diff, 1005 &c_hi, &c_lo); 1006 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1007 /* move binary point back to 62 */ 1008 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1009 } 1010 1011 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1012 shift64RightJamming(lo, 1, &lo); 1013 p_exp += 1; 1014 } 1015 1016 } else { 1017 /* Subtraction */ 1018 uint64_t c_hi, c_lo; 1019 /* make C binary point match product at bit 124 */ 1020 c_hi = c.frac >> 2; 1021 c_lo = 0; 1022 1023 if (exp_diff <= 0) { 1024 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1025 if (exp_diff == 0 1026 && 1027 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1028 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1029 } else { 1030 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1031 p_sign ^= 1; 1032 p_exp = c.exp; 1033 } 1034 } else { 1035 shift128RightJamming(c_hi, c_lo, 1036 exp_diff, 1037 &c_hi, &c_lo); 1038 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1039 } 1040 1041 if (hi == 0 && lo == 0) { 1042 a.cls = float_class_zero; 1043 a.sign = s->float_rounding_mode == float_round_down; 1044 a.sign ^= sign_flip; 1045 return a; 1046 } else { 1047 int shift; 1048 if (hi != 0) { 1049 shift = clz64(hi); 1050 } else { 1051 shift = clz64(lo) + 64; 1052 } 1053 /* Normalizing to a binary point of 124 is the 1054 correct adjust for the exponent. However since we're 1055 shifting, we might as well put the binary point back 1056 at 62 where we really want it. Therefore shift as 1057 if we're leaving 1 bit at the top of the word, but 1058 adjust the exponent as if we're leaving 3 bits. */ 1059 shift -= 1; 1060 if (shift >= 64) { 1061 lo = lo << (shift - 64); 1062 } else { 1063 hi = (hi << shift) | (lo >> (64 - shift)); 1064 lo = hi | ((lo << shift) != 0); 1065 } 1066 p_exp -= shift - 2; 1067 } 1068 } 1069 } 1070 1071 if (flags & float_muladd_halve_result) { 1072 p_exp -= 1; 1073 } 1074 1075 /* finally prepare our result */ 1076 a.cls = float_class_normal; 1077 a.sign = p_sign ^ sign_flip; 1078 a.exp = p_exp; 1079 a.frac = lo; 1080 1081 return a; 1082 } 1083 1084 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1085 int flags, float_status *status) 1086 { 1087 FloatParts pa = float16_unpack_canonical(a, status); 1088 FloatParts pb = float16_unpack_canonical(b, status); 1089 FloatParts pc = float16_unpack_canonical(c, status); 1090 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1091 1092 return float16_round_pack_canonical(pr, status); 1093 } 1094 1095 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1096 int flags, float_status *status) 1097 { 1098 FloatParts pa = float32_unpack_canonical(a, status); 1099 FloatParts pb = float32_unpack_canonical(b, status); 1100 FloatParts pc = float32_unpack_canonical(c, status); 1101 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1102 1103 return float32_round_pack_canonical(pr, status); 1104 } 1105 1106 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1107 int flags, float_status *status) 1108 { 1109 FloatParts pa = float64_unpack_canonical(a, status); 1110 FloatParts pb = float64_unpack_canonical(b, status); 1111 FloatParts pc = float64_unpack_canonical(c, status); 1112 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1113 1114 return float64_round_pack_canonical(pr, status); 1115 } 1116 1117 /* 1118 * Returns the result of dividing the floating-point value `a' by the 1119 * corresponding value `b'. The operation is performed according to 1120 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1121 */ 1122 1123 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1124 { 1125 bool sign = a.sign ^ b.sign; 1126 1127 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1128 uint64_t temp_lo, temp_hi; 1129 int exp = a.exp - b.exp; 1130 if (a.frac < b.frac) { 1131 exp -= 1; 1132 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1133 &temp_hi, &temp_lo); 1134 } else { 1135 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1136 &temp_hi, &temp_lo); 1137 } 1138 /* LSB of quot is set if inexact which roundandpack will use 1139 * to set flags. Yet again we re-use a for the result */ 1140 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1141 a.sign = sign; 1142 a.exp = exp; 1143 return a; 1144 } 1145 /* handle all the NaN cases */ 1146 if (is_nan(a.cls) || is_nan(b.cls)) { 1147 return pick_nan(a, b, s); 1148 } 1149 /* 0/0 or Inf/Inf */ 1150 if (a.cls == b.cls 1151 && 1152 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1153 s->float_exception_flags |= float_flag_invalid; 1154 a.cls = float_class_dnan; 1155 return a; 1156 } 1157 /* Inf / x or 0 / x */ 1158 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1159 a.sign = sign; 1160 return a; 1161 } 1162 /* Div 0 => Inf */ 1163 if (b.cls == float_class_zero) { 1164 s->float_exception_flags |= float_flag_divbyzero; 1165 a.cls = float_class_inf; 1166 a.sign = sign; 1167 return a; 1168 } 1169 /* Div by Inf */ 1170 if (b.cls == float_class_inf) { 1171 a.cls = float_class_zero; 1172 a.sign = sign; 1173 return a; 1174 } 1175 g_assert_not_reached(); 1176 } 1177 1178 float16 float16_div(float16 a, float16 b, float_status *status) 1179 { 1180 FloatParts pa = float16_unpack_canonical(a, status); 1181 FloatParts pb = float16_unpack_canonical(b, status); 1182 FloatParts pr = div_floats(pa, pb, status); 1183 1184 return float16_round_pack_canonical(pr, status); 1185 } 1186 1187 float32 float32_div(float32 a, float32 b, float_status *status) 1188 { 1189 FloatParts pa = float32_unpack_canonical(a, status); 1190 FloatParts pb = float32_unpack_canonical(b, status); 1191 FloatParts pr = div_floats(pa, pb, status); 1192 1193 return float32_round_pack_canonical(pr, status); 1194 } 1195 1196 float64 float64_div(float64 a, float64 b, float_status *status) 1197 { 1198 FloatParts pa = float64_unpack_canonical(a, status); 1199 FloatParts pb = float64_unpack_canonical(b, status); 1200 FloatParts pr = div_floats(pa, pb, status); 1201 1202 return float64_round_pack_canonical(pr, status); 1203 } 1204 1205 /* 1206 * Rounds the floating-point value `a' to an integer, and returns the 1207 * result as a floating-point value. The operation is performed 1208 * according to the IEC/IEEE Standard for Binary Floating-Point 1209 * Arithmetic. 1210 */ 1211 1212 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1213 { 1214 if (is_nan(a.cls)) { 1215 return return_nan(a, s); 1216 } 1217 1218 switch (a.cls) { 1219 case float_class_zero: 1220 case float_class_inf: 1221 case float_class_qnan: 1222 /* already "integral" */ 1223 break; 1224 case float_class_normal: 1225 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1226 /* already integral */ 1227 break; 1228 } 1229 if (a.exp < 0) { 1230 bool one; 1231 /* all fractional */ 1232 s->float_exception_flags |= float_flag_inexact; 1233 switch (rounding_mode) { 1234 case float_round_nearest_even: 1235 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1236 break; 1237 case float_round_ties_away: 1238 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1239 break; 1240 case float_round_to_zero: 1241 one = false; 1242 break; 1243 case float_round_up: 1244 one = !a.sign; 1245 break; 1246 case float_round_down: 1247 one = a.sign; 1248 break; 1249 default: 1250 g_assert_not_reached(); 1251 } 1252 1253 if (one) { 1254 a.frac = DECOMPOSED_IMPLICIT_BIT; 1255 a.exp = 0; 1256 } else { 1257 a.cls = float_class_zero; 1258 } 1259 } else { 1260 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1261 uint64_t frac_lsbm1 = frac_lsb >> 1; 1262 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1263 uint64_t rnd_mask = rnd_even_mask >> 1; 1264 uint64_t inc; 1265 1266 switch (rounding_mode) { 1267 case float_round_nearest_even: 1268 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1269 break; 1270 case float_round_ties_away: 1271 inc = frac_lsbm1; 1272 break; 1273 case float_round_to_zero: 1274 inc = 0; 1275 break; 1276 case float_round_up: 1277 inc = a.sign ? 0 : rnd_mask; 1278 break; 1279 case float_round_down: 1280 inc = a.sign ? rnd_mask : 0; 1281 break; 1282 default: 1283 g_assert_not_reached(); 1284 } 1285 1286 if (a.frac & rnd_mask) { 1287 s->float_exception_flags |= float_flag_inexact; 1288 a.frac += inc; 1289 a.frac &= ~rnd_mask; 1290 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1291 a.frac >>= 1; 1292 a.exp++; 1293 } 1294 } 1295 } 1296 break; 1297 default: 1298 g_assert_not_reached(); 1299 } 1300 return a; 1301 } 1302 1303 float16 float16_round_to_int(float16 a, float_status *s) 1304 { 1305 FloatParts pa = float16_unpack_canonical(a, s); 1306 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1307 return float16_round_pack_canonical(pr, s); 1308 } 1309 1310 float32 float32_round_to_int(float32 a, float_status *s) 1311 { 1312 FloatParts pa = float32_unpack_canonical(a, s); 1313 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1314 return float32_round_pack_canonical(pr, s); 1315 } 1316 1317 float64 float64_round_to_int(float64 a, float_status *s) 1318 { 1319 FloatParts pa = float64_unpack_canonical(a, s); 1320 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1321 return float64_round_pack_canonical(pr, s); 1322 } 1323 1324 float64 float64_trunc_to_int(float64 a, float_status *s) 1325 { 1326 FloatParts pa = float64_unpack_canonical(a, s); 1327 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1328 return float64_round_pack_canonical(pr, s); 1329 } 1330 1331 /* 1332 * Returns the result of converting the floating-point value `a' to 1333 * the two's complement integer format. The conversion is performed 1334 * according to the IEC/IEEE Standard for Binary Floating-Point 1335 * Arithmetic---which means in particular that the conversion is 1336 * rounded according to the current rounding mode. If `a' is a NaN, 1337 * the largest positive integer is returned. Otherwise, if the 1338 * conversion overflows, the largest integer with the same sign as `a' 1339 * is returned. 1340 */ 1341 1342 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1343 int64_t min, int64_t max, 1344 float_status *s) 1345 { 1346 uint64_t r; 1347 int orig_flags = get_float_exception_flags(s); 1348 FloatParts p = round_to_int(in, rmode, s); 1349 1350 switch (p.cls) { 1351 case float_class_snan: 1352 case float_class_qnan: 1353 case float_class_dnan: 1354 case float_class_msnan: 1355 s->float_exception_flags = orig_flags | float_flag_invalid; 1356 return max; 1357 case float_class_inf: 1358 s->float_exception_flags = orig_flags | float_flag_invalid; 1359 return p.sign ? min : max; 1360 case float_class_zero: 1361 return 0; 1362 case float_class_normal: 1363 if (p.exp < DECOMPOSED_BINARY_POINT) { 1364 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1365 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1366 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1367 } else { 1368 r = UINT64_MAX; 1369 } 1370 if (p.sign) { 1371 if (r < -(uint64_t) min) { 1372 return -r; 1373 } else { 1374 s->float_exception_flags = orig_flags | float_flag_invalid; 1375 return min; 1376 } 1377 } else { 1378 if (r < max) { 1379 return r; 1380 } else { 1381 s->float_exception_flags = orig_flags | float_flag_invalid; 1382 return max; 1383 } 1384 } 1385 default: 1386 g_assert_not_reached(); 1387 } 1388 } 1389 1390 #define FLOAT_TO_INT(fsz, isz) \ 1391 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1392 float_status *s) \ 1393 { \ 1394 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1395 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1396 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1397 s); \ 1398 } \ 1399 \ 1400 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1401 (float ## fsz a, float_status *s) \ 1402 { \ 1403 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1404 return round_to_int_and_pack(p, float_round_to_zero, \ 1405 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1406 s); \ 1407 } 1408 1409 FLOAT_TO_INT(16, 16) 1410 FLOAT_TO_INT(16, 32) 1411 FLOAT_TO_INT(16, 64) 1412 1413 FLOAT_TO_INT(32, 16) 1414 FLOAT_TO_INT(32, 32) 1415 FLOAT_TO_INT(32, 64) 1416 1417 FLOAT_TO_INT(64, 16) 1418 FLOAT_TO_INT(64, 32) 1419 FLOAT_TO_INT(64, 64) 1420 1421 #undef FLOAT_TO_INT 1422 1423 /* 1424 * Returns the result of converting the floating-point value `a' to 1425 * the unsigned integer format. The conversion is performed according 1426 * to the IEC/IEEE Standard for Binary Floating-Point 1427 * Arithmetic---which means in particular that the conversion is 1428 * rounded according to the current rounding mode. If `a' is a NaN, 1429 * the largest unsigned integer is returned. Otherwise, if the 1430 * conversion overflows, the largest unsigned integer is returned. If 1431 * the 'a' is negative, the result is rounded and zero is returned; 1432 * values that do not round to zero will raise the inexact exception 1433 * flag. 1434 */ 1435 1436 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1437 float_status *s) 1438 { 1439 int orig_flags = get_float_exception_flags(s); 1440 FloatParts p = round_to_int(in, rmode, s); 1441 1442 switch (p.cls) { 1443 case float_class_snan: 1444 case float_class_qnan: 1445 case float_class_dnan: 1446 case float_class_msnan: 1447 s->float_exception_flags = orig_flags | float_flag_invalid; 1448 return max; 1449 case float_class_inf: 1450 s->float_exception_flags = orig_flags | float_flag_invalid; 1451 return p.sign ? 0 : max; 1452 case float_class_zero: 1453 return 0; 1454 case float_class_normal: 1455 { 1456 uint64_t r; 1457 if (p.sign) { 1458 s->float_exception_flags = orig_flags | float_flag_invalid; 1459 return 0; 1460 } 1461 1462 if (p.exp < DECOMPOSED_BINARY_POINT) { 1463 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1464 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1465 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1466 } else { 1467 s->float_exception_flags = orig_flags | float_flag_invalid; 1468 return max; 1469 } 1470 1471 /* For uint64 this will never trip, but if p.exp is too large 1472 * to shift a decomposed fraction we shall have exited via the 1473 * 3rd leg above. 1474 */ 1475 if (r > max) { 1476 s->float_exception_flags = orig_flags | float_flag_invalid; 1477 return max; 1478 } else { 1479 return r; 1480 } 1481 } 1482 default: 1483 g_assert_not_reached(); 1484 } 1485 } 1486 1487 #define FLOAT_TO_UINT(fsz, isz) \ 1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1489 float_status *s) \ 1490 { \ 1491 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1492 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1493 UINT ## isz ## _MAX, s); \ 1494 } \ 1495 \ 1496 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1497 (float ## fsz a, float_status *s) \ 1498 { \ 1499 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1500 return round_to_uint_and_pack(p, float_round_to_zero, \ 1501 UINT ## isz ## _MAX, s); \ 1502 } 1503 1504 FLOAT_TO_UINT(16, 16) 1505 FLOAT_TO_UINT(16, 32) 1506 FLOAT_TO_UINT(16, 64) 1507 1508 FLOAT_TO_UINT(32, 16) 1509 FLOAT_TO_UINT(32, 32) 1510 FLOAT_TO_UINT(32, 64) 1511 1512 FLOAT_TO_UINT(64, 16) 1513 FLOAT_TO_UINT(64, 32) 1514 FLOAT_TO_UINT(64, 64) 1515 1516 #undef FLOAT_TO_UINT 1517 1518 /* 1519 * Integer to float conversions 1520 * 1521 * Returns the result of converting the two's complement integer `a' 1522 * to the floating-point format. The conversion is performed according 1523 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1524 */ 1525 1526 static FloatParts int_to_float(int64_t a, float_status *status) 1527 { 1528 FloatParts r; 1529 if (a == 0) { 1530 r.cls = float_class_zero; 1531 r.sign = false; 1532 } else if (a == (1ULL << 63)) { 1533 r.cls = float_class_normal; 1534 r.sign = true; 1535 r.frac = DECOMPOSED_IMPLICIT_BIT; 1536 r.exp = 63; 1537 } else { 1538 uint64_t f; 1539 if (a < 0) { 1540 f = -a; 1541 r.sign = true; 1542 } else { 1543 f = a; 1544 r.sign = false; 1545 } 1546 int shift = clz64(f) - 1; 1547 r.cls = float_class_normal; 1548 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1549 r.frac = f << shift; 1550 } 1551 1552 return r; 1553 } 1554 1555 float16 int64_to_float16(int64_t a, float_status *status) 1556 { 1557 FloatParts pa = int_to_float(a, status); 1558 return float16_round_pack_canonical(pa, status); 1559 } 1560 1561 float16 int32_to_float16(int32_t a, float_status *status) 1562 { 1563 return int64_to_float16(a, status); 1564 } 1565 1566 float16 int16_to_float16(int16_t a, float_status *status) 1567 { 1568 return int64_to_float16(a, status); 1569 } 1570 1571 float32 int64_to_float32(int64_t a, float_status *status) 1572 { 1573 FloatParts pa = int_to_float(a, status); 1574 return float32_round_pack_canonical(pa, status); 1575 } 1576 1577 float32 int32_to_float32(int32_t a, float_status *status) 1578 { 1579 return int64_to_float32(a, status); 1580 } 1581 1582 float32 int16_to_float32(int16_t a, float_status *status) 1583 { 1584 return int64_to_float32(a, status); 1585 } 1586 1587 float64 int64_to_float64(int64_t a, float_status *status) 1588 { 1589 FloatParts pa = int_to_float(a, status); 1590 return float64_round_pack_canonical(pa, status); 1591 } 1592 1593 float64 int32_to_float64(int32_t a, float_status *status) 1594 { 1595 return int64_to_float64(a, status); 1596 } 1597 1598 float64 int16_to_float64(int16_t a, float_status *status) 1599 { 1600 return int64_to_float64(a, status); 1601 } 1602 1603 1604 /* 1605 * Unsigned Integer to float conversions 1606 * 1607 * Returns the result of converting the unsigned integer `a' to the 1608 * floating-point format. The conversion is performed according to the 1609 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1610 */ 1611 1612 static FloatParts uint_to_float(uint64_t a, float_status *status) 1613 { 1614 FloatParts r = { .sign = false}; 1615 1616 if (a == 0) { 1617 r.cls = float_class_zero; 1618 } else { 1619 int spare_bits = clz64(a) - 1; 1620 r.cls = float_class_normal; 1621 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1622 if (spare_bits < 0) { 1623 shift64RightJamming(a, -spare_bits, &a); 1624 r.frac = a; 1625 } else { 1626 r.frac = a << spare_bits; 1627 } 1628 } 1629 1630 return r; 1631 } 1632 1633 float16 uint64_to_float16(uint64_t a, float_status *status) 1634 { 1635 FloatParts pa = uint_to_float(a, status); 1636 return float16_round_pack_canonical(pa, status); 1637 } 1638 1639 float16 uint32_to_float16(uint32_t a, float_status *status) 1640 { 1641 return uint64_to_float16(a, status); 1642 } 1643 1644 float16 uint16_to_float16(uint16_t a, float_status *status) 1645 { 1646 return uint64_to_float16(a, status); 1647 } 1648 1649 float32 uint64_to_float32(uint64_t a, float_status *status) 1650 { 1651 FloatParts pa = uint_to_float(a, status); 1652 return float32_round_pack_canonical(pa, status); 1653 } 1654 1655 float32 uint32_to_float32(uint32_t a, float_status *status) 1656 { 1657 return uint64_to_float32(a, status); 1658 } 1659 1660 float32 uint16_to_float32(uint16_t a, float_status *status) 1661 { 1662 return uint64_to_float32(a, status); 1663 } 1664 1665 float64 uint64_to_float64(uint64_t a, float_status *status) 1666 { 1667 FloatParts pa = uint_to_float(a, status); 1668 return float64_round_pack_canonical(pa, status); 1669 } 1670 1671 float64 uint32_to_float64(uint32_t a, float_status *status) 1672 { 1673 return uint64_to_float64(a, status); 1674 } 1675 1676 float64 uint16_to_float64(uint16_t a, float_status *status) 1677 { 1678 return uint64_to_float64(a, status); 1679 } 1680 1681 /* Float Min/Max */ 1682 /* min() and max() functions. These can't be implemented as 1683 * 'compare and pick one input' because that would mishandle 1684 * NaNs and +0 vs -0. 1685 * 1686 * minnum() and maxnum() functions. These are similar to the min() 1687 * and max() functions but if one of the arguments is a QNaN and 1688 * the other is numerical then the numerical argument is returned. 1689 * SNaNs will get quietened before being returned. 1690 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1691 * and maxNum() operations. min() and max() are the typical min/max 1692 * semantics provided by many CPUs which predate that specification. 1693 * 1694 * minnummag() and maxnummag() functions correspond to minNumMag() 1695 * and minNumMag() from the IEEE-754 2008. 1696 */ 1697 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1698 bool ieee, bool ismag, float_status *s) 1699 { 1700 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1701 if (ieee) { 1702 /* Takes two floating-point values `a' and `b', one of 1703 * which is a NaN, and returns the appropriate NaN 1704 * result. If either `a' or `b' is a signaling NaN, 1705 * the invalid exception is raised. 1706 */ 1707 if (is_snan(a.cls) || is_snan(b.cls)) { 1708 return pick_nan(a, b, s); 1709 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1710 return b; 1711 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1712 return a; 1713 } 1714 } 1715 return pick_nan(a, b, s); 1716 } else { 1717 int a_exp, b_exp; 1718 1719 switch (a.cls) { 1720 case float_class_normal: 1721 a_exp = a.exp; 1722 break; 1723 case float_class_inf: 1724 a_exp = INT_MAX; 1725 break; 1726 case float_class_zero: 1727 a_exp = INT_MIN; 1728 break; 1729 default: 1730 g_assert_not_reached(); 1731 break; 1732 } 1733 switch (b.cls) { 1734 case float_class_normal: 1735 b_exp = b.exp; 1736 break; 1737 case float_class_inf: 1738 b_exp = INT_MAX; 1739 break; 1740 case float_class_zero: 1741 b_exp = INT_MIN; 1742 break; 1743 default: 1744 g_assert_not_reached(); 1745 break; 1746 } 1747 1748 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 1749 bool a_less = a_exp < b_exp; 1750 if (a_exp == b_exp) { 1751 a_less = a.frac < b.frac; 1752 } 1753 return a_less ^ ismin ? b : a; 1754 } 1755 1756 if (a.sign == b.sign) { 1757 bool a_less = a_exp < b_exp; 1758 if (a_exp == b_exp) { 1759 a_less = a.frac < b.frac; 1760 } 1761 return a.sign ^ a_less ^ ismin ? b : a; 1762 } else { 1763 return a.sign ^ ismin ? b : a; 1764 } 1765 } 1766 } 1767 1768 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1769 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1770 float_status *s) \ 1771 { \ 1772 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1773 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1774 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1775 \ 1776 return float ## sz ## _round_pack_canonical(pr, s); \ 1777 } 1778 1779 MINMAX(16, min, true, false, false) 1780 MINMAX(16, minnum, true, true, false) 1781 MINMAX(16, minnummag, true, true, true) 1782 MINMAX(16, max, false, false, false) 1783 MINMAX(16, maxnum, false, true, false) 1784 MINMAX(16, maxnummag, false, true, true) 1785 1786 MINMAX(32, min, true, false, false) 1787 MINMAX(32, minnum, true, true, false) 1788 MINMAX(32, minnummag, true, true, true) 1789 MINMAX(32, max, false, false, false) 1790 MINMAX(32, maxnum, false, true, false) 1791 MINMAX(32, maxnummag, false, true, true) 1792 1793 MINMAX(64, min, true, false, false) 1794 MINMAX(64, minnum, true, true, false) 1795 MINMAX(64, minnummag, true, true, true) 1796 MINMAX(64, max, false, false, false) 1797 MINMAX(64, maxnum, false, true, false) 1798 MINMAX(64, maxnummag, false, true, true) 1799 1800 #undef MINMAX 1801 1802 /* Floating point compare */ 1803 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1804 float_status *s) 1805 { 1806 if (is_nan(a.cls) || is_nan(b.cls)) { 1807 if (!is_quiet || 1808 a.cls == float_class_snan || 1809 b.cls == float_class_snan) { 1810 s->float_exception_flags |= float_flag_invalid; 1811 } 1812 return float_relation_unordered; 1813 } 1814 1815 if (a.cls == float_class_zero) { 1816 if (b.cls == float_class_zero) { 1817 return float_relation_equal; 1818 } 1819 return b.sign ? float_relation_greater : float_relation_less; 1820 } else if (b.cls == float_class_zero) { 1821 return a.sign ? float_relation_less : float_relation_greater; 1822 } 1823 1824 /* The only really important thing about infinity is its sign. If 1825 * both are infinities the sign marks the smallest of the two. 1826 */ 1827 if (a.cls == float_class_inf) { 1828 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1829 return float_relation_equal; 1830 } 1831 return a.sign ? float_relation_less : float_relation_greater; 1832 } else if (b.cls == float_class_inf) { 1833 return b.sign ? float_relation_greater : float_relation_less; 1834 } 1835 1836 if (a.sign != b.sign) { 1837 return a.sign ? float_relation_less : float_relation_greater; 1838 } 1839 1840 if (a.exp == b.exp) { 1841 if (a.frac == b.frac) { 1842 return float_relation_equal; 1843 } 1844 if (a.sign) { 1845 return a.frac > b.frac ? 1846 float_relation_less : float_relation_greater; 1847 } else { 1848 return a.frac > b.frac ? 1849 float_relation_greater : float_relation_less; 1850 } 1851 } else { 1852 if (a.sign) { 1853 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1854 } else { 1855 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1856 } 1857 } 1858 } 1859 1860 #define COMPARE(sz) \ 1861 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1862 float_status *s) \ 1863 { \ 1864 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1865 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1866 return compare_floats(pa, pb, false, s); \ 1867 } \ 1868 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1869 float_status *s) \ 1870 { \ 1871 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1872 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1873 return compare_floats(pa, pb, true, s); \ 1874 } 1875 1876 COMPARE(16) 1877 COMPARE(32) 1878 COMPARE(64) 1879 1880 #undef COMPARE 1881 1882 /* Multiply A by 2 raised to the power N. */ 1883 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1884 { 1885 if (unlikely(is_nan(a.cls))) { 1886 return return_nan(a, s); 1887 } 1888 if (a.cls == float_class_normal) { 1889 /* The largest float type (even though not supported by FloatParts) 1890 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 1891 * still allows rounding to infinity, without allowing overflow 1892 * within the int32_t that backs FloatParts.exp. 1893 */ 1894 n = MIN(MAX(n, -0x10000), 0x10000); 1895 a.exp += n; 1896 } 1897 return a; 1898 } 1899 1900 float16 float16_scalbn(float16 a, int n, float_status *status) 1901 { 1902 FloatParts pa = float16_unpack_canonical(a, status); 1903 FloatParts pr = scalbn_decomposed(pa, n, status); 1904 return float16_round_pack_canonical(pr, status); 1905 } 1906 1907 float32 float32_scalbn(float32 a, int n, float_status *status) 1908 { 1909 FloatParts pa = float32_unpack_canonical(a, status); 1910 FloatParts pr = scalbn_decomposed(pa, n, status); 1911 return float32_round_pack_canonical(pr, status); 1912 } 1913 1914 float64 float64_scalbn(float64 a, int n, float_status *status) 1915 { 1916 FloatParts pa = float64_unpack_canonical(a, status); 1917 FloatParts pr = scalbn_decomposed(pa, n, status); 1918 return float64_round_pack_canonical(pr, status); 1919 } 1920 1921 /* 1922 * Square Root 1923 * 1924 * The old softfloat code did an approximation step before zeroing in 1925 * on the final result. However for simpleness we just compute the 1926 * square root by iterating down from the implicit bit to enough extra 1927 * bits to ensure we get a correctly rounded result. 1928 * 1929 * This does mean however the calculation is slower than before, 1930 * especially for 64 bit floats. 1931 */ 1932 1933 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 1934 { 1935 uint64_t a_frac, r_frac, s_frac; 1936 int bit, last_bit; 1937 1938 if (is_nan(a.cls)) { 1939 return return_nan(a, s); 1940 } 1941 if (a.cls == float_class_zero) { 1942 return a; /* sqrt(+-0) = +-0 */ 1943 } 1944 if (a.sign) { 1945 s->float_exception_flags |= float_flag_invalid; 1946 a.cls = float_class_dnan; 1947 return a; 1948 } 1949 if (a.cls == float_class_inf) { 1950 return a; /* sqrt(+inf) = +inf */ 1951 } 1952 1953 assert(a.cls == float_class_normal); 1954 1955 /* We need two overflow bits at the top. Adding room for that is a 1956 * right shift. If the exponent is odd, we can discard the low bit 1957 * by multiplying the fraction by 2; that's a left shift. Combine 1958 * those and we shift right if the exponent is even. 1959 */ 1960 a_frac = a.frac; 1961 if (!(a.exp & 1)) { 1962 a_frac >>= 1; 1963 } 1964 a.exp >>= 1; 1965 1966 /* Bit-by-bit computation of sqrt. */ 1967 r_frac = 0; 1968 s_frac = 0; 1969 1970 /* Iterate from implicit bit down to the 3 extra bits to compute a 1971 * properly rounded result. Remember we've inserted one more bit 1972 * at the top, so these positions are one less. 1973 */ 1974 bit = DECOMPOSED_BINARY_POINT - 1; 1975 last_bit = MAX(p->frac_shift - 4, 0); 1976 do { 1977 uint64_t q = 1ULL << bit; 1978 uint64_t t_frac = s_frac + q; 1979 if (t_frac <= a_frac) { 1980 s_frac = t_frac + q; 1981 a_frac -= t_frac; 1982 r_frac += q; 1983 } 1984 a_frac <<= 1; 1985 } while (--bit >= last_bit); 1986 1987 /* Undo the right shift done above. If there is any remaining 1988 * fraction, the result is inexact. Set the sticky bit. 1989 */ 1990 a.frac = (r_frac << 1) + (a_frac != 0); 1991 1992 return a; 1993 } 1994 1995 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 1996 { 1997 FloatParts pa = float16_unpack_canonical(a, status); 1998 FloatParts pr = sqrt_float(pa, status, &float16_params); 1999 return float16_round_pack_canonical(pr, status); 2000 } 2001 2002 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 2003 { 2004 FloatParts pa = float32_unpack_canonical(a, status); 2005 FloatParts pr = sqrt_float(pa, status, &float32_params); 2006 return float32_round_pack_canonical(pr, status); 2007 } 2008 2009 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 2010 { 2011 FloatParts pa = float64_unpack_canonical(a, status); 2012 FloatParts pr = sqrt_float(pa, status, &float64_params); 2013 return float64_round_pack_canonical(pr, status); 2014 } 2015 2016 2017 /*---------------------------------------------------------------------------- 2018 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2019 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2020 | input. If `zSign' is 1, the input is negated before being converted to an 2021 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2022 | is simply rounded to an integer, with the inexact exception raised if the 2023 | input cannot be represented exactly as an integer. However, if the fixed- 2024 | point input is too large, the invalid exception is raised and the largest 2025 | positive or negative integer is returned. 2026 *----------------------------------------------------------------------------*/ 2027 2028 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2029 { 2030 int8_t roundingMode; 2031 flag roundNearestEven; 2032 int8_t roundIncrement, roundBits; 2033 int32_t z; 2034 2035 roundingMode = status->float_rounding_mode; 2036 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2037 switch (roundingMode) { 2038 case float_round_nearest_even: 2039 case float_round_ties_away: 2040 roundIncrement = 0x40; 2041 break; 2042 case float_round_to_zero: 2043 roundIncrement = 0; 2044 break; 2045 case float_round_up: 2046 roundIncrement = zSign ? 0 : 0x7f; 2047 break; 2048 case float_round_down: 2049 roundIncrement = zSign ? 0x7f : 0; 2050 break; 2051 default: 2052 abort(); 2053 } 2054 roundBits = absZ & 0x7F; 2055 absZ = ( absZ + roundIncrement )>>7; 2056 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2057 z = absZ; 2058 if ( zSign ) z = - z; 2059 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2060 float_raise(float_flag_invalid, status); 2061 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2062 } 2063 if (roundBits) { 2064 status->float_exception_flags |= float_flag_inexact; 2065 } 2066 return z; 2067 2068 } 2069 2070 /*---------------------------------------------------------------------------- 2071 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2072 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2073 | and returns the properly rounded 64-bit integer corresponding to the input. 2074 | If `zSign' is 1, the input is negated before being converted to an integer. 2075 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2076 | the inexact exception raised if the input cannot be represented exactly as 2077 | an integer. However, if the fixed-point input is too large, the invalid 2078 | exception is raised and the largest positive or negative integer is 2079 | returned. 2080 *----------------------------------------------------------------------------*/ 2081 2082 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2083 float_status *status) 2084 { 2085 int8_t roundingMode; 2086 flag roundNearestEven, increment; 2087 int64_t z; 2088 2089 roundingMode = status->float_rounding_mode; 2090 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2091 switch (roundingMode) { 2092 case float_round_nearest_even: 2093 case float_round_ties_away: 2094 increment = ((int64_t) absZ1 < 0); 2095 break; 2096 case float_round_to_zero: 2097 increment = 0; 2098 break; 2099 case float_round_up: 2100 increment = !zSign && absZ1; 2101 break; 2102 case float_round_down: 2103 increment = zSign && absZ1; 2104 break; 2105 default: 2106 abort(); 2107 } 2108 if ( increment ) { 2109 ++absZ0; 2110 if ( absZ0 == 0 ) goto overflow; 2111 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2112 } 2113 z = absZ0; 2114 if ( zSign ) z = - z; 2115 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2116 overflow: 2117 float_raise(float_flag_invalid, status); 2118 return 2119 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2120 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2121 } 2122 if (absZ1) { 2123 status->float_exception_flags |= float_flag_inexact; 2124 } 2125 return z; 2126 2127 } 2128 2129 /*---------------------------------------------------------------------------- 2130 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2131 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2132 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2133 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2134 | with the inexact exception raised if the input cannot be represented exactly 2135 | as an integer. However, if the fixed-point input is too large, the invalid 2136 | exception is raised and the largest unsigned integer is returned. 2137 *----------------------------------------------------------------------------*/ 2138 2139 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2140 uint64_t absZ1, float_status *status) 2141 { 2142 int8_t roundingMode; 2143 flag roundNearestEven, increment; 2144 2145 roundingMode = status->float_rounding_mode; 2146 roundNearestEven = (roundingMode == float_round_nearest_even); 2147 switch (roundingMode) { 2148 case float_round_nearest_even: 2149 case float_round_ties_away: 2150 increment = ((int64_t)absZ1 < 0); 2151 break; 2152 case float_round_to_zero: 2153 increment = 0; 2154 break; 2155 case float_round_up: 2156 increment = !zSign && absZ1; 2157 break; 2158 case float_round_down: 2159 increment = zSign && absZ1; 2160 break; 2161 default: 2162 abort(); 2163 } 2164 if (increment) { 2165 ++absZ0; 2166 if (absZ0 == 0) { 2167 float_raise(float_flag_invalid, status); 2168 return LIT64(0xFFFFFFFFFFFFFFFF); 2169 } 2170 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2171 } 2172 2173 if (zSign && absZ0) { 2174 float_raise(float_flag_invalid, status); 2175 return 0; 2176 } 2177 2178 if (absZ1) { 2179 status->float_exception_flags |= float_flag_inexact; 2180 } 2181 return absZ0; 2182 } 2183 2184 /*---------------------------------------------------------------------------- 2185 | If `a' is denormal and we are in flush-to-zero mode then set the 2186 | input-denormal exception and return zero. Otherwise just return the value. 2187 *----------------------------------------------------------------------------*/ 2188 float32 float32_squash_input_denormal(float32 a, float_status *status) 2189 { 2190 if (status->flush_inputs_to_zero) { 2191 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2192 float_raise(float_flag_input_denormal, status); 2193 return make_float32(float32_val(a) & 0x80000000); 2194 } 2195 } 2196 return a; 2197 } 2198 2199 /*---------------------------------------------------------------------------- 2200 | Normalizes the subnormal single-precision floating-point value represented 2201 | by the denormalized significand `aSig'. The normalized exponent and 2202 | significand are stored at the locations pointed to by `zExpPtr' and 2203 | `zSigPtr', respectively. 2204 *----------------------------------------------------------------------------*/ 2205 2206 static void 2207 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2208 { 2209 int8_t shiftCount; 2210 2211 shiftCount = countLeadingZeros32( aSig ) - 8; 2212 *zSigPtr = aSig<<shiftCount; 2213 *zExpPtr = 1 - shiftCount; 2214 2215 } 2216 2217 /*---------------------------------------------------------------------------- 2218 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2219 | and significand `zSig', and returns the proper single-precision floating- 2220 | point value corresponding to the abstract input. Ordinarily, the abstract 2221 | value is simply rounded and packed into the single-precision format, with 2222 | the inexact exception raised if the abstract input cannot be represented 2223 | exactly. However, if the abstract value is too large, the overflow and 2224 | inexact exceptions are raised and an infinity or maximal finite value is 2225 | returned. If the abstract value is too small, the input value is rounded to 2226 | a subnormal number, and the underflow and inexact exceptions are raised if 2227 | the abstract input cannot be represented exactly as a subnormal single- 2228 | precision floating-point number. 2229 | The input significand `zSig' has its binary point between bits 30 2230 | and 29, which is 7 bits to the left of the usual location. This shifted 2231 | significand must be normalized or smaller. If `zSig' is not normalized, 2232 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2233 | and it must not require rounding. In the usual case that `zSig' is 2234 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2235 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2236 | Binary Floating-Point Arithmetic. 2237 *----------------------------------------------------------------------------*/ 2238 2239 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2240 float_status *status) 2241 { 2242 int8_t roundingMode; 2243 flag roundNearestEven; 2244 int8_t roundIncrement, roundBits; 2245 flag isTiny; 2246 2247 roundingMode = status->float_rounding_mode; 2248 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2249 switch (roundingMode) { 2250 case float_round_nearest_even: 2251 case float_round_ties_away: 2252 roundIncrement = 0x40; 2253 break; 2254 case float_round_to_zero: 2255 roundIncrement = 0; 2256 break; 2257 case float_round_up: 2258 roundIncrement = zSign ? 0 : 0x7f; 2259 break; 2260 case float_round_down: 2261 roundIncrement = zSign ? 0x7f : 0; 2262 break; 2263 default: 2264 abort(); 2265 break; 2266 } 2267 roundBits = zSig & 0x7F; 2268 if ( 0xFD <= (uint16_t) zExp ) { 2269 if ( ( 0xFD < zExp ) 2270 || ( ( zExp == 0xFD ) 2271 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2272 ) { 2273 float_raise(float_flag_overflow | float_flag_inexact, status); 2274 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2275 } 2276 if ( zExp < 0 ) { 2277 if (status->flush_to_zero) { 2278 float_raise(float_flag_output_denormal, status); 2279 return packFloat32(zSign, 0, 0); 2280 } 2281 isTiny = 2282 (status->float_detect_tininess 2283 == float_tininess_before_rounding) 2284 || ( zExp < -1 ) 2285 || ( zSig + roundIncrement < 0x80000000 ); 2286 shift32RightJamming( zSig, - zExp, &zSig ); 2287 zExp = 0; 2288 roundBits = zSig & 0x7F; 2289 if (isTiny && roundBits) { 2290 float_raise(float_flag_underflow, status); 2291 } 2292 } 2293 } 2294 if (roundBits) { 2295 status->float_exception_flags |= float_flag_inexact; 2296 } 2297 zSig = ( zSig + roundIncrement )>>7; 2298 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2299 if ( zSig == 0 ) zExp = 0; 2300 return packFloat32( zSign, zExp, zSig ); 2301 2302 } 2303 2304 /*---------------------------------------------------------------------------- 2305 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2306 | and significand `zSig', and returns the proper single-precision floating- 2307 | point value corresponding to the abstract input. This routine is just like 2308 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2309 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2310 | floating-point exponent. 2311 *----------------------------------------------------------------------------*/ 2312 2313 static float32 2314 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2315 float_status *status) 2316 { 2317 int8_t shiftCount; 2318 2319 shiftCount = countLeadingZeros32( zSig ) - 1; 2320 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2321 status); 2322 2323 } 2324 2325 /*---------------------------------------------------------------------------- 2326 | If `a' is denormal and we are in flush-to-zero mode then set the 2327 | input-denormal exception and return zero. Otherwise just return the value. 2328 *----------------------------------------------------------------------------*/ 2329 float64 float64_squash_input_denormal(float64 a, float_status *status) 2330 { 2331 if (status->flush_inputs_to_zero) { 2332 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2333 float_raise(float_flag_input_denormal, status); 2334 return make_float64(float64_val(a) & (1ULL << 63)); 2335 } 2336 } 2337 return a; 2338 } 2339 2340 /*---------------------------------------------------------------------------- 2341 | Normalizes the subnormal double-precision floating-point value represented 2342 | by the denormalized significand `aSig'. The normalized exponent and 2343 | significand are stored at the locations pointed to by `zExpPtr' and 2344 | `zSigPtr', respectively. 2345 *----------------------------------------------------------------------------*/ 2346 2347 static void 2348 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2349 { 2350 int8_t shiftCount; 2351 2352 shiftCount = countLeadingZeros64( aSig ) - 11; 2353 *zSigPtr = aSig<<shiftCount; 2354 *zExpPtr = 1 - shiftCount; 2355 2356 } 2357 2358 /*---------------------------------------------------------------------------- 2359 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2360 | double-precision floating-point value, returning the result. After being 2361 | shifted into the proper positions, the three fields are simply added 2362 | together to form the result. This means that any integer portion of `zSig' 2363 | will be added into the exponent. Since a properly normalized significand 2364 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2365 | than the desired result exponent whenever `zSig' is a complete, normalized 2366 | significand. 2367 *----------------------------------------------------------------------------*/ 2368 2369 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2370 { 2371 2372 return make_float64( 2373 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2374 2375 } 2376 2377 /*---------------------------------------------------------------------------- 2378 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2379 | and significand `zSig', and returns the proper double-precision floating- 2380 | point value corresponding to the abstract input. Ordinarily, the abstract 2381 | value is simply rounded and packed into the double-precision format, with 2382 | the inexact exception raised if the abstract input cannot be represented 2383 | exactly. However, if the abstract value is too large, the overflow and 2384 | inexact exceptions are raised and an infinity or maximal finite value is 2385 | returned. If the abstract value is too small, the input value is rounded to 2386 | a subnormal number, and the underflow and inexact exceptions are raised if 2387 | the abstract input cannot be represented exactly as a subnormal double- 2388 | precision floating-point number. 2389 | The input significand `zSig' has its binary point between bits 62 2390 | and 61, which is 10 bits to the left of the usual location. This shifted 2391 | significand must be normalized or smaller. If `zSig' is not normalized, 2392 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2393 | and it must not require rounding. In the usual case that `zSig' is 2394 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2395 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2396 | Binary Floating-Point Arithmetic. 2397 *----------------------------------------------------------------------------*/ 2398 2399 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2400 float_status *status) 2401 { 2402 int8_t roundingMode; 2403 flag roundNearestEven; 2404 int roundIncrement, roundBits; 2405 flag isTiny; 2406 2407 roundingMode = status->float_rounding_mode; 2408 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2409 switch (roundingMode) { 2410 case float_round_nearest_even: 2411 case float_round_ties_away: 2412 roundIncrement = 0x200; 2413 break; 2414 case float_round_to_zero: 2415 roundIncrement = 0; 2416 break; 2417 case float_round_up: 2418 roundIncrement = zSign ? 0 : 0x3ff; 2419 break; 2420 case float_round_down: 2421 roundIncrement = zSign ? 0x3ff : 0; 2422 break; 2423 case float_round_to_odd: 2424 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2425 break; 2426 default: 2427 abort(); 2428 } 2429 roundBits = zSig & 0x3FF; 2430 if ( 0x7FD <= (uint16_t) zExp ) { 2431 if ( ( 0x7FD < zExp ) 2432 || ( ( zExp == 0x7FD ) 2433 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2434 ) { 2435 bool overflow_to_inf = roundingMode != float_round_to_odd && 2436 roundIncrement != 0; 2437 float_raise(float_flag_overflow | float_flag_inexact, status); 2438 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2439 } 2440 if ( zExp < 0 ) { 2441 if (status->flush_to_zero) { 2442 float_raise(float_flag_output_denormal, status); 2443 return packFloat64(zSign, 0, 0); 2444 } 2445 isTiny = 2446 (status->float_detect_tininess 2447 == float_tininess_before_rounding) 2448 || ( zExp < -1 ) 2449 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2450 shift64RightJamming( zSig, - zExp, &zSig ); 2451 zExp = 0; 2452 roundBits = zSig & 0x3FF; 2453 if (isTiny && roundBits) { 2454 float_raise(float_flag_underflow, status); 2455 } 2456 if (roundingMode == float_round_to_odd) { 2457 /* 2458 * For round-to-odd case, the roundIncrement depends on 2459 * zSig which just changed. 2460 */ 2461 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2462 } 2463 } 2464 } 2465 if (roundBits) { 2466 status->float_exception_flags |= float_flag_inexact; 2467 } 2468 zSig = ( zSig + roundIncrement )>>10; 2469 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2470 if ( zSig == 0 ) zExp = 0; 2471 return packFloat64( zSign, zExp, zSig ); 2472 2473 } 2474 2475 /*---------------------------------------------------------------------------- 2476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2477 | and significand `zSig', and returns the proper double-precision floating- 2478 | point value corresponding to the abstract input. This routine is just like 2479 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2480 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2481 | floating-point exponent. 2482 *----------------------------------------------------------------------------*/ 2483 2484 static float64 2485 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2486 float_status *status) 2487 { 2488 int8_t shiftCount; 2489 2490 shiftCount = countLeadingZeros64( zSig ) - 1; 2491 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2492 status); 2493 2494 } 2495 2496 /*---------------------------------------------------------------------------- 2497 | Normalizes the subnormal extended double-precision floating-point value 2498 | represented by the denormalized significand `aSig'. The normalized exponent 2499 | and significand are stored at the locations pointed to by `zExpPtr' and 2500 | `zSigPtr', respectively. 2501 *----------------------------------------------------------------------------*/ 2502 2503 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2504 uint64_t *zSigPtr) 2505 { 2506 int8_t shiftCount; 2507 2508 shiftCount = countLeadingZeros64( aSig ); 2509 *zSigPtr = aSig<<shiftCount; 2510 *zExpPtr = 1 - shiftCount; 2511 } 2512 2513 /*---------------------------------------------------------------------------- 2514 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2515 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2516 | and returns the proper extended double-precision floating-point value 2517 | corresponding to the abstract input. Ordinarily, the abstract value is 2518 | rounded and packed into the extended double-precision format, with the 2519 | inexact exception raised if the abstract input cannot be represented 2520 | exactly. However, if the abstract value is too large, the overflow and 2521 | inexact exceptions are raised and an infinity or maximal finite value is 2522 | returned. If the abstract value is too small, the input value is rounded to 2523 | a subnormal number, and the underflow and inexact exceptions are raised if 2524 | the abstract input cannot be represented exactly as a subnormal extended 2525 | double-precision floating-point number. 2526 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2527 | number of bits as single or double precision, respectively. Otherwise, the 2528 | result is rounded to the full precision of the extended double-precision 2529 | format. 2530 | The input significand must be normalized or smaller. If the input 2531 | significand is not normalized, `zExp' must be 0; in that case, the result 2532 | returned is a subnormal number, and it must not require rounding. The 2533 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2534 | Floating-Point Arithmetic. 2535 *----------------------------------------------------------------------------*/ 2536 2537 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2538 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2539 float_status *status) 2540 { 2541 int8_t roundingMode; 2542 flag roundNearestEven, increment, isTiny; 2543 int64_t roundIncrement, roundMask, roundBits; 2544 2545 roundingMode = status->float_rounding_mode; 2546 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2547 if ( roundingPrecision == 80 ) goto precision80; 2548 if ( roundingPrecision == 64 ) { 2549 roundIncrement = LIT64( 0x0000000000000400 ); 2550 roundMask = LIT64( 0x00000000000007FF ); 2551 } 2552 else if ( roundingPrecision == 32 ) { 2553 roundIncrement = LIT64( 0x0000008000000000 ); 2554 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2555 } 2556 else { 2557 goto precision80; 2558 } 2559 zSig0 |= ( zSig1 != 0 ); 2560 switch (roundingMode) { 2561 case float_round_nearest_even: 2562 case float_round_ties_away: 2563 break; 2564 case float_round_to_zero: 2565 roundIncrement = 0; 2566 break; 2567 case float_round_up: 2568 roundIncrement = zSign ? 0 : roundMask; 2569 break; 2570 case float_round_down: 2571 roundIncrement = zSign ? roundMask : 0; 2572 break; 2573 default: 2574 abort(); 2575 } 2576 roundBits = zSig0 & roundMask; 2577 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2578 if ( ( 0x7FFE < zExp ) 2579 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2580 ) { 2581 goto overflow; 2582 } 2583 if ( zExp <= 0 ) { 2584 if (status->flush_to_zero) { 2585 float_raise(float_flag_output_denormal, status); 2586 return packFloatx80(zSign, 0, 0); 2587 } 2588 isTiny = 2589 (status->float_detect_tininess 2590 == float_tininess_before_rounding) 2591 || ( zExp < 0 ) 2592 || ( zSig0 <= zSig0 + roundIncrement ); 2593 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2594 zExp = 0; 2595 roundBits = zSig0 & roundMask; 2596 if (isTiny && roundBits) { 2597 float_raise(float_flag_underflow, status); 2598 } 2599 if (roundBits) { 2600 status->float_exception_flags |= float_flag_inexact; 2601 } 2602 zSig0 += roundIncrement; 2603 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2604 roundIncrement = roundMask + 1; 2605 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2606 roundMask |= roundIncrement; 2607 } 2608 zSig0 &= ~ roundMask; 2609 return packFloatx80( zSign, zExp, zSig0 ); 2610 } 2611 } 2612 if (roundBits) { 2613 status->float_exception_flags |= float_flag_inexact; 2614 } 2615 zSig0 += roundIncrement; 2616 if ( zSig0 < roundIncrement ) { 2617 ++zExp; 2618 zSig0 = LIT64( 0x8000000000000000 ); 2619 } 2620 roundIncrement = roundMask + 1; 2621 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2622 roundMask |= roundIncrement; 2623 } 2624 zSig0 &= ~ roundMask; 2625 if ( zSig0 == 0 ) zExp = 0; 2626 return packFloatx80( zSign, zExp, zSig0 ); 2627 precision80: 2628 switch (roundingMode) { 2629 case float_round_nearest_even: 2630 case float_round_ties_away: 2631 increment = ((int64_t)zSig1 < 0); 2632 break; 2633 case float_round_to_zero: 2634 increment = 0; 2635 break; 2636 case float_round_up: 2637 increment = !zSign && zSig1; 2638 break; 2639 case float_round_down: 2640 increment = zSign && zSig1; 2641 break; 2642 default: 2643 abort(); 2644 } 2645 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2646 if ( ( 0x7FFE < zExp ) 2647 || ( ( zExp == 0x7FFE ) 2648 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2649 && increment 2650 ) 2651 ) { 2652 roundMask = 0; 2653 overflow: 2654 float_raise(float_flag_overflow | float_flag_inexact, status); 2655 if ( ( roundingMode == float_round_to_zero ) 2656 || ( zSign && ( roundingMode == float_round_up ) ) 2657 || ( ! zSign && ( roundingMode == float_round_down ) ) 2658 ) { 2659 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2660 } 2661 return packFloatx80(zSign, 2662 floatx80_infinity_high, 2663 floatx80_infinity_low); 2664 } 2665 if ( zExp <= 0 ) { 2666 isTiny = 2667 (status->float_detect_tininess 2668 == float_tininess_before_rounding) 2669 || ( zExp < 0 ) 2670 || ! increment 2671 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2672 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2673 zExp = 0; 2674 if (isTiny && zSig1) { 2675 float_raise(float_flag_underflow, status); 2676 } 2677 if (zSig1) { 2678 status->float_exception_flags |= float_flag_inexact; 2679 } 2680 switch (roundingMode) { 2681 case float_round_nearest_even: 2682 case float_round_ties_away: 2683 increment = ((int64_t)zSig1 < 0); 2684 break; 2685 case float_round_to_zero: 2686 increment = 0; 2687 break; 2688 case float_round_up: 2689 increment = !zSign && zSig1; 2690 break; 2691 case float_round_down: 2692 increment = zSign && zSig1; 2693 break; 2694 default: 2695 abort(); 2696 } 2697 if ( increment ) { 2698 ++zSig0; 2699 zSig0 &= 2700 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2701 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2702 } 2703 return packFloatx80( zSign, zExp, zSig0 ); 2704 } 2705 } 2706 if (zSig1) { 2707 status->float_exception_flags |= float_flag_inexact; 2708 } 2709 if ( increment ) { 2710 ++zSig0; 2711 if ( zSig0 == 0 ) { 2712 ++zExp; 2713 zSig0 = LIT64( 0x8000000000000000 ); 2714 } 2715 else { 2716 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2717 } 2718 } 2719 else { 2720 if ( zSig0 == 0 ) zExp = 0; 2721 } 2722 return packFloatx80( zSign, zExp, zSig0 ); 2723 2724 } 2725 2726 /*---------------------------------------------------------------------------- 2727 | Takes an abstract floating-point value having sign `zSign', exponent 2728 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2729 | and returns the proper extended double-precision floating-point value 2730 | corresponding to the abstract input. This routine is just like 2731 | `roundAndPackFloatx80' except that the input significand does not have to be 2732 | normalized. 2733 *----------------------------------------------------------------------------*/ 2734 2735 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2736 flag zSign, int32_t zExp, 2737 uint64_t zSig0, uint64_t zSig1, 2738 float_status *status) 2739 { 2740 int8_t shiftCount; 2741 2742 if ( zSig0 == 0 ) { 2743 zSig0 = zSig1; 2744 zSig1 = 0; 2745 zExp -= 64; 2746 } 2747 shiftCount = countLeadingZeros64( zSig0 ); 2748 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2749 zExp -= shiftCount; 2750 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2751 zSig0, zSig1, status); 2752 2753 } 2754 2755 /*---------------------------------------------------------------------------- 2756 | Returns the least-significant 64 fraction bits of the quadruple-precision 2757 | floating-point value `a'. 2758 *----------------------------------------------------------------------------*/ 2759 2760 static inline uint64_t extractFloat128Frac1( float128 a ) 2761 { 2762 2763 return a.low; 2764 2765 } 2766 2767 /*---------------------------------------------------------------------------- 2768 | Returns the most-significant 48 fraction bits of the quadruple-precision 2769 | floating-point value `a'. 2770 *----------------------------------------------------------------------------*/ 2771 2772 static inline uint64_t extractFloat128Frac0( float128 a ) 2773 { 2774 2775 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2776 2777 } 2778 2779 /*---------------------------------------------------------------------------- 2780 | Returns the exponent bits of the quadruple-precision floating-point value 2781 | `a'. 2782 *----------------------------------------------------------------------------*/ 2783 2784 static inline int32_t extractFloat128Exp( float128 a ) 2785 { 2786 2787 return ( a.high>>48 ) & 0x7FFF; 2788 2789 } 2790 2791 /*---------------------------------------------------------------------------- 2792 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2793 *----------------------------------------------------------------------------*/ 2794 2795 static inline flag extractFloat128Sign( float128 a ) 2796 { 2797 2798 return a.high>>63; 2799 2800 } 2801 2802 /*---------------------------------------------------------------------------- 2803 | Normalizes the subnormal quadruple-precision floating-point value 2804 | represented by the denormalized significand formed by the concatenation of 2805 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2806 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2807 | significand are stored at the location pointed to by `zSig0Ptr', and the 2808 | least significant 64 bits of the normalized significand are stored at the 2809 | location pointed to by `zSig1Ptr'. 2810 *----------------------------------------------------------------------------*/ 2811 2812 static void 2813 normalizeFloat128Subnormal( 2814 uint64_t aSig0, 2815 uint64_t aSig1, 2816 int32_t *zExpPtr, 2817 uint64_t *zSig0Ptr, 2818 uint64_t *zSig1Ptr 2819 ) 2820 { 2821 int8_t shiftCount; 2822 2823 if ( aSig0 == 0 ) { 2824 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2825 if ( shiftCount < 0 ) { 2826 *zSig0Ptr = aSig1>>( - shiftCount ); 2827 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2828 } 2829 else { 2830 *zSig0Ptr = aSig1<<shiftCount; 2831 *zSig1Ptr = 0; 2832 } 2833 *zExpPtr = - shiftCount - 63; 2834 } 2835 else { 2836 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2837 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2838 *zExpPtr = 1 - shiftCount; 2839 } 2840 2841 } 2842 2843 /*---------------------------------------------------------------------------- 2844 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2845 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2846 | floating-point value, returning the result. After being shifted into the 2847 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2848 | added together to form the most significant 32 bits of the result. This 2849 | means that any integer portion of `zSig0' will be added into the exponent. 2850 | Since a properly normalized significand will have an integer portion equal 2851 | to 1, the `zExp' input should be 1 less than the desired result exponent 2852 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2853 | significand. 2854 *----------------------------------------------------------------------------*/ 2855 2856 static inline float128 2857 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2858 { 2859 float128 z; 2860 2861 z.low = zSig1; 2862 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2863 return z; 2864 2865 } 2866 2867 /*---------------------------------------------------------------------------- 2868 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2869 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2870 | and `zSig2', and returns the proper quadruple-precision floating-point value 2871 | corresponding to the abstract input. Ordinarily, the abstract value is 2872 | simply rounded and packed into the quadruple-precision format, with the 2873 | inexact exception raised if the abstract input cannot be represented 2874 | exactly. However, if the abstract value is too large, the overflow and 2875 | inexact exceptions are raised and an infinity or maximal finite value is 2876 | returned. If the abstract value is too small, the input value is rounded to 2877 | a subnormal number, and the underflow and inexact exceptions are raised if 2878 | the abstract input cannot be represented exactly as a subnormal quadruple- 2879 | precision floating-point number. 2880 | The input significand must be normalized or smaller. If the input 2881 | significand is not normalized, `zExp' must be 0; in that case, the result 2882 | returned is a subnormal number, and it must not require rounding. In the 2883 | usual case that the input significand is normalized, `zExp' must be 1 less 2884 | than the ``true'' floating-point exponent. The handling of underflow and 2885 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2886 *----------------------------------------------------------------------------*/ 2887 2888 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2889 uint64_t zSig0, uint64_t zSig1, 2890 uint64_t zSig2, float_status *status) 2891 { 2892 int8_t roundingMode; 2893 flag roundNearestEven, increment, isTiny; 2894 2895 roundingMode = status->float_rounding_mode; 2896 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2897 switch (roundingMode) { 2898 case float_round_nearest_even: 2899 case float_round_ties_away: 2900 increment = ((int64_t)zSig2 < 0); 2901 break; 2902 case float_round_to_zero: 2903 increment = 0; 2904 break; 2905 case float_round_up: 2906 increment = !zSign && zSig2; 2907 break; 2908 case float_round_down: 2909 increment = zSign && zSig2; 2910 break; 2911 case float_round_to_odd: 2912 increment = !(zSig1 & 0x1) && zSig2; 2913 break; 2914 default: 2915 abort(); 2916 } 2917 if ( 0x7FFD <= (uint32_t) zExp ) { 2918 if ( ( 0x7FFD < zExp ) 2919 || ( ( zExp == 0x7FFD ) 2920 && eq128( 2921 LIT64( 0x0001FFFFFFFFFFFF ), 2922 LIT64( 0xFFFFFFFFFFFFFFFF ), 2923 zSig0, 2924 zSig1 2925 ) 2926 && increment 2927 ) 2928 ) { 2929 float_raise(float_flag_overflow | float_flag_inexact, status); 2930 if ( ( roundingMode == float_round_to_zero ) 2931 || ( zSign && ( roundingMode == float_round_up ) ) 2932 || ( ! zSign && ( roundingMode == float_round_down ) ) 2933 || (roundingMode == float_round_to_odd) 2934 ) { 2935 return 2936 packFloat128( 2937 zSign, 2938 0x7FFE, 2939 LIT64( 0x0000FFFFFFFFFFFF ), 2940 LIT64( 0xFFFFFFFFFFFFFFFF ) 2941 ); 2942 } 2943 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2944 } 2945 if ( zExp < 0 ) { 2946 if (status->flush_to_zero) { 2947 float_raise(float_flag_output_denormal, status); 2948 return packFloat128(zSign, 0, 0, 0); 2949 } 2950 isTiny = 2951 (status->float_detect_tininess 2952 == float_tininess_before_rounding) 2953 || ( zExp < -1 ) 2954 || ! increment 2955 || lt128( 2956 zSig0, 2957 zSig1, 2958 LIT64( 0x0001FFFFFFFFFFFF ), 2959 LIT64( 0xFFFFFFFFFFFFFFFF ) 2960 ); 2961 shift128ExtraRightJamming( 2962 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2963 zExp = 0; 2964 if (isTiny && zSig2) { 2965 float_raise(float_flag_underflow, status); 2966 } 2967 switch (roundingMode) { 2968 case float_round_nearest_even: 2969 case float_round_ties_away: 2970 increment = ((int64_t)zSig2 < 0); 2971 break; 2972 case float_round_to_zero: 2973 increment = 0; 2974 break; 2975 case float_round_up: 2976 increment = !zSign && zSig2; 2977 break; 2978 case float_round_down: 2979 increment = zSign && zSig2; 2980 break; 2981 case float_round_to_odd: 2982 increment = !(zSig1 & 0x1) && zSig2; 2983 break; 2984 default: 2985 abort(); 2986 } 2987 } 2988 } 2989 if (zSig2) { 2990 status->float_exception_flags |= float_flag_inexact; 2991 } 2992 if ( increment ) { 2993 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2994 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2995 } 2996 else { 2997 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2998 } 2999 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3000 3001 } 3002 3003 /*---------------------------------------------------------------------------- 3004 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3005 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3006 | returns the proper quadruple-precision floating-point value corresponding 3007 | to the abstract input. This routine is just like `roundAndPackFloat128' 3008 | except that the input significand has fewer bits and does not have to be 3009 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 3010 | point exponent. 3011 *----------------------------------------------------------------------------*/ 3012 3013 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3014 uint64_t zSig0, uint64_t zSig1, 3015 float_status *status) 3016 { 3017 int8_t shiftCount; 3018 uint64_t zSig2; 3019 3020 if ( zSig0 == 0 ) { 3021 zSig0 = zSig1; 3022 zSig1 = 0; 3023 zExp -= 64; 3024 } 3025 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3026 if ( 0 <= shiftCount ) { 3027 zSig2 = 0; 3028 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3029 } 3030 else { 3031 shift128ExtraRightJamming( 3032 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3033 } 3034 zExp -= shiftCount; 3035 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3036 3037 } 3038 3039 3040 /*---------------------------------------------------------------------------- 3041 | Returns the result of converting the 32-bit two's complement integer `a' 3042 | to the extended double-precision floating-point format. The conversion 3043 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3044 | Arithmetic. 3045 *----------------------------------------------------------------------------*/ 3046 3047 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3048 { 3049 flag zSign; 3050 uint32_t absA; 3051 int8_t shiftCount; 3052 uint64_t zSig; 3053 3054 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3055 zSign = ( a < 0 ); 3056 absA = zSign ? - a : a; 3057 shiftCount = countLeadingZeros32( absA ) + 32; 3058 zSig = absA; 3059 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3060 3061 } 3062 3063 /*---------------------------------------------------------------------------- 3064 | Returns the result of converting the 32-bit two's complement integer `a' to 3065 | the quadruple-precision floating-point format. The conversion is performed 3066 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3067 *----------------------------------------------------------------------------*/ 3068 3069 float128 int32_to_float128(int32_t a, float_status *status) 3070 { 3071 flag zSign; 3072 uint32_t absA; 3073 int8_t shiftCount; 3074 uint64_t zSig0; 3075 3076 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3077 zSign = ( a < 0 ); 3078 absA = zSign ? - a : a; 3079 shiftCount = countLeadingZeros32( absA ) + 17; 3080 zSig0 = absA; 3081 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3082 3083 } 3084 3085 /*---------------------------------------------------------------------------- 3086 | Returns the result of converting the 64-bit two's complement integer `a' 3087 | to the extended double-precision floating-point format. The conversion 3088 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3089 | Arithmetic. 3090 *----------------------------------------------------------------------------*/ 3091 3092 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3093 { 3094 flag zSign; 3095 uint64_t absA; 3096 int8_t shiftCount; 3097 3098 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3099 zSign = ( a < 0 ); 3100 absA = zSign ? - a : a; 3101 shiftCount = countLeadingZeros64( absA ); 3102 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3103 3104 } 3105 3106 /*---------------------------------------------------------------------------- 3107 | Returns the result of converting the 64-bit two's complement integer `a' to 3108 | the quadruple-precision floating-point format. The conversion is performed 3109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3110 *----------------------------------------------------------------------------*/ 3111 3112 float128 int64_to_float128(int64_t a, float_status *status) 3113 { 3114 flag zSign; 3115 uint64_t absA; 3116 int8_t shiftCount; 3117 int32_t zExp; 3118 uint64_t zSig0, zSig1; 3119 3120 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3121 zSign = ( a < 0 ); 3122 absA = zSign ? - a : a; 3123 shiftCount = countLeadingZeros64( absA ) + 49; 3124 zExp = 0x406E - shiftCount; 3125 if ( 64 <= shiftCount ) { 3126 zSig1 = 0; 3127 zSig0 = absA; 3128 shiftCount -= 64; 3129 } 3130 else { 3131 zSig1 = absA; 3132 zSig0 = 0; 3133 } 3134 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3135 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3136 3137 } 3138 3139 /*---------------------------------------------------------------------------- 3140 | Returns the result of converting the 64-bit unsigned integer `a' 3141 | to the quadruple-precision floating-point format. The conversion is performed 3142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3143 *----------------------------------------------------------------------------*/ 3144 3145 float128 uint64_to_float128(uint64_t a, float_status *status) 3146 { 3147 if (a == 0) { 3148 return float128_zero; 3149 } 3150 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3151 } 3152 3153 3154 3155 3156 /*---------------------------------------------------------------------------- 3157 | Returns the result of converting the single-precision floating-point value 3158 | `a' to the double-precision floating-point format. The conversion is 3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3160 | Arithmetic. 3161 *----------------------------------------------------------------------------*/ 3162 3163 float64 float32_to_float64(float32 a, float_status *status) 3164 { 3165 flag aSign; 3166 int aExp; 3167 uint32_t aSig; 3168 a = float32_squash_input_denormal(a, status); 3169 3170 aSig = extractFloat32Frac( a ); 3171 aExp = extractFloat32Exp( a ); 3172 aSign = extractFloat32Sign( a ); 3173 if ( aExp == 0xFF ) { 3174 if (aSig) { 3175 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3176 } 3177 return packFloat64( aSign, 0x7FF, 0 ); 3178 } 3179 if ( aExp == 0 ) { 3180 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3181 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3182 --aExp; 3183 } 3184 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3185 3186 } 3187 3188 /*---------------------------------------------------------------------------- 3189 | Returns the result of converting the single-precision floating-point value 3190 | `a' to the extended double-precision floating-point format. The conversion 3191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3192 | Arithmetic. 3193 *----------------------------------------------------------------------------*/ 3194 3195 floatx80 float32_to_floatx80(float32 a, float_status *status) 3196 { 3197 flag aSign; 3198 int aExp; 3199 uint32_t aSig; 3200 3201 a = float32_squash_input_denormal(a, status); 3202 aSig = extractFloat32Frac( a ); 3203 aExp = extractFloat32Exp( a ); 3204 aSign = extractFloat32Sign( a ); 3205 if ( aExp == 0xFF ) { 3206 if (aSig) { 3207 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3208 } 3209 return packFloatx80(aSign, 3210 floatx80_infinity_high, 3211 floatx80_infinity_low); 3212 } 3213 if ( aExp == 0 ) { 3214 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3215 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3216 } 3217 aSig |= 0x00800000; 3218 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3219 3220 } 3221 3222 /*---------------------------------------------------------------------------- 3223 | Returns the result of converting the single-precision floating-point value 3224 | `a' to the double-precision floating-point format. The conversion is 3225 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3226 | Arithmetic. 3227 *----------------------------------------------------------------------------*/ 3228 3229 float128 float32_to_float128(float32 a, float_status *status) 3230 { 3231 flag aSign; 3232 int aExp; 3233 uint32_t aSig; 3234 3235 a = float32_squash_input_denormal(a, status); 3236 aSig = extractFloat32Frac( a ); 3237 aExp = extractFloat32Exp( a ); 3238 aSign = extractFloat32Sign( a ); 3239 if ( aExp == 0xFF ) { 3240 if (aSig) { 3241 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3242 } 3243 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3244 } 3245 if ( aExp == 0 ) { 3246 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3247 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3248 --aExp; 3249 } 3250 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3251 3252 } 3253 3254 /*---------------------------------------------------------------------------- 3255 | Returns the remainder of the single-precision floating-point value `a' 3256 | with respect to the corresponding value `b'. The operation is performed 3257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3258 *----------------------------------------------------------------------------*/ 3259 3260 float32 float32_rem(float32 a, float32 b, float_status *status) 3261 { 3262 flag aSign, zSign; 3263 int aExp, bExp, expDiff; 3264 uint32_t aSig, bSig; 3265 uint32_t q; 3266 uint64_t aSig64, bSig64, q64; 3267 uint32_t alternateASig; 3268 int32_t sigMean; 3269 a = float32_squash_input_denormal(a, status); 3270 b = float32_squash_input_denormal(b, status); 3271 3272 aSig = extractFloat32Frac( a ); 3273 aExp = extractFloat32Exp( a ); 3274 aSign = extractFloat32Sign( a ); 3275 bSig = extractFloat32Frac( b ); 3276 bExp = extractFloat32Exp( b ); 3277 if ( aExp == 0xFF ) { 3278 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3279 return propagateFloat32NaN(a, b, status); 3280 } 3281 float_raise(float_flag_invalid, status); 3282 return float32_default_nan(status); 3283 } 3284 if ( bExp == 0xFF ) { 3285 if (bSig) { 3286 return propagateFloat32NaN(a, b, status); 3287 } 3288 return a; 3289 } 3290 if ( bExp == 0 ) { 3291 if ( bSig == 0 ) { 3292 float_raise(float_flag_invalid, status); 3293 return float32_default_nan(status); 3294 } 3295 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3296 } 3297 if ( aExp == 0 ) { 3298 if ( aSig == 0 ) return a; 3299 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3300 } 3301 expDiff = aExp - bExp; 3302 aSig |= 0x00800000; 3303 bSig |= 0x00800000; 3304 if ( expDiff < 32 ) { 3305 aSig <<= 8; 3306 bSig <<= 8; 3307 if ( expDiff < 0 ) { 3308 if ( expDiff < -1 ) return a; 3309 aSig >>= 1; 3310 } 3311 q = ( bSig <= aSig ); 3312 if ( q ) aSig -= bSig; 3313 if ( 0 < expDiff ) { 3314 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3315 q >>= 32 - expDiff; 3316 bSig >>= 2; 3317 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3318 } 3319 else { 3320 aSig >>= 2; 3321 bSig >>= 2; 3322 } 3323 } 3324 else { 3325 if ( bSig <= aSig ) aSig -= bSig; 3326 aSig64 = ( (uint64_t) aSig )<<40; 3327 bSig64 = ( (uint64_t) bSig )<<40; 3328 expDiff -= 64; 3329 while ( 0 < expDiff ) { 3330 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3331 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3332 aSig64 = - ( ( bSig * q64 )<<38 ); 3333 expDiff -= 62; 3334 } 3335 expDiff += 64; 3336 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3337 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3338 q = q64>>( 64 - expDiff ); 3339 bSig <<= 6; 3340 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3341 } 3342 do { 3343 alternateASig = aSig; 3344 ++q; 3345 aSig -= bSig; 3346 } while ( 0 <= (int32_t) aSig ); 3347 sigMean = aSig + alternateASig; 3348 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3349 aSig = alternateASig; 3350 } 3351 zSign = ( (int32_t) aSig < 0 ); 3352 if ( zSign ) aSig = - aSig; 3353 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3354 } 3355 3356 3357 3358 /*---------------------------------------------------------------------------- 3359 | Returns the binary exponential of the single-precision floating-point value 3360 | `a'. The operation is performed according to the IEC/IEEE Standard for 3361 | Binary Floating-Point Arithmetic. 3362 | 3363 | Uses the following identities: 3364 | 3365 | 1. ------------------------------------------------------------------------- 3366 | x x*ln(2) 3367 | 2 = e 3368 | 3369 | 2. ------------------------------------------------------------------------- 3370 | 2 3 4 5 n 3371 | x x x x x x x 3372 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3373 | 1! 2! 3! 4! 5! n! 3374 *----------------------------------------------------------------------------*/ 3375 3376 static const float64 float32_exp2_coefficients[15] = 3377 { 3378 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3379 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3380 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3381 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3382 const_float64( 0x3f81111111111111ll ), /* 5 */ 3383 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3384 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3385 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3386 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3387 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3388 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3389 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3390 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3391 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3392 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3393 }; 3394 3395 float32 float32_exp2(float32 a, float_status *status) 3396 { 3397 flag aSign; 3398 int aExp; 3399 uint32_t aSig; 3400 float64 r, x, xn; 3401 int i; 3402 a = float32_squash_input_denormal(a, status); 3403 3404 aSig = extractFloat32Frac( a ); 3405 aExp = extractFloat32Exp( a ); 3406 aSign = extractFloat32Sign( a ); 3407 3408 if ( aExp == 0xFF) { 3409 if (aSig) { 3410 return propagateFloat32NaN(a, float32_zero, status); 3411 } 3412 return (aSign) ? float32_zero : a; 3413 } 3414 if (aExp == 0) { 3415 if (aSig == 0) return float32_one; 3416 } 3417 3418 float_raise(float_flag_inexact, status); 3419 3420 /* ******************************* */ 3421 /* using float64 for approximation */ 3422 /* ******************************* */ 3423 x = float32_to_float64(a, status); 3424 x = float64_mul(x, float64_ln2, status); 3425 3426 xn = x; 3427 r = float64_one; 3428 for (i = 0 ; i < 15 ; i++) { 3429 float64 f; 3430 3431 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3432 r = float64_add(r, f, status); 3433 3434 xn = float64_mul(xn, x, status); 3435 } 3436 3437 return float64_to_float32(r, status); 3438 } 3439 3440 /*---------------------------------------------------------------------------- 3441 | Returns the binary log of the single-precision floating-point value `a'. 3442 | The operation is performed according to the IEC/IEEE Standard for Binary 3443 | Floating-Point Arithmetic. 3444 *----------------------------------------------------------------------------*/ 3445 float32 float32_log2(float32 a, float_status *status) 3446 { 3447 flag aSign, zSign; 3448 int aExp; 3449 uint32_t aSig, zSig, i; 3450 3451 a = float32_squash_input_denormal(a, status); 3452 aSig = extractFloat32Frac( a ); 3453 aExp = extractFloat32Exp( a ); 3454 aSign = extractFloat32Sign( a ); 3455 3456 if ( aExp == 0 ) { 3457 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3458 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3459 } 3460 if ( aSign ) { 3461 float_raise(float_flag_invalid, status); 3462 return float32_default_nan(status); 3463 } 3464 if ( aExp == 0xFF ) { 3465 if (aSig) { 3466 return propagateFloat32NaN(a, float32_zero, status); 3467 } 3468 return a; 3469 } 3470 3471 aExp -= 0x7F; 3472 aSig |= 0x00800000; 3473 zSign = aExp < 0; 3474 zSig = aExp << 23; 3475 3476 for (i = 1 << 22; i > 0; i >>= 1) { 3477 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3478 if ( aSig & 0x01000000 ) { 3479 aSig >>= 1; 3480 zSig |= i; 3481 } 3482 } 3483 3484 if ( zSign ) 3485 zSig = -zSig; 3486 3487 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3488 } 3489 3490 /*---------------------------------------------------------------------------- 3491 | Returns 1 if the single-precision floating-point value `a' is equal to 3492 | the corresponding value `b', and 0 otherwise. The invalid exception is 3493 | raised if either operand is a NaN. Otherwise, the comparison is performed 3494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3495 *----------------------------------------------------------------------------*/ 3496 3497 int float32_eq(float32 a, float32 b, float_status *status) 3498 { 3499 uint32_t av, bv; 3500 a = float32_squash_input_denormal(a, status); 3501 b = float32_squash_input_denormal(b, status); 3502 3503 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3504 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3505 ) { 3506 float_raise(float_flag_invalid, status); 3507 return 0; 3508 } 3509 av = float32_val(a); 3510 bv = float32_val(b); 3511 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3512 } 3513 3514 /*---------------------------------------------------------------------------- 3515 | Returns 1 if the single-precision floating-point value `a' is less than 3516 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3517 | exception is raised if either operand is a NaN. The comparison is performed 3518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3519 *----------------------------------------------------------------------------*/ 3520 3521 int float32_le(float32 a, float32 b, float_status *status) 3522 { 3523 flag aSign, bSign; 3524 uint32_t av, bv; 3525 a = float32_squash_input_denormal(a, status); 3526 b = float32_squash_input_denormal(b, status); 3527 3528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3530 ) { 3531 float_raise(float_flag_invalid, status); 3532 return 0; 3533 } 3534 aSign = extractFloat32Sign( a ); 3535 bSign = extractFloat32Sign( b ); 3536 av = float32_val(a); 3537 bv = float32_val(b); 3538 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3539 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3540 3541 } 3542 3543 /*---------------------------------------------------------------------------- 3544 | Returns 1 if the single-precision floating-point value `a' is less than 3545 | the corresponding value `b', and 0 otherwise. The invalid exception is 3546 | raised if either operand is a NaN. The comparison is performed according 3547 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3548 *----------------------------------------------------------------------------*/ 3549 3550 int float32_lt(float32 a, float32 b, float_status *status) 3551 { 3552 flag aSign, bSign; 3553 uint32_t av, bv; 3554 a = float32_squash_input_denormal(a, status); 3555 b = float32_squash_input_denormal(b, status); 3556 3557 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3558 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3559 ) { 3560 float_raise(float_flag_invalid, status); 3561 return 0; 3562 } 3563 aSign = extractFloat32Sign( a ); 3564 bSign = extractFloat32Sign( b ); 3565 av = float32_val(a); 3566 bv = float32_val(b); 3567 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3568 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3569 3570 } 3571 3572 /*---------------------------------------------------------------------------- 3573 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3574 | be compared, and 0 otherwise. The invalid exception is raised if either 3575 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3576 | Standard for Binary Floating-Point Arithmetic. 3577 *----------------------------------------------------------------------------*/ 3578 3579 int float32_unordered(float32 a, float32 b, float_status *status) 3580 { 3581 a = float32_squash_input_denormal(a, status); 3582 b = float32_squash_input_denormal(b, status); 3583 3584 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3585 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3586 ) { 3587 float_raise(float_flag_invalid, status); 3588 return 1; 3589 } 3590 return 0; 3591 } 3592 3593 /*---------------------------------------------------------------------------- 3594 | Returns 1 if the single-precision floating-point value `a' is equal to 3595 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3596 | exception. The comparison is performed according to the IEC/IEEE Standard 3597 | for Binary Floating-Point Arithmetic. 3598 *----------------------------------------------------------------------------*/ 3599 3600 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3601 { 3602 a = float32_squash_input_denormal(a, status); 3603 b = float32_squash_input_denormal(b, status); 3604 3605 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3606 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3607 ) { 3608 if (float32_is_signaling_nan(a, status) 3609 || float32_is_signaling_nan(b, status)) { 3610 float_raise(float_flag_invalid, status); 3611 } 3612 return 0; 3613 } 3614 return ( float32_val(a) == float32_val(b) ) || 3615 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3616 } 3617 3618 /*---------------------------------------------------------------------------- 3619 | Returns 1 if the single-precision floating-point value `a' is less than or 3620 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3621 | cause an exception. Otherwise, the comparison is performed according to the 3622 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3623 *----------------------------------------------------------------------------*/ 3624 3625 int float32_le_quiet(float32 a, float32 b, float_status *status) 3626 { 3627 flag aSign, bSign; 3628 uint32_t av, bv; 3629 a = float32_squash_input_denormal(a, status); 3630 b = float32_squash_input_denormal(b, status); 3631 3632 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3633 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3634 ) { 3635 if (float32_is_signaling_nan(a, status) 3636 || float32_is_signaling_nan(b, status)) { 3637 float_raise(float_flag_invalid, status); 3638 } 3639 return 0; 3640 } 3641 aSign = extractFloat32Sign( a ); 3642 bSign = extractFloat32Sign( b ); 3643 av = float32_val(a); 3644 bv = float32_val(b); 3645 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3646 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3647 3648 } 3649 3650 /*---------------------------------------------------------------------------- 3651 | Returns 1 if the single-precision floating-point value `a' is less than 3652 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3653 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3654 | Standard for Binary Floating-Point Arithmetic. 3655 *----------------------------------------------------------------------------*/ 3656 3657 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3658 { 3659 flag aSign, bSign; 3660 uint32_t av, bv; 3661 a = float32_squash_input_denormal(a, status); 3662 b = float32_squash_input_denormal(b, status); 3663 3664 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3665 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3666 ) { 3667 if (float32_is_signaling_nan(a, status) 3668 || float32_is_signaling_nan(b, status)) { 3669 float_raise(float_flag_invalid, status); 3670 } 3671 return 0; 3672 } 3673 aSign = extractFloat32Sign( a ); 3674 bSign = extractFloat32Sign( b ); 3675 av = float32_val(a); 3676 bv = float32_val(b); 3677 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3678 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3679 3680 } 3681 3682 /*---------------------------------------------------------------------------- 3683 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3684 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3685 | comparison is performed according to the IEC/IEEE Standard for Binary 3686 | Floating-Point Arithmetic. 3687 *----------------------------------------------------------------------------*/ 3688 3689 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3690 { 3691 a = float32_squash_input_denormal(a, status); 3692 b = float32_squash_input_denormal(b, status); 3693 3694 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3695 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3696 ) { 3697 if (float32_is_signaling_nan(a, status) 3698 || float32_is_signaling_nan(b, status)) { 3699 float_raise(float_flag_invalid, status); 3700 } 3701 return 1; 3702 } 3703 return 0; 3704 } 3705 3706 3707 /*---------------------------------------------------------------------------- 3708 | Returns the result of converting the double-precision floating-point value 3709 | `a' to the single-precision floating-point format. The conversion is 3710 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3711 | Arithmetic. 3712 *----------------------------------------------------------------------------*/ 3713 3714 float32 float64_to_float32(float64 a, float_status *status) 3715 { 3716 flag aSign; 3717 int aExp; 3718 uint64_t aSig; 3719 uint32_t zSig; 3720 a = float64_squash_input_denormal(a, status); 3721 3722 aSig = extractFloat64Frac( a ); 3723 aExp = extractFloat64Exp( a ); 3724 aSign = extractFloat64Sign( a ); 3725 if ( aExp == 0x7FF ) { 3726 if (aSig) { 3727 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3728 } 3729 return packFloat32( aSign, 0xFF, 0 ); 3730 } 3731 shift64RightJamming( aSig, 22, &aSig ); 3732 zSig = aSig; 3733 if ( aExp || zSig ) { 3734 zSig |= 0x40000000; 3735 aExp -= 0x381; 3736 } 3737 return roundAndPackFloat32(aSign, aExp, zSig, status); 3738 3739 } 3740 3741 3742 /*---------------------------------------------------------------------------- 3743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3744 | half-precision floating-point value, returning the result. After being 3745 | shifted into the proper positions, the three fields are simply added 3746 | together to form the result. This means that any integer portion of `zSig' 3747 | will be added into the exponent. Since a properly normalized significand 3748 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3749 | than the desired result exponent whenever `zSig' is a complete, normalized 3750 | significand. 3751 *----------------------------------------------------------------------------*/ 3752 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3753 { 3754 return make_float16( 3755 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3756 } 3757 3758 /*---------------------------------------------------------------------------- 3759 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3760 | and significand `zSig', and returns the proper half-precision floating- 3761 | point value corresponding to the abstract input. Ordinarily, the abstract 3762 | value is simply rounded and packed into the half-precision format, with 3763 | the inexact exception raised if the abstract input cannot be represented 3764 | exactly. However, if the abstract value is too large, the overflow and 3765 | inexact exceptions are raised and an infinity or maximal finite value is 3766 | returned. If the abstract value is too small, the input value is rounded to 3767 | a subnormal number, and the underflow and inexact exceptions are raised if 3768 | the abstract input cannot be represented exactly as a subnormal half- 3769 | precision floating-point number. 3770 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3771 | ARM-style "alternative representation", which omits the NaN and Inf 3772 | encodings in order to raise the maximum representable exponent by one. 3773 | The input significand `zSig' has its binary point between bits 22 3774 | and 23, which is 13 bits to the left of the usual location. This shifted 3775 | significand must be normalized or smaller. If `zSig' is not normalized, 3776 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3777 | and it must not require rounding. In the usual case that `zSig' is 3778 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3779 | Note the slightly odd position of the binary point in zSig compared with the 3780 | other roundAndPackFloat functions. This should probably be fixed if we 3781 | need to implement more float16 routines than just conversion. 3782 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3783 | Binary Floating-Point Arithmetic. 3784 *----------------------------------------------------------------------------*/ 3785 3786 static float16 roundAndPackFloat16(flag zSign, int zExp, 3787 uint32_t zSig, flag ieee, 3788 float_status *status) 3789 { 3790 int maxexp = ieee ? 29 : 30; 3791 uint32_t mask; 3792 uint32_t increment; 3793 bool rounding_bumps_exp; 3794 bool is_tiny = false; 3795 3796 /* Calculate the mask of bits of the mantissa which are not 3797 * representable in half-precision and will be lost. 3798 */ 3799 if (zExp < 1) { 3800 /* Will be denormal in halfprec */ 3801 mask = 0x00ffffff; 3802 if (zExp >= -11) { 3803 mask >>= 11 + zExp; 3804 } 3805 } else { 3806 /* Normal number in halfprec */ 3807 mask = 0x00001fff; 3808 } 3809 3810 switch (status->float_rounding_mode) { 3811 case float_round_nearest_even: 3812 increment = (mask + 1) >> 1; 3813 if ((zSig & mask) == increment) { 3814 increment = zSig & (increment << 1); 3815 } 3816 break; 3817 case float_round_ties_away: 3818 increment = (mask + 1) >> 1; 3819 break; 3820 case float_round_up: 3821 increment = zSign ? 0 : mask; 3822 break; 3823 case float_round_down: 3824 increment = zSign ? mask : 0; 3825 break; 3826 default: /* round_to_zero */ 3827 increment = 0; 3828 break; 3829 } 3830 3831 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3832 3833 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3834 if (ieee) { 3835 float_raise(float_flag_overflow | float_flag_inexact, status); 3836 return packFloat16(zSign, 0x1f, 0); 3837 } else { 3838 float_raise(float_flag_invalid, status); 3839 return packFloat16(zSign, 0x1f, 0x3ff); 3840 } 3841 } 3842 3843 if (zExp < 0) { 3844 /* Note that flush-to-zero does not affect half-precision results */ 3845 is_tiny = 3846 (status->float_detect_tininess == float_tininess_before_rounding) 3847 || (zExp < -1) 3848 || (!rounding_bumps_exp); 3849 } 3850 if (zSig & mask) { 3851 float_raise(float_flag_inexact, status); 3852 if (is_tiny) { 3853 float_raise(float_flag_underflow, status); 3854 } 3855 } 3856 3857 zSig += increment; 3858 if (rounding_bumps_exp) { 3859 zSig >>= 1; 3860 zExp++; 3861 } 3862 3863 if (zExp < -10) { 3864 return packFloat16(zSign, 0, 0); 3865 } 3866 if (zExp < 0) { 3867 zSig >>= -zExp; 3868 zExp = 0; 3869 } 3870 return packFloat16(zSign, zExp, zSig >> 13); 3871 } 3872 3873 /*---------------------------------------------------------------------------- 3874 | If `a' is denormal and we are in flush-to-zero mode then set the 3875 | input-denormal exception and return zero. Otherwise just return the value. 3876 *----------------------------------------------------------------------------*/ 3877 float16 float16_squash_input_denormal(float16 a, float_status *status) 3878 { 3879 if (status->flush_inputs_to_zero) { 3880 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3881 float_raise(float_flag_input_denormal, status); 3882 return make_float16(float16_val(a) & 0x8000); 3883 } 3884 } 3885 return a; 3886 } 3887 3888 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3889 uint32_t *zSigPtr) 3890 { 3891 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3892 *zSigPtr = aSig << shiftCount; 3893 *zExpPtr = 1 - shiftCount; 3894 } 3895 3896 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3897 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3898 3899 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3900 { 3901 flag aSign; 3902 int aExp; 3903 uint32_t aSig; 3904 3905 aSign = extractFloat16Sign(a); 3906 aExp = extractFloat16Exp(a); 3907 aSig = extractFloat16Frac(a); 3908 3909 if (aExp == 0x1f && ieee) { 3910 if (aSig) { 3911 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3912 } 3913 return packFloat32(aSign, 0xff, 0); 3914 } 3915 if (aExp == 0) { 3916 if (aSig == 0) { 3917 return packFloat32(aSign, 0, 0); 3918 } 3919 3920 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3921 aExp--; 3922 } 3923 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3924 } 3925 3926 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3927 { 3928 flag aSign; 3929 int aExp; 3930 uint32_t aSig; 3931 3932 a = float32_squash_input_denormal(a, status); 3933 3934 aSig = extractFloat32Frac( a ); 3935 aExp = extractFloat32Exp( a ); 3936 aSign = extractFloat32Sign( a ); 3937 if ( aExp == 0xFF ) { 3938 if (aSig) { 3939 /* Input is a NaN */ 3940 if (!ieee) { 3941 float_raise(float_flag_invalid, status); 3942 return packFloat16(aSign, 0, 0); 3943 } 3944 return commonNaNToFloat16( 3945 float32ToCommonNaN(a, status), status); 3946 } 3947 /* Infinity */ 3948 if (!ieee) { 3949 float_raise(float_flag_invalid, status); 3950 return packFloat16(aSign, 0x1f, 0x3ff); 3951 } 3952 return packFloat16(aSign, 0x1f, 0); 3953 } 3954 if (aExp == 0 && aSig == 0) { 3955 return packFloat16(aSign, 0, 0); 3956 } 3957 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3958 * even if the input is denormal; however this is harmless because 3959 * the largest possible single-precision denormal is still smaller 3960 * than the smallest representable half-precision denormal, and so we 3961 * will end up ignoring aSig and returning via the "always return zero" 3962 * codepath. 3963 */ 3964 aSig |= 0x00800000; 3965 aExp -= 0x71; 3966 3967 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3968 } 3969 3970 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3971 { 3972 flag aSign; 3973 int aExp; 3974 uint32_t aSig; 3975 3976 aSign = extractFloat16Sign(a); 3977 aExp = extractFloat16Exp(a); 3978 aSig = extractFloat16Frac(a); 3979 3980 if (aExp == 0x1f && ieee) { 3981 if (aSig) { 3982 return commonNaNToFloat64( 3983 float16ToCommonNaN(a, status), status); 3984 } 3985 return packFloat64(aSign, 0x7ff, 0); 3986 } 3987 if (aExp == 0) { 3988 if (aSig == 0) { 3989 return packFloat64(aSign, 0, 0); 3990 } 3991 3992 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3993 aExp--; 3994 } 3995 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3996 } 3997 3998 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3999 { 4000 flag aSign; 4001 int aExp; 4002 uint64_t aSig; 4003 uint32_t zSig; 4004 4005 a = float64_squash_input_denormal(a, status); 4006 4007 aSig = extractFloat64Frac(a); 4008 aExp = extractFloat64Exp(a); 4009 aSign = extractFloat64Sign(a); 4010 if (aExp == 0x7FF) { 4011 if (aSig) { 4012 /* Input is a NaN */ 4013 if (!ieee) { 4014 float_raise(float_flag_invalid, status); 4015 return packFloat16(aSign, 0, 0); 4016 } 4017 return commonNaNToFloat16( 4018 float64ToCommonNaN(a, status), status); 4019 } 4020 /* Infinity */ 4021 if (!ieee) { 4022 float_raise(float_flag_invalid, status); 4023 return packFloat16(aSign, 0x1f, 0x3ff); 4024 } 4025 return packFloat16(aSign, 0x1f, 0); 4026 } 4027 shift64RightJamming(aSig, 29, &aSig); 4028 zSig = aSig; 4029 if (aExp == 0 && zSig == 0) { 4030 return packFloat16(aSign, 0, 0); 4031 } 4032 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4033 * even if the input is denormal; however this is harmless because 4034 * the largest possible single-precision denormal is still smaller 4035 * than the smallest representable half-precision denormal, and so we 4036 * will end up ignoring aSig and returning via the "always return zero" 4037 * codepath. 4038 */ 4039 zSig |= 0x00800000; 4040 aExp -= 0x3F1; 4041 4042 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4043 } 4044 4045 /*---------------------------------------------------------------------------- 4046 | Returns the result of converting the double-precision floating-point value 4047 | `a' to the extended double-precision floating-point format. The conversion 4048 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4049 | Arithmetic. 4050 *----------------------------------------------------------------------------*/ 4051 4052 floatx80 float64_to_floatx80(float64 a, float_status *status) 4053 { 4054 flag aSign; 4055 int aExp; 4056 uint64_t aSig; 4057 4058 a = float64_squash_input_denormal(a, status); 4059 aSig = extractFloat64Frac( a ); 4060 aExp = extractFloat64Exp( a ); 4061 aSign = extractFloat64Sign( a ); 4062 if ( aExp == 0x7FF ) { 4063 if (aSig) { 4064 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4065 } 4066 return packFloatx80(aSign, 4067 floatx80_infinity_high, 4068 floatx80_infinity_low); 4069 } 4070 if ( aExp == 0 ) { 4071 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4072 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4073 } 4074 return 4075 packFloatx80( 4076 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4077 4078 } 4079 4080 /*---------------------------------------------------------------------------- 4081 | Returns the result of converting the double-precision floating-point value 4082 | `a' to the quadruple-precision floating-point format. The conversion is 4083 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4084 | Arithmetic. 4085 *----------------------------------------------------------------------------*/ 4086 4087 float128 float64_to_float128(float64 a, float_status *status) 4088 { 4089 flag aSign; 4090 int aExp; 4091 uint64_t aSig, zSig0, zSig1; 4092 4093 a = float64_squash_input_denormal(a, status); 4094 aSig = extractFloat64Frac( a ); 4095 aExp = extractFloat64Exp( a ); 4096 aSign = extractFloat64Sign( a ); 4097 if ( aExp == 0x7FF ) { 4098 if (aSig) { 4099 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4100 } 4101 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4102 } 4103 if ( aExp == 0 ) { 4104 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4105 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4106 --aExp; 4107 } 4108 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4109 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4110 4111 } 4112 4113 4114 /*---------------------------------------------------------------------------- 4115 | Returns the remainder of the double-precision floating-point value `a' 4116 | with respect to the corresponding value `b'. The operation is performed 4117 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4118 *----------------------------------------------------------------------------*/ 4119 4120 float64 float64_rem(float64 a, float64 b, float_status *status) 4121 { 4122 flag aSign, zSign; 4123 int aExp, bExp, expDiff; 4124 uint64_t aSig, bSig; 4125 uint64_t q, alternateASig; 4126 int64_t sigMean; 4127 4128 a = float64_squash_input_denormal(a, status); 4129 b = float64_squash_input_denormal(b, status); 4130 aSig = extractFloat64Frac( a ); 4131 aExp = extractFloat64Exp( a ); 4132 aSign = extractFloat64Sign( a ); 4133 bSig = extractFloat64Frac( b ); 4134 bExp = extractFloat64Exp( b ); 4135 if ( aExp == 0x7FF ) { 4136 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4137 return propagateFloat64NaN(a, b, status); 4138 } 4139 float_raise(float_flag_invalid, status); 4140 return float64_default_nan(status); 4141 } 4142 if ( bExp == 0x7FF ) { 4143 if (bSig) { 4144 return propagateFloat64NaN(a, b, status); 4145 } 4146 return a; 4147 } 4148 if ( bExp == 0 ) { 4149 if ( bSig == 0 ) { 4150 float_raise(float_flag_invalid, status); 4151 return float64_default_nan(status); 4152 } 4153 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4154 } 4155 if ( aExp == 0 ) { 4156 if ( aSig == 0 ) return a; 4157 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4158 } 4159 expDiff = aExp - bExp; 4160 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4161 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4162 if ( expDiff < 0 ) { 4163 if ( expDiff < -1 ) return a; 4164 aSig >>= 1; 4165 } 4166 q = ( bSig <= aSig ); 4167 if ( q ) aSig -= bSig; 4168 expDiff -= 64; 4169 while ( 0 < expDiff ) { 4170 q = estimateDiv128To64( aSig, 0, bSig ); 4171 q = ( 2 < q ) ? q - 2 : 0; 4172 aSig = - ( ( bSig>>2 ) * q ); 4173 expDiff -= 62; 4174 } 4175 expDiff += 64; 4176 if ( 0 < expDiff ) { 4177 q = estimateDiv128To64( aSig, 0, bSig ); 4178 q = ( 2 < q ) ? q - 2 : 0; 4179 q >>= 64 - expDiff; 4180 bSig >>= 2; 4181 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4182 } 4183 else { 4184 aSig >>= 2; 4185 bSig >>= 2; 4186 } 4187 do { 4188 alternateASig = aSig; 4189 ++q; 4190 aSig -= bSig; 4191 } while ( 0 <= (int64_t) aSig ); 4192 sigMean = aSig + alternateASig; 4193 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4194 aSig = alternateASig; 4195 } 4196 zSign = ( (int64_t) aSig < 0 ); 4197 if ( zSign ) aSig = - aSig; 4198 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4199 4200 } 4201 4202 /*---------------------------------------------------------------------------- 4203 | Returns the binary log of the double-precision floating-point value `a'. 4204 | The operation is performed according to the IEC/IEEE Standard for Binary 4205 | Floating-Point Arithmetic. 4206 *----------------------------------------------------------------------------*/ 4207 float64 float64_log2(float64 a, float_status *status) 4208 { 4209 flag aSign, zSign; 4210 int aExp; 4211 uint64_t aSig, aSig0, aSig1, zSig, i; 4212 a = float64_squash_input_denormal(a, status); 4213 4214 aSig = extractFloat64Frac( a ); 4215 aExp = extractFloat64Exp( a ); 4216 aSign = extractFloat64Sign( a ); 4217 4218 if ( aExp == 0 ) { 4219 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4220 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4221 } 4222 if ( aSign ) { 4223 float_raise(float_flag_invalid, status); 4224 return float64_default_nan(status); 4225 } 4226 if ( aExp == 0x7FF ) { 4227 if (aSig) { 4228 return propagateFloat64NaN(a, float64_zero, status); 4229 } 4230 return a; 4231 } 4232 4233 aExp -= 0x3FF; 4234 aSig |= LIT64( 0x0010000000000000 ); 4235 zSign = aExp < 0; 4236 zSig = (uint64_t)aExp << 52; 4237 for (i = 1LL << 51; i > 0; i >>= 1) { 4238 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4239 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4240 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4241 aSig >>= 1; 4242 zSig |= i; 4243 } 4244 } 4245 4246 if ( zSign ) 4247 zSig = -zSig; 4248 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4249 } 4250 4251 /*---------------------------------------------------------------------------- 4252 | Returns 1 if the double-precision floating-point value `a' is equal to the 4253 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4254 | if either operand is a NaN. Otherwise, the comparison is performed 4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4256 *----------------------------------------------------------------------------*/ 4257 4258 int float64_eq(float64 a, float64 b, float_status *status) 4259 { 4260 uint64_t av, bv; 4261 a = float64_squash_input_denormal(a, status); 4262 b = float64_squash_input_denormal(b, status); 4263 4264 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4265 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4266 ) { 4267 float_raise(float_flag_invalid, status); 4268 return 0; 4269 } 4270 av = float64_val(a); 4271 bv = float64_val(b); 4272 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4273 4274 } 4275 4276 /*---------------------------------------------------------------------------- 4277 | Returns 1 if the double-precision floating-point value `a' is less than or 4278 | equal to the corresponding value `b', and 0 otherwise. The invalid 4279 | exception is raised if either operand is a NaN. The comparison is performed 4280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4281 *----------------------------------------------------------------------------*/ 4282 4283 int float64_le(float64 a, float64 b, float_status *status) 4284 { 4285 flag aSign, bSign; 4286 uint64_t av, bv; 4287 a = float64_squash_input_denormal(a, status); 4288 b = float64_squash_input_denormal(b, status); 4289 4290 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4291 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4292 ) { 4293 float_raise(float_flag_invalid, status); 4294 return 0; 4295 } 4296 aSign = extractFloat64Sign( a ); 4297 bSign = extractFloat64Sign( b ); 4298 av = float64_val(a); 4299 bv = float64_val(b); 4300 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4301 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4302 4303 } 4304 4305 /*---------------------------------------------------------------------------- 4306 | Returns 1 if the double-precision floating-point value `a' is less than 4307 | the corresponding value `b', and 0 otherwise. The invalid exception is 4308 | raised if either operand is a NaN. The comparison is performed according 4309 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4310 *----------------------------------------------------------------------------*/ 4311 4312 int float64_lt(float64 a, float64 b, float_status *status) 4313 { 4314 flag aSign, bSign; 4315 uint64_t av, bv; 4316 4317 a = float64_squash_input_denormal(a, status); 4318 b = float64_squash_input_denormal(b, status); 4319 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4320 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4321 ) { 4322 float_raise(float_flag_invalid, status); 4323 return 0; 4324 } 4325 aSign = extractFloat64Sign( a ); 4326 bSign = extractFloat64Sign( b ); 4327 av = float64_val(a); 4328 bv = float64_val(b); 4329 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4330 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4331 4332 } 4333 4334 /*---------------------------------------------------------------------------- 4335 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4336 | be compared, and 0 otherwise. The invalid exception is raised if either 4337 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4338 | Standard for Binary Floating-Point Arithmetic. 4339 *----------------------------------------------------------------------------*/ 4340 4341 int float64_unordered(float64 a, float64 b, float_status *status) 4342 { 4343 a = float64_squash_input_denormal(a, status); 4344 b = float64_squash_input_denormal(b, status); 4345 4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4348 ) { 4349 float_raise(float_flag_invalid, status); 4350 return 1; 4351 } 4352 return 0; 4353 } 4354 4355 /*---------------------------------------------------------------------------- 4356 | Returns 1 if the double-precision floating-point value `a' is equal to the 4357 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4358 | exception.The comparison is performed according to the IEC/IEEE Standard 4359 | for Binary Floating-Point Arithmetic. 4360 *----------------------------------------------------------------------------*/ 4361 4362 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4363 { 4364 uint64_t av, bv; 4365 a = float64_squash_input_denormal(a, status); 4366 b = float64_squash_input_denormal(b, status); 4367 4368 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4369 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4370 ) { 4371 if (float64_is_signaling_nan(a, status) 4372 || float64_is_signaling_nan(b, status)) { 4373 float_raise(float_flag_invalid, status); 4374 } 4375 return 0; 4376 } 4377 av = float64_val(a); 4378 bv = float64_val(b); 4379 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4380 4381 } 4382 4383 /*---------------------------------------------------------------------------- 4384 | Returns 1 if the double-precision floating-point value `a' is less than or 4385 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4386 | cause an exception. Otherwise, the comparison is performed according to the 4387 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4388 *----------------------------------------------------------------------------*/ 4389 4390 int float64_le_quiet(float64 a, float64 b, float_status *status) 4391 { 4392 flag aSign, bSign; 4393 uint64_t av, bv; 4394 a = float64_squash_input_denormal(a, status); 4395 b = float64_squash_input_denormal(b, status); 4396 4397 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4398 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4399 ) { 4400 if (float64_is_signaling_nan(a, status) 4401 || float64_is_signaling_nan(b, status)) { 4402 float_raise(float_flag_invalid, status); 4403 } 4404 return 0; 4405 } 4406 aSign = extractFloat64Sign( a ); 4407 bSign = extractFloat64Sign( b ); 4408 av = float64_val(a); 4409 bv = float64_val(b); 4410 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4411 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4412 4413 } 4414 4415 /*---------------------------------------------------------------------------- 4416 | Returns 1 if the double-precision floating-point value `a' is less than 4417 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4418 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4419 | Standard for Binary Floating-Point Arithmetic. 4420 *----------------------------------------------------------------------------*/ 4421 4422 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4423 { 4424 flag aSign, bSign; 4425 uint64_t av, bv; 4426 a = float64_squash_input_denormal(a, status); 4427 b = float64_squash_input_denormal(b, status); 4428 4429 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4430 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4431 ) { 4432 if (float64_is_signaling_nan(a, status) 4433 || float64_is_signaling_nan(b, status)) { 4434 float_raise(float_flag_invalid, status); 4435 } 4436 return 0; 4437 } 4438 aSign = extractFloat64Sign( a ); 4439 bSign = extractFloat64Sign( b ); 4440 av = float64_val(a); 4441 bv = float64_val(b); 4442 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4443 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4444 4445 } 4446 4447 /*---------------------------------------------------------------------------- 4448 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4449 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4450 | comparison is performed according to the IEC/IEEE Standard for Binary 4451 | Floating-Point Arithmetic. 4452 *----------------------------------------------------------------------------*/ 4453 4454 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4455 { 4456 a = float64_squash_input_denormal(a, status); 4457 b = float64_squash_input_denormal(b, status); 4458 4459 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4460 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4461 ) { 4462 if (float64_is_signaling_nan(a, status) 4463 || float64_is_signaling_nan(b, status)) { 4464 float_raise(float_flag_invalid, status); 4465 } 4466 return 1; 4467 } 4468 return 0; 4469 } 4470 4471 /*---------------------------------------------------------------------------- 4472 | Returns the result of converting the extended double-precision floating- 4473 | point value `a' to the 32-bit two's complement integer format. The 4474 | conversion is performed according to the IEC/IEEE Standard for Binary 4475 | Floating-Point Arithmetic---which means in particular that the conversion 4476 | is rounded according to the current rounding mode. If `a' is a NaN, the 4477 | largest positive integer is returned. Otherwise, if the conversion 4478 | overflows, the largest integer with the same sign as `a' is returned. 4479 *----------------------------------------------------------------------------*/ 4480 4481 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4482 { 4483 flag aSign; 4484 int32_t aExp, shiftCount; 4485 uint64_t aSig; 4486 4487 if (floatx80_invalid_encoding(a)) { 4488 float_raise(float_flag_invalid, status); 4489 return 1 << 31; 4490 } 4491 aSig = extractFloatx80Frac( a ); 4492 aExp = extractFloatx80Exp( a ); 4493 aSign = extractFloatx80Sign( a ); 4494 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4495 shiftCount = 0x4037 - aExp; 4496 if ( shiftCount <= 0 ) shiftCount = 1; 4497 shift64RightJamming( aSig, shiftCount, &aSig ); 4498 return roundAndPackInt32(aSign, aSig, status); 4499 4500 } 4501 4502 /*---------------------------------------------------------------------------- 4503 | Returns the result of converting the extended double-precision floating- 4504 | point value `a' to the 32-bit two's complement integer format. The 4505 | conversion is performed according to the IEC/IEEE Standard for Binary 4506 | Floating-Point Arithmetic, except that the conversion is always rounded 4507 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4508 | Otherwise, if the conversion overflows, the largest integer with the same 4509 | sign as `a' is returned. 4510 *----------------------------------------------------------------------------*/ 4511 4512 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4513 { 4514 flag aSign; 4515 int32_t aExp, shiftCount; 4516 uint64_t aSig, savedASig; 4517 int32_t z; 4518 4519 if (floatx80_invalid_encoding(a)) { 4520 float_raise(float_flag_invalid, status); 4521 return 1 << 31; 4522 } 4523 aSig = extractFloatx80Frac( a ); 4524 aExp = extractFloatx80Exp( a ); 4525 aSign = extractFloatx80Sign( a ); 4526 if ( 0x401E < aExp ) { 4527 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4528 goto invalid; 4529 } 4530 else if ( aExp < 0x3FFF ) { 4531 if (aExp || aSig) { 4532 status->float_exception_flags |= float_flag_inexact; 4533 } 4534 return 0; 4535 } 4536 shiftCount = 0x403E - aExp; 4537 savedASig = aSig; 4538 aSig >>= shiftCount; 4539 z = aSig; 4540 if ( aSign ) z = - z; 4541 if ( ( z < 0 ) ^ aSign ) { 4542 invalid: 4543 float_raise(float_flag_invalid, status); 4544 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4545 } 4546 if ( ( aSig<<shiftCount ) != savedASig ) { 4547 status->float_exception_flags |= float_flag_inexact; 4548 } 4549 return z; 4550 4551 } 4552 4553 /*---------------------------------------------------------------------------- 4554 | Returns the result of converting the extended double-precision floating- 4555 | point value `a' to the 64-bit two's complement integer format. The 4556 | conversion is performed according to the IEC/IEEE Standard for Binary 4557 | Floating-Point Arithmetic---which means in particular that the conversion 4558 | is rounded according to the current rounding mode. If `a' is a NaN, 4559 | the largest positive integer is returned. Otherwise, if the conversion 4560 | overflows, the largest integer with the same sign as `a' is returned. 4561 *----------------------------------------------------------------------------*/ 4562 4563 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4564 { 4565 flag aSign; 4566 int32_t aExp, shiftCount; 4567 uint64_t aSig, aSigExtra; 4568 4569 if (floatx80_invalid_encoding(a)) { 4570 float_raise(float_flag_invalid, status); 4571 return 1ULL << 63; 4572 } 4573 aSig = extractFloatx80Frac( a ); 4574 aExp = extractFloatx80Exp( a ); 4575 aSign = extractFloatx80Sign( a ); 4576 shiftCount = 0x403E - aExp; 4577 if ( shiftCount <= 0 ) { 4578 if ( shiftCount ) { 4579 float_raise(float_flag_invalid, status); 4580 if (!aSign || floatx80_is_any_nan(a)) { 4581 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4582 } 4583 return (int64_t) LIT64( 0x8000000000000000 ); 4584 } 4585 aSigExtra = 0; 4586 } 4587 else { 4588 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4589 } 4590 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4591 4592 } 4593 4594 /*---------------------------------------------------------------------------- 4595 | Returns the result of converting the extended double-precision floating- 4596 | point value `a' to the 64-bit two's complement integer format. The 4597 | conversion is performed according to the IEC/IEEE Standard for Binary 4598 | Floating-Point Arithmetic, except that the conversion is always rounded 4599 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4600 | Otherwise, if the conversion overflows, the largest integer with the same 4601 | sign as `a' is returned. 4602 *----------------------------------------------------------------------------*/ 4603 4604 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4605 { 4606 flag aSign; 4607 int32_t aExp, shiftCount; 4608 uint64_t aSig; 4609 int64_t z; 4610 4611 if (floatx80_invalid_encoding(a)) { 4612 float_raise(float_flag_invalid, status); 4613 return 1ULL << 63; 4614 } 4615 aSig = extractFloatx80Frac( a ); 4616 aExp = extractFloatx80Exp( a ); 4617 aSign = extractFloatx80Sign( a ); 4618 shiftCount = aExp - 0x403E; 4619 if ( 0 <= shiftCount ) { 4620 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4621 if ( ( a.high != 0xC03E ) || aSig ) { 4622 float_raise(float_flag_invalid, status); 4623 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4624 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4625 } 4626 } 4627 return (int64_t) LIT64( 0x8000000000000000 ); 4628 } 4629 else if ( aExp < 0x3FFF ) { 4630 if (aExp | aSig) { 4631 status->float_exception_flags |= float_flag_inexact; 4632 } 4633 return 0; 4634 } 4635 z = aSig>>( - shiftCount ); 4636 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4637 status->float_exception_flags |= float_flag_inexact; 4638 } 4639 if ( aSign ) z = - z; 4640 return z; 4641 4642 } 4643 4644 /*---------------------------------------------------------------------------- 4645 | Returns the result of converting the extended double-precision floating- 4646 | point value `a' to the single-precision floating-point format. The 4647 | conversion is performed according to the IEC/IEEE Standard for Binary 4648 | Floating-Point Arithmetic. 4649 *----------------------------------------------------------------------------*/ 4650 4651 float32 floatx80_to_float32(floatx80 a, float_status *status) 4652 { 4653 flag aSign; 4654 int32_t aExp; 4655 uint64_t aSig; 4656 4657 if (floatx80_invalid_encoding(a)) { 4658 float_raise(float_flag_invalid, status); 4659 return float32_default_nan(status); 4660 } 4661 aSig = extractFloatx80Frac( a ); 4662 aExp = extractFloatx80Exp( a ); 4663 aSign = extractFloatx80Sign( a ); 4664 if ( aExp == 0x7FFF ) { 4665 if ( (uint64_t) ( aSig<<1 ) ) { 4666 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4667 } 4668 return packFloat32( aSign, 0xFF, 0 ); 4669 } 4670 shift64RightJamming( aSig, 33, &aSig ); 4671 if ( aExp || aSig ) aExp -= 0x3F81; 4672 return roundAndPackFloat32(aSign, aExp, aSig, status); 4673 4674 } 4675 4676 /*---------------------------------------------------------------------------- 4677 | Returns the result of converting the extended double-precision floating- 4678 | point value `a' to the double-precision floating-point format. The 4679 | conversion is performed according to the IEC/IEEE Standard for Binary 4680 | Floating-Point Arithmetic. 4681 *----------------------------------------------------------------------------*/ 4682 4683 float64 floatx80_to_float64(floatx80 a, float_status *status) 4684 { 4685 flag aSign; 4686 int32_t aExp; 4687 uint64_t aSig, zSig; 4688 4689 if (floatx80_invalid_encoding(a)) { 4690 float_raise(float_flag_invalid, status); 4691 return float64_default_nan(status); 4692 } 4693 aSig = extractFloatx80Frac( a ); 4694 aExp = extractFloatx80Exp( a ); 4695 aSign = extractFloatx80Sign( a ); 4696 if ( aExp == 0x7FFF ) { 4697 if ( (uint64_t) ( aSig<<1 ) ) { 4698 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4699 } 4700 return packFloat64( aSign, 0x7FF, 0 ); 4701 } 4702 shift64RightJamming( aSig, 1, &zSig ); 4703 if ( aExp || aSig ) aExp -= 0x3C01; 4704 return roundAndPackFloat64(aSign, aExp, zSig, status); 4705 4706 } 4707 4708 /*---------------------------------------------------------------------------- 4709 | Returns the result of converting the extended double-precision floating- 4710 | point value `a' to the quadruple-precision floating-point format. The 4711 | conversion is performed according to the IEC/IEEE Standard for Binary 4712 | Floating-Point Arithmetic. 4713 *----------------------------------------------------------------------------*/ 4714 4715 float128 floatx80_to_float128(floatx80 a, float_status *status) 4716 { 4717 flag aSign; 4718 int aExp; 4719 uint64_t aSig, zSig0, zSig1; 4720 4721 if (floatx80_invalid_encoding(a)) { 4722 float_raise(float_flag_invalid, status); 4723 return float128_default_nan(status); 4724 } 4725 aSig = extractFloatx80Frac( a ); 4726 aExp = extractFloatx80Exp( a ); 4727 aSign = extractFloatx80Sign( a ); 4728 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4729 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4730 } 4731 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4732 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4733 4734 } 4735 4736 /*---------------------------------------------------------------------------- 4737 | Rounds the extended double-precision floating-point value `a' 4738 | to the precision provided by floatx80_rounding_precision and returns the 4739 | result as an extended double-precision floating-point value. 4740 | The operation is performed according to the IEC/IEEE Standard for Binary 4741 | Floating-Point Arithmetic. 4742 *----------------------------------------------------------------------------*/ 4743 4744 floatx80 floatx80_round(floatx80 a, float_status *status) 4745 { 4746 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4747 extractFloatx80Sign(a), 4748 extractFloatx80Exp(a), 4749 extractFloatx80Frac(a), 0, status); 4750 } 4751 4752 /*---------------------------------------------------------------------------- 4753 | Rounds the extended double-precision floating-point value `a' to an integer, 4754 | and returns the result as an extended quadruple-precision floating-point 4755 | value. The operation is performed according to the IEC/IEEE Standard for 4756 | Binary Floating-Point Arithmetic. 4757 *----------------------------------------------------------------------------*/ 4758 4759 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4760 { 4761 flag aSign; 4762 int32_t aExp; 4763 uint64_t lastBitMask, roundBitsMask; 4764 floatx80 z; 4765 4766 if (floatx80_invalid_encoding(a)) { 4767 float_raise(float_flag_invalid, status); 4768 return floatx80_default_nan(status); 4769 } 4770 aExp = extractFloatx80Exp( a ); 4771 if ( 0x403E <= aExp ) { 4772 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4773 return propagateFloatx80NaN(a, a, status); 4774 } 4775 return a; 4776 } 4777 if ( aExp < 0x3FFF ) { 4778 if ( ( aExp == 0 ) 4779 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4780 return a; 4781 } 4782 status->float_exception_flags |= float_flag_inexact; 4783 aSign = extractFloatx80Sign( a ); 4784 switch (status->float_rounding_mode) { 4785 case float_round_nearest_even: 4786 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4787 ) { 4788 return 4789 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4790 } 4791 break; 4792 case float_round_ties_away: 4793 if (aExp == 0x3FFE) { 4794 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4795 } 4796 break; 4797 case float_round_down: 4798 return 4799 aSign ? 4800 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4801 : packFloatx80( 0, 0, 0 ); 4802 case float_round_up: 4803 return 4804 aSign ? packFloatx80( 1, 0, 0 ) 4805 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4806 } 4807 return packFloatx80( aSign, 0, 0 ); 4808 } 4809 lastBitMask = 1; 4810 lastBitMask <<= 0x403E - aExp; 4811 roundBitsMask = lastBitMask - 1; 4812 z = a; 4813 switch (status->float_rounding_mode) { 4814 case float_round_nearest_even: 4815 z.low += lastBitMask>>1; 4816 if ((z.low & roundBitsMask) == 0) { 4817 z.low &= ~lastBitMask; 4818 } 4819 break; 4820 case float_round_ties_away: 4821 z.low += lastBitMask >> 1; 4822 break; 4823 case float_round_to_zero: 4824 break; 4825 case float_round_up: 4826 if (!extractFloatx80Sign(z)) { 4827 z.low += roundBitsMask; 4828 } 4829 break; 4830 case float_round_down: 4831 if (extractFloatx80Sign(z)) { 4832 z.low += roundBitsMask; 4833 } 4834 break; 4835 default: 4836 abort(); 4837 } 4838 z.low &= ~ roundBitsMask; 4839 if ( z.low == 0 ) { 4840 ++z.high; 4841 z.low = LIT64( 0x8000000000000000 ); 4842 } 4843 if (z.low != a.low) { 4844 status->float_exception_flags |= float_flag_inexact; 4845 } 4846 return z; 4847 4848 } 4849 4850 /*---------------------------------------------------------------------------- 4851 | Returns the result of adding the absolute values of the extended double- 4852 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4853 | negated before being returned. `zSign' is ignored if the result is a NaN. 4854 | The addition is performed according to the IEC/IEEE Standard for Binary 4855 | Floating-Point Arithmetic. 4856 *----------------------------------------------------------------------------*/ 4857 4858 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4859 float_status *status) 4860 { 4861 int32_t aExp, bExp, zExp; 4862 uint64_t aSig, bSig, zSig0, zSig1; 4863 int32_t expDiff; 4864 4865 aSig = extractFloatx80Frac( a ); 4866 aExp = extractFloatx80Exp( a ); 4867 bSig = extractFloatx80Frac( b ); 4868 bExp = extractFloatx80Exp( b ); 4869 expDiff = aExp - bExp; 4870 if ( 0 < expDiff ) { 4871 if ( aExp == 0x7FFF ) { 4872 if ((uint64_t)(aSig << 1)) { 4873 return propagateFloatx80NaN(a, b, status); 4874 } 4875 return a; 4876 } 4877 if ( bExp == 0 ) --expDiff; 4878 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4879 zExp = aExp; 4880 } 4881 else if ( expDiff < 0 ) { 4882 if ( bExp == 0x7FFF ) { 4883 if ((uint64_t)(bSig << 1)) { 4884 return propagateFloatx80NaN(a, b, status); 4885 } 4886 return packFloatx80(zSign, 4887 floatx80_infinity_high, 4888 floatx80_infinity_low); 4889 } 4890 if ( aExp == 0 ) ++expDiff; 4891 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4892 zExp = bExp; 4893 } 4894 else { 4895 if ( aExp == 0x7FFF ) { 4896 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4897 return propagateFloatx80NaN(a, b, status); 4898 } 4899 return a; 4900 } 4901 zSig1 = 0; 4902 zSig0 = aSig + bSig; 4903 if ( aExp == 0 ) { 4904 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4905 goto roundAndPack; 4906 } 4907 zExp = aExp; 4908 goto shiftRight1; 4909 } 4910 zSig0 = aSig + bSig; 4911 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4912 shiftRight1: 4913 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4914 zSig0 |= LIT64( 0x8000000000000000 ); 4915 ++zExp; 4916 roundAndPack: 4917 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4918 zSign, zExp, zSig0, zSig1, status); 4919 } 4920 4921 /*---------------------------------------------------------------------------- 4922 | Returns the result of subtracting the absolute values of the extended 4923 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4924 | difference is negated before being returned. `zSign' is ignored if the 4925 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4926 | Standard for Binary Floating-Point Arithmetic. 4927 *----------------------------------------------------------------------------*/ 4928 4929 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4930 float_status *status) 4931 { 4932 int32_t aExp, bExp, zExp; 4933 uint64_t aSig, bSig, zSig0, zSig1; 4934 int32_t expDiff; 4935 4936 aSig = extractFloatx80Frac( a ); 4937 aExp = extractFloatx80Exp( a ); 4938 bSig = extractFloatx80Frac( b ); 4939 bExp = extractFloatx80Exp( b ); 4940 expDiff = aExp - bExp; 4941 if ( 0 < expDiff ) goto aExpBigger; 4942 if ( expDiff < 0 ) goto bExpBigger; 4943 if ( aExp == 0x7FFF ) { 4944 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4945 return propagateFloatx80NaN(a, b, status); 4946 } 4947 float_raise(float_flag_invalid, status); 4948 return floatx80_default_nan(status); 4949 } 4950 if ( aExp == 0 ) { 4951 aExp = 1; 4952 bExp = 1; 4953 } 4954 zSig1 = 0; 4955 if ( bSig < aSig ) goto aBigger; 4956 if ( aSig < bSig ) goto bBigger; 4957 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4958 bExpBigger: 4959 if ( bExp == 0x7FFF ) { 4960 if ((uint64_t)(bSig << 1)) { 4961 return propagateFloatx80NaN(a, b, status); 4962 } 4963 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 4964 floatx80_infinity_low); 4965 } 4966 if ( aExp == 0 ) ++expDiff; 4967 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4968 bBigger: 4969 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4970 zExp = bExp; 4971 zSign ^= 1; 4972 goto normalizeRoundAndPack; 4973 aExpBigger: 4974 if ( aExp == 0x7FFF ) { 4975 if ((uint64_t)(aSig << 1)) { 4976 return propagateFloatx80NaN(a, b, status); 4977 } 4978 return a; 4979 } 4980 if ( bExp == 0 ) --expDiff; 4981 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4982 aBigger: 4983 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4984 zExp = aExp; 4985 normalizeRoundAndPack: 4986 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4987 zSign, zExp, zSig0, zSig1, status); 4988 } 4989 4990 /*---------------------------------------------------------------------------- 4991 | Returns the result of adding the extended double-precision floating-point 4992 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4993 | Standard for Binary Floating-Point Arithmetic. 4994 *----------------------------------------------------------------------------*/ 4995 4996 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4997 { 4998 flag aSign, bSign; 4999 5000 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5001 float_raise(float_flag_invalid, status); 5002 return floatx80_default_nan(status); 5003 } 5004 aSign = extractFloatx80Sign( a ); 5005 bSign = extractFloatx80Sign( b ); 5006 if ( aSign == bSign ) { 5007 return addFloatx80Sigs(a, b, aSign, status); 5008 } 5009 else { 5010 return subFloatx80Sigs(a, b, aSign, status); 5011 } 5012 5013 } 5014 5015 /*---------------------------------------------------------------------------- 5016 | Returns the result of subtracting the extended double-precision floating- 5017 | point values `a' and `b'. The operation is performed according to the 5018 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5019 *----------------------------------------------------------------------------*/ 5020 5021 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5022 { 5023 flag aSign, bSign; 5024 5025 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5026 float_raise(float_flag_invalid, status); 5027 return floatx80_default_nan(status); 5028 } 5029 aSign = extractFloatx80Sign( a ); 5030 bSign = extractFloatx80Sign( b ); 5031 if ( aSign == bSign ) { 5032 return subFloatx80Sigs(a, b, aSign, status); 5033 } 5034 else { 5035 return addFloatx80Sigs(a, b, aSign, status); 5036 } 5037 5038 } 5039 5040 /*---------------------------------------------------------------------------- 5041 | Returns the result of multiplying the extended double-precision floating- 5042 | point values `a' and `b'. The operation is performed according to the 5043 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5044 *----------------------------------------------------------------------------*/ 5045 5046 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5047 { 5048 flag aSign, bSign, zSign; 5049 int32_t aExp, bExp, zExp; 5050 uint64_t aSig, bSig, zSig0, zSig1; 5051 5052 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5053 float_raise(float_flag_invalid, status); 5054 return floatx80_default_nan(status); 5055 } 5056 aSig = extractFloatx80Frac( a ); 5057 aExp = extractFloatx80Exp( a ); 5058 aSign = extractFloatx80Sign( a ); 5059 bSig = extractFloatx80Frac( b ); 5060 bExp = extractFloatx80Exp( b ); 5061 bSign = extractFloatx80Sign( b ); 5062 zSign = aSign ^ bSign; 5063 if ( aExp == 0x7FFF ) { 5064 if ( (uint64_t) ( aSig<<1 ) 5065 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5066 return propagateFloatx80NaN(a, b, status); 5067 } 5068 if ( ( bExp | bSig ) == 0 ) goto invalid; 5069 return packFloatx80(zSign, floatx80_infinity_high, 5070 floatx80_infinity_low); 5071 } 5072 if ( bExp == 0x7FFF ) { 5073 if ((uint64_t)(bSig << 1)) { 5074 return propagateFloatx80NaN(a, b, status); 5075 } 5076 if ( ( aExp | aSig ) == 0 ) { 5077 invalid: 5078 float_raise(float_flag_invalid, status); 5079 return floatx80_default_nan(status); 5080 } 5081 return packFloatx80(zSign, floatx80_infinity_high, 5082 floatx80_infinity_low); 5083 } 5084 if ( aExp == 0 ) { 5085 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5086 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5087 } 5088 if ( bExp == 0 ) { 5089 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5090 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5091 } 5092 zExp = aExp + bExp - 0x3FFE; 5093 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5094 if ( 0 < (int64_t) zSig0 ) { 5095 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5096 --zExp; 5097 } 5098 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5099 zSign, zExp, zSig0, zSig1, status); 5100 } 5101 5102 /*---------------------------------------------------------------------------- 5103 | Returns the result of dividing the extended double-precision floating-point 5104 | value `a' by the corresponding value `b'. The operation is performed 5105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5106 *----------------------------------------------------------------------------*/ 5107 5108 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5109 { 5110 flag aSign, bSign, zSign; 5111 int32_t aExp, bExp, zExp; 5112 uint64_t aSig, bSig, zSig0, zSig1; 5113 uint64_t rem0, rem1, rem2, term0, term1, term2; 5114 5115 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5116 float_raise(float_flag_invalid, status); 5117 return floatx80_default_nan(status); 5118 } 5119 aSig = extractFloatx80Frac( a ); 5120 aExp = extractFloatx80Exp( a ); 5121 aSign = extractFloatx80Sign( a ); 5122 bSig = extractFloatx80Frac( b ); 5123 bExp = extractFloatx80Exp( b ); 5124 bSign = extractFloatx80Sign( b ); 5125 zSign = aSign ^ bSign; 5126 if ( aExp == 0x7FFF ) { 5127 if ((uint64_t)(aSig << 1)) { 5128 return propagateFloatx80NaN(a, b, status); 5129 } 5130 if ( bExp == 0x7FFF ) { 5131 if ((uint64_t)(bSig << 1)) { 5132 return propagateFloatx80NaN(a, b, status); 5133 } 5134 goto invalid; 5135 } 5136 return packFloatx80(zSign, floatx80_infinity_high, 5137 floatx80_infinity_low); 5138 } 5139 if ( bExp == 0x7FFF ) { 5140 if ((uint64_t)(bSig << 1)) { 5141 return propagateFloatx80NaN(a, b, status); 5142 } 5143 return packFloatx80( zSign, 0, 0 ); 5144 } 5145 if ( bExp == 0 ) { 5146 if ( bSig == 0 ) { 5147 if ( ( aExp | aSig ) == 0 ) { 5148 invalid: 5149 float_raise(float_flag_invalid, status); 5150 return floatx80_default_nan(status); 5151 } 5152 float_raise(float_flag_divbyzero, status); 5153 return packFloatx80(zSign, floatx80_infinity_high, 5154 floatx80_infinity_low); 5155 } 5156 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5157 } 5158 if ( aExp == 0 ) { 5159 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5160 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5161 } 5162 zExp = aExp - bExp + 0x3FFE; 5163 rem1 = 0; 5164 if ( bSig <= aSig ) { 5165 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5166 ++zExp; 5167 } 5168 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5169 mul64To128( bSig, zSig0, &term0, &term1 ); 5170 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5171 while ( (int64_t) rem0 < 0 ) { 5172 --zSig0; 5173 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5174 } 5175 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5176 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5177 mul64To128( bSig, zSig1, &term1, &term2 ); 5178 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5179 while ( (int64_t) rem1 < 0 ) { 5180 --zSig1; 5181 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5182 } 5183 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5184 } 5185 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5186 zSign, zExp, zSig0, zSig1, status); 5187 } 5188 5189 /*---------------------------------------------------------------------------- 5190 | Returns the remainder of the extended double-precision floating-point value 5191 | `a' with respect to the corresponding value `b'. The operation is performed 5192 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5193 *----------------------------------------------------------------------------*/ 5194 5195 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5196 { 5197 flag aSign, zSign; 5198 int32_t aExp, bExp, expDiff; 5199 uint64_t aSig0, aSig1, bSig; 5200 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5201 5202 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5203 float_raise(float_flag_invalid, status); 5204 return floatx80_default_nan(status); 5205 } 5206 aSig0 = extractFloatx80Frac( a ); 5207 aExp = extractFloatx80Exp( a ); 5208 aSign = extractFloatx80Sign( a ); 5209 bSig = extractFloatx80Frac( b ); 5210 bExp = extractFloatx80Exp( b ); 5211 if ( aExp == 0x7FFF ) { 5212 if ( (uint64_t) ( aSig0<<1 ) 5213 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5214 return propagateFloatx80NaN(a, b, status); 5215 } 5216 goto invalid; 5217 } 5218 if ( bExp == 0x7FFF ) { 5219 if ((uint64_t)(bSig << 1)) { 5220 return propagateFloatx80NaN(a, b, status); 5221 } 5222 return a; 5223 } 5224 if ( bExp == 0 ) { 5225 if ( bSig == 0 ) { 5226 invalid: 5227 float_raise(float_flag_invalid, status); 5228 return floatx80_default_nan(status); 5229 } 5230 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5231 } 5232 if ( aExp == 0 ) { 5233 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5234 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5235 } 5236 bSig |= LIT64( 0x8000000000000000 ); 5237 zSign = aSign; 5238 expDiff = aExp - bExp; 5239 aSig1 = 0; 5240 if ( expDiff < 0 ) { 5241 if ( expDiff < -1 ) return a; 5242 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5243 expDiff = 0; 5244 } 5245 q = ( bSig <= aSig0 ); 5246 if ( q ) aSig0 -= bSig; 5247 expDiff -= 64; 5248 while ( 0 < expDiff ) { 5249 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5250 q = ( 2 < q ) ? q - 2 : 0; 5251 mul64To128( bSig, q, &term0, &term1 ); 5252 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5253 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5254 expDiff -= 62; 5255 } 5256 expDiff += 64; 5257 if ( 0 < expDiff ) { 5258 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5259 q = ( 2 < q ) ? q - 2 : 0; 5260 q >>= 64 - expDiff; 5261 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5262 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5263 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5264 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5265 ++q; 5266 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5267 } 5268 } 5269 else { 5270 term1 = 0; 5271 term0 = bSig; 5272 } 5273 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5274 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5275 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5276 && ( q & 1 ) ) 5277 ) { 5278 aSig0 = alternateASig0; 5279 aSig1 = alternateASig1; 5280 zSign = ! zSign; 5281 } 5282 return 5283 normalizeRoundAndPackFloatx80( 5284 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5285 5286 } 5287 5288 /*---------------------------------------------------------------------------- 5289 | Returns the square root of the extended double-precision floating-point 5290 | value `a'. The operation is performed according to the IEC/IEEE Standard 5291 | for Binary Floating-Point Arithmetic. 5292 *----------------------------------------------------------------------------*/ 5293 5294 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5295 { 5296 flag aSign; 5297 int32_t aExp, zExp; 5298 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5299 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5300 5301 if (floatx80_invalid_encoding(a)) { 5302 float_raise(float_flag_invalid, status); 5303 return floatx80_default_nan(status); 5304 } 5305 aSig0 = extractFloatx80Frac( a ); 5306 aExp = extractFloatx80Exp( a ); 5307 aSign = extractFloatx80Sign( a ); 5308 if ( aExp == 0x7FFF ) { 5309 if ((uint64_t)(aSig0 << 1)) { 5310 return propagateFloatx80NaN(a, a, status); 5311 } 5312 if ( ! aSign ) return a; 5313 goto invalid; 5314 } 5315 if ( aSign ) { 5316 if ( ( aExp | aSig0 ) == 0 ) return a; 5317 invalid: 5318 float_raise(float_flag_invalid, status); 5319 return floatx80_default_nan(status); 5320 } 5321 if ( aExp == 0 ) { 5322 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5323 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5324 } 5325 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5326 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5327 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5328 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5329 doubleZSig0 = zSig0<<1; 5330 mul64To128( zSig0, zSig0, &term0, &term1 ); 5331 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5332 while ( (int64_t) rem0 < 0 ) { 5333 --zSig0; 5334 doubleZSig0 -= 2; 5335 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5336 } 5337 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5338 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5339 if ( zSig1 == 0 ) zSig1 = 1; 5340 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5341 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5342 mul64To128( zSig1, zSig1, &term2, &term3 ); 5343 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5344 while ( (int64_t) rem1 < 0 ) { 5345 --zSig1; 5346 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5347 term3 |= 1; 5348 term2 |= doubleZSig0; 5349 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5350 } 5351 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5352 } 5353 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5354 zSig0 |= doubleZSig0; 5355 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5356 0, zExp, zSig0, zSig1, status); 5357 } 5358 5359 /*---------------------------------------------------------------------------- 5360 | Returns 1 if the extended double-precision floating-point value `a' is equal 5361 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5362 | raised if either operand is a NaN. Otherwise, the comparison is performed 5363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5364 *----------------------------------------------------------------------------*/ 5365 5366 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5367 { 5368 5369 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5370 || (extractFloatx80Exp(a) == 0x7FFF 5371 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5372 || (extractFloatx80Exp(b) == 0x7FFF 5373 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5374 ) { 5375 float_raise(float_flag_invalid, status); 5376 return 0; 5377 } 5378 return 5379 ( a.low == b.low ) 5380 && ( ( a.high == b.high ) 5381 || ( ( a.low == 0 ) 5382 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5383 ); 5384 5385 } 5386 5387 /*---------------------------------------------------------------------------- 5388 | Returns 1 if the extended double-precision floating-point value `a' is 5389 | less than or equal to the corresponding value `b', and 0 otherwise. The 5390 | invalid exception is raised if either operand is a NaN. The comparison is 5391 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5392 | Arithmetic. 5393 *----------------------------------------------------------------------------*/ 5394 5395 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5396 { 5397 flag aSign, bSign; 5398 5399 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5400 || (extractFloatx80Exp(a) == 0x7FFF 5401 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5402 || (extractFloatx80Exp(b) == 0x7FFF 5403 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5404 ) { 5405 float_raise(float_flag_invalid, status); 5406 return 0; 5407 } 5408 aSign = extractFloatx80Sign( a ); 5409 bSign = extractFloatx80Sign( b ); 5410 if ( aSign != bSign ) { 5411 return 5412 aSign 5413 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5414 == 0 ); 5415 } 5416 return 5417 aSign ? le128( b.high, b.low, a.high, a.low ) 5418 : le128( a.high, a.low, b.high, b.low ); 5419 5420 } 5421 5422 /*---------------------------------------------------------------------------- 5423 | Returns 1 if the extended double-precision floating-point value `a' is 5424 | less than the corresponding value `b', and 0 otherwise. The invalid 5425 | exception is raised if either operand is a NaN. The comparison is performed 5426 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5427 *----------------------------------------------------------------------------*/ 5428 5429 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5430 { 5431 flag aSign, bSign; 5432 5433 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5434 || (extractFloatx80Exp(a) == 0x7FFF 5435 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5436 || (extractFloatx80Exp(b) == 0x7FFF 5437 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5438 ) { 5439 float_raise(float_flag_invalid, status); 5440 return 0; 5441 } 5442 aSign = extractFloatx80Sign( a ); 5443 bSign = extractFloatx80Sign( b ); 5444 if ( aSign != bSign ) { 5445 return 5446 aSign 5447 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5448 != 0 ); 5449 } 5450 return 5451 aSign ? lt128( b.high, b.low, a.high, a.low ) 5452 : lt128( a.high, a.low, b.high, b.low ); 5453 5454 } 5455 5456 /*---------------------------------------------------------------------------- 5457 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5458 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5459 | either operand is a NaN. The comparison is performed according to the 5460 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5461 *----------------------------------------------------------------------------*/ 5462 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5463 { 5464 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5465 || (extractFloatx80Exp(a) == 0x7FFF 5466 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5467 || (extractFloatx80Exp(b) == 0x7FFF 5468 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5469 ) { 5470 float_raise(float_flag_invalid, status); 5471 return 1; 5472 } 5473 return 0; 5474 } 5475 5476 /*---------------------------------------------------------------------------- 5477 | Returns 1 if the extended double-precision floating-point value `a' is 5478 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5479 | cause an exception. The comparison is performed according to the IEC/IEEE 5480 | Standard for Binary Floating-Point Arithmetic. 5481 *----------------------------------------------------------------------------*/ 5482 5483 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5484 { 5485 5486 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5487 float_raise(float_flag_invalid, status); 5488 return 0; 5489 } 5490 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5491 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5492 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5493 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5494 ) { 5495 if (floatx80_is_signaling_nan(a, status) 5496 || floatx80_is_signaling_nan(b, status)) { 5497 float_raise(float_flag_invalid, status); 5498 } 5499 return 0; 5500 } 5501 return 5502 ( a.low == b.low ) 5503 && ( ( a.high == b.high ) 5504 || ( ( a.low == 0 ) 5505 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5506 ); 5507 5508 } 5509 5510 /*---------------------------------------------------------------------------- 5511 | Returns 1 if the extended double-precision floating-point value `a' is less 5512 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5513 | do not cause an exception. Otherwise, the comparison is performed according 5514 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5515 *----------------------------------------------------------------------------*/ 5516 5517 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5518 { 5519 flag aSign, bSign; 5520 5521 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5522 float_raise(float_flag_invalid, status); 5523 return 0; 5524 } 5525 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5526 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5527 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5528 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5529 ) { 5530 if (floatx80_is_signaling_nan(a, status) 5531 || floatx80_is_signaling_nan(b, status)) { 5532 float_raise(float_flag_invalid, status); 5533 } 5534 return 0; 5535 } 5536 aSign = extractFloatx80Sign( a ); 5537 bSign = extractFloatx80Sign( b ); 5538 if ( aSign != bSign ) { 5539 return 5540 aSign 5541 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5542 == 0 ); 5543 } 5544 return 5545 aSign ? le128( b.high, b.low, a.high, a.low ) 5546 : le128( a.high, a.low, b.high, b.low ); 5547 5548 } 5549 5550 /*---------------------------------------------------------------------------- 5551 | Returns 1 if the extended double-precision floating-point value `a' is less 5552 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5553 | an exception. Otherwise, the comparison is performed according to the 5554 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5555 *----------------------------------------------------------------------------*/ 5556 5557 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5558 { 5559 flag aSign, bSign; 5560 5561 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5562 float_raise(float_flag_invalid, status); 5563 return 0; 5564 } 5565 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5566 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5567 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5568 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5569 ) { 5570 if (floatx80_is_signaling_nan(a, status) 5571 || floatx80_is_signaling_nan(b, status)) { 5572 float_raise(float_flag_invalid, status); 5573 } 5574 return 0; 5575 } 5576 aSign = extractFloatx80Sign( a ); 5577 bSign = extractFloatx80Sign( b ); 5578 if ( aSign != bSign ) { 5579 return 5580 aSign 5581 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5582 != 0 ); 5583 } 5584 return 5585 aSign ? lt128( b.high, b.low, a.high, a.low ) 5586 : lt128( a.high, a.low, b.high, b.low ); 5587 5588 } 5589 5590 /*---------------------------------------------------------------------------- 5591 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5592 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5593 | The comparison is performed according to the IEC/IEEE Standard for Binary 5594 | Floating-Point Arithmetic. 5595 *----------------------------------------------------------------------------*/ 5596 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5597 { 5598 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5599 float_raise(float_flag_invalid, status); 5600 return 1; 5601 } 5602 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5603 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5604 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5605 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5606 ) { 5607 if (floatx80_is_signaling_nan(a, status) 5608 || floatx80_is_signaling_nan(b, status)) { 5609 float_raise(float_flag_invalid, status); 5610 } 5611 return 1; 5612 } 5613 return 0; 5614 } 5615 5616 /*---------------------------------------------------------------------------- 5617 | Returns the result of converting the quadruple-precision floating-point 5618 | value `a' to the 32-bit two's complement integer format. The conversion 5619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5620 | Arithmetic---which means in particular that the conversion is rounded 5621 | according to the current rounding mode. If `a' is a NaN, the largest 5622 | positive integer is returned. Otherwise, if the conversion overflows, the 5623 | largest integer with the same sign as `a' is returned. 5624 *----------------------------------------------------------------------------*/ 5625 5626 int32_t float128_to_int32(float128 a, float_status *status) 5627 { 5628 flag aSign; 5629 int32_t aExp, shiftCount; 5630 uint64_t aSig0, aSig1; 5631 5632 aSig1 = extractFloat128Frac1( a ); 5633 aSig0 = extractFloat128Frac0( a ); 5634 aExp = extractFloat128Exp( a ); 5635 aSign = extractFloat128Sign( a ); 5636 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5637 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5638 aSig0 |= ( aSig1 != 0 ); 5639 shiftCount = 0x4028 - aExp; 5640 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5641 return roundAndPackInt32(aSign, aSig0, status); 5642 5643 } 5644 5645 /*---------------------------------------------------------------------------- 5646 | Returns the result of converting the quadruple-precision floating-point 5647 | value `a' to the 32-bit two's complement integer format. The conversion 5648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5649 | Arithmetic, except that the conversion is always rounded toward zero. If 5650 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5651 | conversion overflows, the largest integer with the same sign as `a' is 5652 | returned. 5653 *----------------------------------------------------------------------------*/ 5654 5655 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5656 { 5657 flag aSign; 5658 int32_t aExp, shiftCount; 5659 uint64_t aSig0, aSig1, savedASig; 5660 int32_t z; 5661 5662 aSig1 = extractFloat128Frac1( a ); 5663 aSig0 = extractFloat128Frac0( a ); 5664 aExp = extractFloat128Exp( a ); 5665 aSign = extractFloat128Sign( a ); 5666 aSig0 |= ( aSig1 != 0 ); 5667 if ( 0x401E < aExp ) { 5668 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5669 goto invalid; 5670 } 5671 else if ( aExp < 0x3FFF ) { 5672 if (aExp || aSig0) { 5673 status->float_exception_flags |= float_flag_inexact; 5674 } 5675 return 0; 5676 } 5677 aSig0 |= LIT64( 0x0001000000000000 ); 5678 shiftCount = 0x402F - aExp; 5679 savedASig = aSig0; 5680 aSig0 >>= shiftCount; 5681 z = aSig0; 5682 if ( aSign ) z = - z; 5683 if ( ( z < 0 ) ^ aSign ) { 5684 invalid: 5685 float_raise(float_flag_invalid, status); 5686 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5687 } 5688 if ( ( aSig0<<shiftCount ) != savedASig ) { 5689 status->float_exception_flags |= float_flag_inexact; 5690 } 5691 return z; 5692 5693 } 5694 5695 /*---------------------------------------------------------------------------- 5696 | Returns the result of converting the quadruple-precision floating-point 5697 | value `a' to the 64-bit two's complement integer format. The conversion 5698 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5699 | Arithmetic---which means in particular that the conversion is rounded 5700 | according to the current rounding mode. If `a' is a NaN, the largest 5701 | positive integer is returned. Otherwise, if the conversion overflows, the 5702 | largest integer with the same sign as `a' is returned. 5703 *----------------------------------------------------------------------------*/ 5704 5705 int64_t float128_to_int64(float128 a, float_status *status) 5706 { 5707 flag aSign; 5708 int32_t aExp, shiftCount; 5709 uint64_t aSig0, aSig1; 5710 5711 aSig1 = extractFloat128Frac1( a ); 5712 aSig0 = extractFloat128Frac0( a ); 5713 aExp = extractFloat128Exp( a ); 5714 aSign = extractFloat128Sign( a ); 5715 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5716 shiftCount = 0x402F - aExp; 5717 if ( shiftCount <= 0 ) { 5718 if ( 0x403E < aExp ) { 5719 float_raise(float_flag_invalid, status); 5720 if ( ! aSign 5721 || ( ( aExp == 0x7FFF ) 5722 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5723 ) 5724 ) { 5725 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5726 } 5727 return (int64_t) LIT64( 0x8000000000000000 ); 5728 } 5729 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5730 } 5731 else { 5732 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5733 } 5734 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5735 5736 } 5737 5738 /*---------------------------------------------------------------------------- 5739 | Returns the result of converting the quadruple-precision floating-point 5740 | value `a' to the 64-bit two's complement integer format. The conversion 5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5742 | Arithmetic, except that the conversion is always rounded toward zero. 5743 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5744 | the conversion overflows, the largest integer with the same sign as `a' is 5745 | returned. 5746 *----------------------------------------------------------------------------*/ 5747 5748 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5749 { 5750 flag aSign; 5751 int32_t aExp, shiftCount; 5752 uint64_t aSig0, aSig1; 5753 int64_t z; 5754 5755 aSig1 = extractFloat128Frac1( a ); 5756 aSig0 = extractFloat128Frac0( a ); 5757 aExp = extractFloat128Exp( a ); 5758 aSign = extractFloat128Sign( a ); 5759 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5760 shiftCount = aExp - 0x402F; 5761 if ( 0 < shiftCount ) { 5762 if ( 0x403E <= aExp ) { 5763 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5764 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5765 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5766 if (aSig1) { 5767 status->float_exception_flags |= float_flag_inexact; 5768 } 5769 } 5770 else { 5771 float_raise(float_flag_invalid, status); 5772 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5773 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5774 } 5775 } 5776 return (int64_t) LIT64( 0x8000000000000000 ); 5777 } 5778 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5779 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5780 status->float_exception_flags |= float_flag_inexact; 5781 } 5782 } 5783 else { 5784 if ( aExp < 0x3FFF ) { 5785 if ( aExp | aSig0 | aSig1 ) { 5786 status->float_exception_flags |= float_flag_inexact; 5787 } 5788 return 0; 5789 } 5790 z = aSig0>>( - shiftCount ); 5791 if ( aSig1 5792 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5793 status->float_exception_flags |= float_flag_inexact; 5794 } 5795 } 5796 if ( aSign ) z = - z; 5797 return z; 5798 5799 } 5800 5801 /*---------------------------------------------------------------------------- 5802 | Returns the result of converting the quadruple-precision floating-point value 5803 | `a' to the 64-bit unsigned integer format. The conversion is 5804 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5805 | Arithmetic---which means in particular that the conversion is rounded 5806 | according to the current rounding mode. If `a' is a NaN, the largest 5807 | positive integer is returned. If the conversion overflows, the 5808 | largest unsigned integer is returned. If 'a' is negative, the value is 5809 | rounded and zero is returned; negative values that do not round to zero 5810 | will raise the inexact exception. 5811 *----------------------------------------------------------------------------*/ 5812 5813 uint64_t float128_to_uint64(float128 a, float_status *status) 5814 { 5815 flag aSign; 5816 int aExp; 5817 int shiftCount; 5818 uint64_t aSig0, aSig1; 5819 5820 aSig0 = extractFloat128Frac0(a); 5821 aSig1 = extractFloat128Frac1(a); 5822 aExp = extractFloat128Exp(a); 5823 aSign = extractFloat128Sign(a); 5824 if (aSign && (aExp > 0x3FFE)) { 5825 float_raise(float_flag_invalid, status); 5826 if (float128_is_any_nan(a)) { 5827 return LIT64(0xFFFFFFFFFFFFFFFF); 5828 } else { 5829 return 0; 5830 } 5831 } 5832 if (aExp) { 5833 aSig0 |= LIT64(0x0001000000000000); 5834 } 5835 shiftCount = 0x402F - aExp; 5836 if (shiftCount <= 0) { 5837 if (0x403E < aExp) { 5838 float_raise(float_flag_invalid, status); 5839 return LIT64(0xFFFFFFFFFFFFFFFF); 5840 } 5841 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5842 } else { 5843 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5844 } 5845 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5846 } 5847 5848 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5849 { 5850 uint64_t v; 5851 signed char current_rounding_mode = status->float_rounding_mode; 5852 5853 set_float_rounding_mode(float_round_to_zero, status); 5854 v = float128_to_uint64(a, status); 5855 set_float_rounding_mode(current_rounding_mode, status); 5856 5857 return v; 5858 } 5859 5860 /*---------------------------------------------------------------------------- 5861 | Returns the result of converting the quadruple-precision floating-point 5862 | value `a' to the 32-bit unsigned integer format. The conversion 5863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5864 | Arithmetic except that the conversion is always rounded toward zero. 5865 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5866 | if the conversion overflows, the largest unsigned integer is returned. 5867 | If 'a' is negative, the value is rounded and zero is returned; negative 5868 | values that do not round to zero will raise the inexact exception. 5869 *----------------------------------------------------------------------------*/ 5870 5871 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5872 { 5873 uint64_t v; 5874 uint32_t res; 5875 int old_exc_flags = get_float_exception_flags(status); 5876 5877 v = float128_to_uint64_round_to_zero(a, status); 5878 if (v > 0xffffffff) { 5879 res = 0xffffffff; 5880 } else { 5881 return v; 5882 } 5883 set_float_exception_flags(old_exc_flags, status); 5884 float_raise(float_flag_invalid, status); 5885 return res; 5886 } 5887 5888 /*---------------------------------------------------------------------------- 5889 | Returns the result of converting the quadruple-precision floating-point 5890 | value `a' to the single-precision floating-point format. The conversion 5891 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5892 | Arithmetic. 5893 *----------------------------------------------------------------------------*/ 5894 5895 float32 float128_to_float32(float128 a, float_status *status) 5896 { 5897 flag aSign; 5898 int32_t aExp; 5899 uint64_t aSig0, aSig1; 5900 uint32_t zSig; 5901 5902 aSig1 = extractFloat128Frac1( a ); 5903 aSig0 = extractFloat128Frac0( a ); 5904 aExp = extractFloat128Exp( a ); 5905 aSign = extractFloat128Sign( a ); 5906 if ( aExp == 0x7FFF ) { 5907 if ( aSig0 | aSig1 ) { 5908 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5909 } 5910 return packFloat32( aSign, 0xFF, 0 ); 5911 } 5912 aSig0 |= ( aSig1 != 0 ); 5913 shift64RightJamming( aSig0, 18, &aSig0 ); 5914 zSig = aSig0; 5915 if ( aExp || zSig ) { 5916 zSig |= 0x40000000; 5917 aExp -= 0x3F81; 5918 } 5919 return roundAndPackFloat32(aSign, aExp, zSig, status); 5920 5921 } 5922 5923 /*---------------------------------------------------------------------------- 5924 | Returns the result of converting the quadruple-precision floating-point 5925 | value `a' to the double-precision floating-point format. The conversion 5926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5927 | Arithmetic. 5928 *----------------------------------------------------------------------------*/ 5929 5930 float64 float128_to_float64(float128 a, float_status *status) 5931 { 5932 flag aSign; 5933 int32_t aExp; 5934 uint64_t aSig0, aSig1; 5935 5936 aSig1 = extractFloat128Frac1( a ); 5937 aSig0 = extractFloat128Frac0( a ); 5938 aExp = extractFloat128Exp( a ); 5939 aSign = extractFloat128Sign( a ); 5940 if ( aExp == 0x7FFF ) { 5941 if ( aSig0 | aSig1 ) { 5942 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5943 } 5944 return packFloat64( aSign, 0x7FF, 0 ); 5945 } 5946 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5947 aSig0 |= ( aSig1 != 0 ); 5948 if ( aExp || aSig0 ) { 5949 aSig0 |= LIT64( 0x4000000000000000 ); 5950 aExp -= 0x3C01; 5951 } 5952 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5953 5954 } 5955 5956 /*---------------------------------------------------------------------------- 5957 | Returns the result of converting the quadruple-precision floating-point 5958 | value `a' to the extended double-precision floating-point format. The 5959 | conversion is performed according to the IEC/IEEE Standard for Binary 5960 | Floating-Point Arithmetic. 5961 *----------------------------------------------------------------------------*/ 5962 5963 floatx80 float128_to_floatx80(float128 a, float_status *status) 5964 { 5965 flag aSign; 5966 int32_t aExp; 5967 uint64_t aSig0, aSig1; 5968 5969 aSig1 = extractFloat128Frac1( a ); 5970 aSig0 = extractFloat128Frac0( a ); 5971 aExp = extractFloat128Exp( a ); 5972 aSign = extractFloat128Sign( a ); 5973 if ( aExp == 0x7FFF ) { 5974 if ( aSig0 | aSig1 ) { 5975 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5976 } 5977 return packFloatx80(aSign, floatx80_infinity_high, 5978 floatx80_infinity_low); 5979 } 5980 if ( aExp == 0 ) { 5981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5983 } 5984 else { 5985 aSig0 |= LIT64( 0x0001000000000000 ); 5986 } 5987 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5988 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5989 5990 } 5991 5992 /*---------------------------------------------------------------------------- 5993 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5994 | returns the result as a quadruple-precision floating-point value. The 5995 | operation is performed according to the IEC/IEEE Standard for Binary 5996 | Floating-Point Arithmetic. 5997 *----------------------------------------------------------------------------*/ 5998 5999 float128 float128_round_to_int(float128 a, float_status *status) 6000 { 6001 flag aSign; 6002 int32_t aExp; 6003 uint64_t lastBitMask, roundBitsMask; 6004 float128 z; 6005 6006 aExp = extractFloat128Exp( a ); 6007 if ( 0x402F <= aExp ) { 6008 if ( 0x406F <= aExp ) { 6009 if ( ( aExp == 0x7FFF ) 6010 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6011 ) { 6012 return propagateFloat128NaN(a, a, status); 6013 } 6014 return a; 6015 } 6016 lastBitMask = 1; 6017 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6018 roundBitsMask = lastBitMask - 1; 6019 z = a; 6020 switch (status->float_rounding_mode) { 6021 case float_round_nearest_even: 6022 if ( lastBitMask ) { 6023 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6024 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6025 } 6026 else { 6027 if ( (int64_t) z.low < 0 ) { 6028 ++z.high; 6029 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6030 } 6031 } 6032 break; 6033 case float_round_ties_away: 6034 if (lastBitMask) { 6035 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6036 } else { 6037 if ((int64_t) z.low < 0) { 6038 ++z.high; 6039 } 6040 } 6041 break; 6042 case float_round_to_zero: 6043 break; 6044 case float_round_up: 6045 if (!extractFloat128Sign(z)) { 6046 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6047 } 6048 break; 6049 case float_round_down: 6050 if (extractFloat128Sign(z)) { 6051 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6052 } 6053 break; 6054 default: 6055 abort(); 6056 } 6057 z.low &= ~ roundBitsMask; 6058 } 6059 else { 6060 if ( aExp < 0x3FFF ) { 6061 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6062 status->float_exception_flags |= float_flag_inexact; 6063 aSign = extractFloat128Sign( a ); 6064 switch (status->float_rounding_mode) { 6065 case float_round_nearest_even: 6066 if ( ( aExp == 0x3FFE ) 6067 && ( extractFloat128Frac0( a ) 6068 | extractFloat128Frac1( a ) ) 6069 ) { 6070 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6071 } 6072 break; 6073 case float_round_ties_away: 6074 if (aExp == 0x3FFE) { 6075 return packFloat128(aSign, 0x3FFF, 0, 0); 6076 } 6077 break; 6078 case float_round_down: 6079 return 6080 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6081 : packFloat128( 0, 0, 0, 0 ); 6082 case float_round_up: 6083 return 6084 aSign ? packFloat128( 1, 0, 0, 0 ) 6085 : packFloat128( 0, 0x3FFF, 0, 0 ); 6086 } 6087 return packFloat128( aSign, 0, 0, 0 ); 6088 } 6089 lastBitMask = 1; 6090 lastBitMask <<= 0x402F - aExp; 6091 roundBitsMask = lastBitMask - 1; 6092 z.low = 0; 6093 z.high = a.high; 6094 switch (status->float_rounding_mode) { 6095 case float_round_nearest_even: 6096 z.high += lastBitMask>>1; 6097 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6098 z.high &= ~ lastBitMask; 6099 } 6100 break; 6101 case float_round_ties_away: 6102 z.high += lastBitMask>>1; 6103 break; 6104 case float_round_to_zero: 6105 break; 6106 case float_round_up: 6107 if (!extractFloat128Sign(z)) { 6108 z.high |= ( a.low != 0 ); 6109 z.high += roundBitsMask; 6110 } 6111 break; 6112 case float_round_down: 6113 if (extractFloat128Sign(z)) { 6114 z.high |= (a.low != 0); 6115 z.high += roundBitsMask; 6116 } 6117 break; 6118 default: 6119 abort(); 6120 } 6121 z.high &= ~ roundBitsMask; 6122 } 6123 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6124 status->float_exception_flags |= float_flag_inexact; 6125 } 6126 return z; 6127 6128 } 6129 6130 /*---------------------------------------------------------------------------- 6131 | Returns the result of adding the absolute values of the quadruple-precision 6132 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6133 | before being returned. `zSign' is ignored if the result is a NaN. 6134 | The addition is performed according to the IEC/IEEE Standard for Binary 6135 | Floating-Point Arithmetic. 6136 *----------------------------------------------------------------------------*/ 6137 6138 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6139 float_status *status) 6140 { 6141 int32_t aExp, bExp, zExp; 6142 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6143 int32_t expDiff; 6144 6145 aSig1 = extractFloat128Frac1( a ); 6146 aSig0 = extractFloat128Frac0( a ); 6147 aExp = extractFloat128Exp( a ); 6148 bSig1 = extractFloat128Frac1( b ); 6149 bSig0 = extractFloat128Frac0( b ); 6150 bExp = extractFloat128Exp( b ); 6151 expDiff = aExp - bExp; 6152 if ( 0 < expDiff ) { 6153 if ( aExp == 0x7FFF ) { 6154 if (aSig0 | aSig1) { 6155 return propagateFloat128NaN(a, b, status); 6156 } 6157 return a; 6158 } 6159 if ( bExp == 0 ) { 6160 --expDiff; 6161 } 6162 else { 6163 bSig0 |= LIT64( 0x0001000000000000 ); 6164 } 6165 shift128ExtraRightJamming( 6166 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6167 zExp = aExp; 6168 } 6169 else if ( expDiff < 0 ) { 6170 if ( bExp == 0x7FFF ) { 6171 if (bSig0 | bSig1) { 6172 return propagateFloat128NaN(a, b, status); 6173 } 6174 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6175 } 6176 if ( aExp == 0 ) { 6177 ++expDiff; 6178 } 6179 else { 6180 aSig0 |= LIT64( 0x0001000000000000 ); 6181 } 6182 shift128ExtraRightJamming( 6183 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6184 zExp = bExp; 6185 } 6186 else { 6187 if ( aExp == 0x7FFF ) { 6188 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6189 return propagateFloat128NaN(a, b, status); 6190 } 6191 return a; 6192 } 6193 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6194 if ( aExp == 0 ) { 6195 if (status->flush_to_zero) { 6196 if (zSig0 | zSig1) { 6197 float_raise(float_flag_output_denormal, status); 6198 } 6199 return packFloat128(zSign, 0, 0, 0); 6200 } 6201 return packFloat128( zSign, 0, zSig0, zSig1 ); 6202 } 6203 zSig2 = 0; 6204 zSig0 |= LIT64( 0x0002000000000000 ); 6205 zExp = aExp; 6206 goto shiftRight1; 6207 } 6208 aSig0 |= LIT64( 0x0001000000000000 ); 6209 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6210 --zExp; 6211 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6212 ++zExp; 6213 shiftRight1: 6214 shift128ExtraRightJamming( 6215 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6216 roundAndPack: 6217 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6218 6219 } 6220 6221 /*---------------------------------------------------------------------------- 6222 | Returns the result of subtracting the absolute values of the quadruple- 6223 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6224 | difference is negated before being returned. `zSign' is ignored if the 6225 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6226 | Standard for Binary Floating-Point Arithmetic. 6227 *----------------------------------------------------------------------------*/ 6228 6229 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6230 float_status *status) 6231 { 6232 int32_t aExp, bExp, zExp; 6233 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6234 int32_t expDiff; 6235 6236 aSig1 = extractFloat128Frac1( a ); 6237 aSig0 = extractFloat128Frac0( a ); 6238 aExp = extractFloat128Exp( a ); 6239 bSig1 = extractFloat128Frac1( b ); 6240 bSig0 = extractFloat128Frac0( b ); 6241 bExp = extractFloat128Exp( b ); 6242 expDiff = aExp - bExp; 6243 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6244 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6245 if ( 0 < expDiff ) goto aExpBigger; 6246 if ( expDiff < 0 ) goto bExpBigger; 6247 if ( aExp == 0x7FFF ) { 6248 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6249 return propagateFloat128NaN(a, b, status); 6250 } 6251 float_raise(float_flag_invalid, status); 6252 return float128_default_nan(status); 6253 } 6254 if ( aExp == 0 ) { 6255 aExp = 1; 6256 bExp = 1; 6257 } 6258 if ( bSig0 < aSig0 ) goto aBigger; 6259 if ( aSig0 < bSig0 ) goto bBigger; 6260 if ( bSig1 < aSig1 ) goto aBigger; 6261 if ( aSig1 < bSig1 ) goto bBigger; 6262 return packFloat128(status->float_rounding_mode == float_round_down, 6263 0, 0, 0); 6264 bExpBigger: 6265 if ( bExp == 0x7FFF ) { 6266 if (bSig0 | bSig1) { 6267 return propagateFloat128NaN(a, b, status); 6268 } 6269 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6270 } 6271 if ( aExp == 0 ) { 6272 ++expDiff; 6273 } 6274 else { 6275 aSig0 |= LIT64( 0x4000000000000000 ); 6276 } 6277 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6278 bSig0 |= LIT64( 0x4000000000000000 ); 6279 bBigger: 6280 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6281 zExp = bExp; 6282 zSign ^= 1; 6283 goto normalizeRoundAndPack; 6284 aExpBigger: 6285 if ( aExp == 0x7FFF ) { 6286 if (aSig0 | aSig1) { 6287 return propagateFloat128NaN(a, b, status); 6288 } 6289 return a; 6290 } 6291 if ( bExp == 0 ) { 6292 --expDiff; 6293 } 6294 else { 6295 bSig0 |= LIT64( 0x4000000000000000 ); 6296 } 6297 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6298 aSig0 |= LIT64( 0x4000000000000000 ); 6299 aBigger: 6300 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6301 zExp = aExp; 6302 normalizeRoundAndPack: 6303 --zExp; 6304 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6305 status); 6306 6307 } 6308 6309 /*---------------------------------------------------------------------------- 6310 | Returns the result of adding the quadruple-precision floating-point values 6311 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6312 | for Binary Floating-Point Arithmetic. 6313 *----------------------------------------------------------------------------*/ 6314 6315 float128 float128_add(float128 a, float128 b, float_status *status) 6316 { 6317 flag aSign, bSign; 6318 6319 aSign = extractFloat128Sign( a ); 6320 bSign = extractFloat128Sign( b ); 6321 if ( aSign == bSign ) { 6322 return addFloat128Sigs(a, b, aSign, status); 6323 } 6324 else { 6325 return subFloat128Sigs(a, b, aSign, status); 6326 } 6327 6328 } 6329 6330 /*---------------------------------------------------------------------------- 6331 | Returns the result of subtracting the quadruple-precision floating-point 6332 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6333 | Standard for Binary Floating-Point Arithmetic. 6334 *----------------------------------------------------------------------------*/ 6335 6336 float128 float128_sub(float128 a, float128 b, float_status *status) 6337 { 6338 flag aSign, bSign; 6339 6340 aSign = extractFloat128Sign( a ); 6341 bSign = extractFloat128Sign( b ); 6342 if ( aSign == bSign ) { 6343 return subFloat128Sigs(a, b, aSign, status); 6344 } 6345 else { 6346 return addFloat128Sigs(a, b, aSign, status); 6347 } 6348 6349 } 6350 6351 /*---------------------------------------------------------------------------- 6352 | Returns the result of multiplying the quadruple-precision floating-point 6353 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6354 | Standard for Binary Floating-Point Arithmetic. 6355 *----------------------------------------------------------------------------*/ 6356 6357 float128 float128_mul(float128 a, float128 b, float_status *status) 6358 { 6359 flag aSign, bSign, zSign; 6360 int32_t aExp, bExp, zExp; 6361 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6362 6363 aSig1 = extractFloat128Frac1( a ); 6364 aSig0 = extractFloat128Frac0( a ); 6365 aExp = extractFloat128Exp( a ); 6366 aSign = extractFloat128Sign( a ); 6367 bSig1 = extractFloat128Frac1( b ); 6368 bSig0 = extractFloat128Frac0( b ); 6369 bExp = extractFloat128Exp( b ); 6370 bSign = extractFloat128Sign( b ); 6371 zSign = aSign ^ bSign; 6372 if ( aExp == 0x7FFF ) { 6373 if ( ( aSig0 | aSig1 ) 6374 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6375 return propagateFloat128NaN(a, b, status); 6376 } 6377 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6378 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6379 } 6380 if ( bExp == 0x7FFF ) { 6381 if (bSig0 | bSig1) { 6382 return propagateFloat128NaN(a, b, status); 6383 } 6384 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6385 invalid: 6386 float_raise(float_flag_invalid, status); 6387 return float128_default_nan(status); 6388 } 6389 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6390 } 6391 if ( aExp == 0 ) { 6392 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6393 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6394 } 6395 if ( bExp == 0 ) { 6396 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6397 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6398 } 6399 zExp = aExp + bExp - 0x4000; 6400 aSig0 |= LIT64( 0x0001000000000000 ); 6401 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6402 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6403 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6404 zSig2 |= ( zSig3 != 0 ); 6405 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6406 shift128ExtraRightJamming( 6407 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6408 ++zExp; 6409 } 6410 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6411 6412 } 6413 6414 /*---------------------------------------------------------------------------- 6415 | Returns the result of dividing the quadruple-precision floating-point value 6416 | `a' by the corresponding value `b'. The operation is performed according to 6417 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6418 *----------------------------------------------------------------------------*/ 6419 6420 float128 float128_div(float128 a, float128 b, float_status *status) 6421 { 6422 flag aSign, bSign, zSign; 6423 int32_t aExp, bExp, zExp; 6424 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6425 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6426 6427 aSig1 = extractFloat128Frac1( a ); 6428 aSig0 = extractFloat128Frac0( a ); 6429 aExp = extractFloat128Exp( a ); 6430 aSign = extractFloat128Sign( a ); 6431 bSig1 = extractFloat128Frac1( b ); 6432 bSig0 = extractFloat128Frac0( b ); 6433 bExp = extractFloat128Exp( b ); 6434 bSign = extractFloat128Sign( b ); 6435 zSign = aSign ^ bSign; 6436 if ( aExp == 0x7FFF ) { 6437 if (aSig0 | aSig1) { 6438 return propagateFloat128NaN(a, b, status); 6439 } 6440 if ( bExp == 0x7FFF ) { 6441 if (bSig0 | bSig1) { 6442 return propagateFloat128NaN(a, b, status); 6443 } 6444 goto invalid; 6445 } 6446 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6447 } 6448 if ( bExp == 0x7FFF ) { 6449 if (bSig0 | bSig1) { 6450 return propagateFloat128NaN(a, b, status); 6451 } 6452 return packFloat128( zSign, 0, 0, 0 ); 6453 } 6454 if ( bExp == 0 ) { 6455 if ( ( bSig0 | bSig1 ) == 0 ) { 6456 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6457 invalid: 6458 float_raise(float_flag_invalid, status); 6459 return float128_default_nan(status); 6460 } 6461 float_raise(float_flag_divbyzero, status); 6462 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6463 } 6464 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6465 } 6466 if ( aExp == 0 ) { 6467 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6468 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6469 } 6470 zExp = aExp - bExp + 0x3FFD; 6471 shortShift128Left( 6472 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6473 shortShift128Left( 6474 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6475 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6476 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6477 ++zExp; 6478 } 6479 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6480 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6481 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6482 while ( (int64_t) rem0 < 0 ) { 6483 --zSig0; 6484 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6485 } 6486 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6487 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6488 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6489 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6490 while ( (int64_t) rem1 < 0 ) { 6491 --zSig1; 6492 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6493 } 6494 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6495 } 6496 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6497 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6498 6499 } 6500 6501 /*---------------------------------------------------------------------------- 6502 | Returns the remainder of the quadruple-precision floating-point value `a' 6503 | with respect to the corresponding value `b'. The operation is performed 6504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6505 *----------------------------------------------------------------------------*/ 6506 6507 float128 float128_rem(float128 a, float128 b, float_status *status) 6508 { 6509 flag aSign, zSign; 6510 int32_t aExp, bExp, expDiff; 6511 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6512 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6513 int64_t sigMean0; 6514 6515 aSig1 = extractFloat128Frac1( a ); 6516 aSig0 = extractFloat128Frac0( a ); 6517 aExp = extractFloat128Exp( a ); 6518 aSign = extractFloat128Sign( a ); 6519 bSig1 = extractFloat128Frac1( b ); 6520 bSig0 = extractFloat128Frac0( b ); 6521 bExp = extractFloat128Exp( b ); 6522 if ( aExp == 0x7FFF ) { 6523 if ( ( aSig0 | aSig1 ) 6524 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6525 return propagateFloat128NaN(a, b, status); 6526 } 6527 goto invalid; 6528 } 6529 if ( bExp == 0x7FFF ) { 6530 if (bSig0 | bSig1) { 6531 return propagateFloat128NaN(a, b, status); 6532 } 6533 return a; 6534 } 6535 if ( bExp == 0 ) { 6536 if ( ( bSig0 | bSig1 ) == 0 ) { 6537 invalid: 6538 float_raise(float_flag_invalid, status); 6539 return float128_default_nan(status); 6540 } 6541 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6542 } 6543 if ( aExp == 0 ) { 6544 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6545 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6546 } 6547 expDiff = aExp - bExp; 6548 if ( expDiff < -1 ) return a; 6549 shortShift128Left( 6550 aSig0 | LIT64( 0x0001000000000000 ), 6551 aSig1, 6552 15 - ( expDiff < 0 ), 6553 &aSig0, 6554 &aSig1 6555 ); 6556 shortShift128Left( 6557 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6558 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6559 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6560 expDiff -= 64; 6561 while ( 0 < expDiff ) { 6562 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6563 q = ( 4 < q ) ? q - 4 : 0; 6564 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6565 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6566 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6567 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6568 expDiff -= 61; 6569 } 6570 if ( -64 < expDiff ) { 6571 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6572 q = ( 4 < q ) ? q - 4 : 0; 6573 q >>= - expDiff; 6574 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6575 expDiff += 52; 6576 if ( expDiff < 0 ) { 6577 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6578 } 6579 else { 6580 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6581 } 6582 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6583 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6584 } 6585 else { 6586 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6587 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6588 } 6589 do { 6590 alternateASig0 = aSig0; 6591 alternateASig1 = aSig1; 6592 ++q; 6593 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6594 } while ( 0 <= (int64_t) aSig0 ); 6595 add128( 6596 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6597 if ( ( sigMean0 < 0 ) 6598 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6599 aSig0 = alternateASig0; 6600 aSig1 = alternateASig1; 6601 } 6602 zSign = ( (int64_t) aSig0 < 0 ); 6603 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6604 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6605 status); 6606 } 6607 6608 /*---------------------------------------------------------------------------- 6609 | Returns the square root of the quadruple-precision floating-point value `a'. 6610 | The operation is performed according to the IEC/IEEE Standard for Binary 6611 | Floating-Point Arithmetic. 6612 *----------------------------------------------------------------------------*/ 6613 6614 float128 float128_sqrt(float128 a, float_status *status) 6615 { 6616 flag aSign; 6617 int32_t aExp, zExp; 6618 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6619 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6620 6621 aSig1 = extractFloat128Frac1( a ); 6622 aSig0 = extractFloat128Frac0( a ); 6623 aExp = extractFloat128Exp( a ); 6624 aSign = extractFloat128Sign( a ); 6625 if ( aExp == 0x7FFF ) { 6626 if (aSig0 | aSig1) { 6627 return propagateFloat128NaN(a, a, status); 6628 } 6629 if ( ! aSign ) return a; 6630 goto invalid; 6631 } 6632 if ( aSign ) { 6633 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6634 invalid: 6635 float_raise(float_flag_invalid, status); 6636 return float128_default_nan(status); 6637 } 6638 if ( aExp == 0 ) { 6639 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6640 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6641 } 6642 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6643 aSig0 |= LIT64( 0x0001000000000000 ); 6644 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6645 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6646 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6647 doubleZSig0 = zSig0<<1; 6648 mul64To128( zSig0, zSig0, &term0, &term1 ); 6649 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6650 while ( (int64_t) rem0 < 0 ) { 6651 --zSig0; 6652 doubleZSig0 -= 2; 6653 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6654 } 6655 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6656 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6657 if ( zSig1 == 0 ) zSig1 = 1; 6658 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6659 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6660 mul64To128( zSig1, zSig1, &term2, &term3 ); 6661 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6662 while ( (int64_t) rem1 < 0 ) { 6663 --zSig1; 6664 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6665 term3 |= 1; 6666 term2 |= doubleZSig0; 6667 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6668 } 6669 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6670 } 6671 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6672 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6673 6674 } 6675 6676 /*---------------------------------------------------------------------------- 6677 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6678 | the corresponding value `b', and 0 otherwise. The invalid exception is 6679 | raised if either operand is a NaN. Otherwise, the comparison is performed 6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6681 *----------------------------------------------------------------------------*/ 6682 6683 int float128_eq(float128 a, float128 b, float_status *status) 6684 { 6685 6686 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6687 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6688 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6689 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6690 ) { 6691 float_raise(float_flag_invalid, status); 6692 return 0; 6693 } 6694 return 6695 ( a.low == b.low ) 6696 && ( ( a.high == b.high ) 6697 || ( ( a.low == 0 ) 6698 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6699 ); 6700 6701 } 6702 6703 /*---------------------------------------------------------------------------- 6704 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6705 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6706 | exception is raised if either operand is a NaN. The comparison is performed 6707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6708 *----------------------------------------------------------------------------*/ 6709 6710 int float128_le(float128 a, float128 b, float_status *status) 6711 { 6712 flag aSign, bSign; 6713 6714 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6715 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6716 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6717 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6718 ) { 6719 float_raise(float_flag_invalid, status); 6720 return 0; 6721 } 6722 aSign = extractFloat128Sign( a ); 6723 bSign = extractFloat128Sign( b ); 6724 if ( aSign != bSign ) { 6725 return 6726 aSign 6727 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6728 == 0 ); 6729 } 6730 return 6731 aSign ? le128( b.high, b.low, a.high, a.low ) 6732 : le128( a.high, a.low, b.high, b.low ); 6733 6734 } 6735 6736 /*---------------------------------------------------------------------------- 6737 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6738 | the corresponding value `b', and 0 otherwise. The invalid exception is 6739 | raised if either operand is a NaN. The comparison is performed according 6740 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6741 *----------------------------------------------------------------------------*/ 6742 6743 int float128_lt(float128 a, float128 b, float_status *status) 6744 { 6745 flag aSign, bSign; 6746 6747 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6748 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6749 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6750 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6751 ) { 6752 float_raise(float_flag_invalid, status); 6753 return 0; 6754 } 6755 aSign = extractFloat128Sign( a ); 6756 bSign = extractFloat128Sign( b ); 6757 if ( aSign != bSign ) { 6758 return 6759 aSign 6760 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6761 != 0 ); 6762 } 6763 return 6764 aSign ? lt128( b.high, b.low, a.high, a.low ) 6765 : lt128( a.high, a.low, b.high, b.low ); 6766 6767 } 6768 6769 /*---------------------------------------------------------------------------- 6770 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6771 | be compared, and 0 otherwise. The invalid exception is raised if either 6772 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6773 | Standard for Binary Floating-Point Arithmetic. 6774 *----------------------------------------------------------------------------*/ 6775 6776 int float128_unordered(float128 a, float128 b, float_status *status) 6777 { 6778 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6779 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6780 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6781 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6782 ) { 6783 float_raise(float_flag_invalid, status); 6784 return 1; 6785 } 6786 return 0; 6787 } 6788 6789 /*---------------------------------------------------------------------------- 6790 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6791 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6792 | exception. The comparison is performed according to the IEC/IEEE Standard 6793 | for Binary Floating-Point Arithmetic. 6794 *----------------------------------------------------------------------------*/ 6795 6796 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6797 { 6798 6799 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6800 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6801 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6802 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6803 ) { 6804 if (float128_is_signaling_nan(a, status) 6805 || float128_is_signaling_nan(b, status)) { 6806 float_raise(float_flag_invalid, status); 6807 } 6808 return 0; 6809 } 6810 return 6811 ( a.low == b.low ) 6812 && ( ( a.high == b.high ) 6813 || ( ( a.low == 0 ) 6814 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6815 ); 6816 6817 } 6818 6819 /*---------------------------------------------------------------------------- 6820 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6821 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6822 | cause an exception. Otherwise, the comparison is performed according to the 6823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6824 *----------------------------------------------------------------------------*/ 6825 6826 int float128_le_quiet(float128 a, float128 b, float_status *status) 6827 { 6828 flag aSign, bSign; 6829 6830 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6831 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6832 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6833 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6834 ) { 6835 if (float128_is_signaling_nan(a, status) 6836 || float128_is_signaling_nan(b, status)) { 6837 float_raise(float_flag_invalid, status); 6838 } 6839 return 0; 6840 } 6841 aSign = extractFloat128Sign( a ); 6842 bSign = extractFloat128Sign( b ); 6843 if ( aSign != bSign ) { 6844 return 6845 aSign 6846 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6847 == 0 ); 6848 } 6849 return 6850 aSign ? le128( b.high, b.low, a.high, a.low ) 6851 : le128( a.high, a.low, b.high, b.low ); 6852 6853 } 6854 6855 /*---------------------------------------------------------------------------- 6856 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6857 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6858 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6859 | Standard for Binary Floating-Point Arithmetic. 6860 *----------------------------------------------------------------------------*/ 6861 6862 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6863 { 6864 flag aSign, bSign; 6865 6866 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6867 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6868 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6869 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6870 ) { 6871 if (float128_is_signaling_nan(a, status) 6872 || float128_is_signaling_nan(b, status)) { 6873 float_raise(float_flag_invalid, status); 6874 } 6875 return 0; 6876 } 6877 aSign = extractFloat128Sign( a ); 6878 bSign = extractFloat128Sign( b ); 6879 if ( aSign != bSign ) { 6880 return 6881 aSign 6882 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6883 != 0 ); 6884 } 6885 return 6886 aSign ? lt128( b.high, b.low, a.high, a.low ) 6887 : lt128( a.high, a.low, b.high, b.low ); 6888 6889 } 6890 6891 /*---------------------------------------------------------------------------- 6892 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6893 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6894 | comparison is performed according to the IEC/IEEE Standard for Binary 6895 | Floating-Point Arithmetic. 6896 *----------------------------------------------------------------------------*/ 6897 6898 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6899 { 6900 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6901 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6902 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6903 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6904 ) { 6905 if (float128_is_signaling_nan(a, status) 6906 || float128_is_signaling_nan(b, status)) { 6907 float_raise(float_flag_invalid, status); 6908 } 6909 return 1; 6910 } 6911 return 0; 6912 } 6913 6914 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6915 int is_quiet, float_status *status) 6916 { 6917 flag aSign, bSign; 6918 6919 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6920 float_raise(float_flag_invalid, status); 6921 return float_relation_unordered; 6922 } 6923 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6924 ( extractFloatx80Frac( a )<<1 ) ) || 6925 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6926 ( extractFloatx80Frac( b )<<1 ) )) { 6927 if (!is_quiet || 6928 floatx80_is_signaling_nan(a, status) || 6929 floatx80_is_signaling_nan(b, status)) { 6930 float_raise(float_flag_invalid, status); 6931 } 6932 return float_relation_unordered; 6933 } 6934 aSign = extractFloatx80Sign( a ); 6935 bSign = extractFloatx80Sign( b ); 6936 if ( aSign != bSign ) { 6937 6938 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6939 ( ( a.low | b.low ) == 0 ) ) { 6940 /* zero case */ 6941 return float_relation_equal; 6942 } else { 6943 return 1 - (2 * aSign); 6944 } 6945 } else { 6946 if (a.low == b.low && a.high == b.high) { 6947 return float_relation_equal; 6948 } else { 6949 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6950 } 6951 } 6952 } 6953 6954 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6955 { 6956 return floatx80_compare_internal(a, b, 0, status); 6957 } 6958 6959 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6960 { 6961 return floatx80_compare_internal(a, b, 1, status); 6962 } 6963 6964 static inline int float128_compare_internal(float128 a, float128 b, 6965 int is_quiet, float_status *status) 6966 { 6967 flag aSign, bSign; 6968 6969 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6970 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6971 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6972 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6973 if (!is_quiet || 6974 float128_is_signaling_nan(a, status) || 6975 float128_is_signaling_nan(b, status)) { 6976 float_raise(float_flag_invalid, status); 6977 } 6978 return float_relation_unordered; 6979 } 6980 aSign = extractFloat128Sign( a ); 6981 bSign = extractFloat128Sign( b ); 6982 if ( aSign != bSign ) { 6983 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6984 /* zero case */ 6985 return float_relation_equal; 6986 } else { 6987 return 1 - (2 * aSign); 6988 } 6989 } else { 6990 if (a.low == b.low && a.high == b.high) { 6991 return float_relation_equal; 6992 } else { 6993 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6994 } 6995 } 6996 } 6997 6998 int float128_compare(float128 a, float128 b, float_status *status) 6999 { 7000 return float128_compare_internal(a, b, 0, status); 7001 } 7002 7003 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7004 { 7005 return float128_compare_internal(a, b, 1, status); 7006 } 7007 7008 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7009 { 7010 flag aSign; 7011 int32_t aExp; 7012 uint64_t aSig; 7013 7014 if (floatx80_invalid_encoding(a)) { 7015 float_raise(float_flag_invalid, status); 7016 return floatx80_default_nan(status); 7017 } 7018 aSig = extractFloatx80Frac( a ); 7019 aExp = extractFloatx80Exp( a ); 7020 aSign = extractFloatx80Sign( a ); 7021 7022 if ( aExp == 0x7FFF ) { 7023 if ( aSig<<1 ) { 7024 return propagateFloatx80NaN(a, a, status); 7025 } 7026 return a; 7027 } 7028 7029 if (aExp == 0) { 7030 if (aSig == 0) { 7031 return a; 7032 } 7033 aExp++; 7034 } 7035 7036 if (n > 0x10000) { 7037 n = 0x10000; 7038 } else if (n < -0x10000) { 7039 n = -0x10000; 7040 } 7041 7042 aExp += n; 7043 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7044 aSign, aExp, aSig, 0, status); 7045 } 7046 7047 float128 float128_scalbn(float128 a, int n, float_status *status) 7048 { 7049 flag aSign; 7050 int32_t aExp; 7051 uint64_t aSig0, aSig1; 7052 7053 aSig1 = extractFloat128Frac1( a ); 7054 aSig0 = extractFloat128Frac0( a ); 7055 aExp = extractFloat128Exp( a ); 7056 aSign = extractFloat128Sign( a ); 7057 if ( aExp == 0x7FFF ) { 7058 if ( aSig0 | aSig1 ) { 7059 return propagateFloat128NaN(a, a, status); 7060 } 7061 return a; 7062 } 7063 if (aExp != 0) { 7064 aSig0 |= LIT64( 0x0001000000000000 ); 7065 } else if (aSig0 == 0 && aSig1 == 0) { 7066 return a; 7067 } else { 7068 aExp++; 7069 } 7070 7071 if (n > 0x10000) { 7072 n = 0x10000; 7073 } else if (n < -0x10000) { 7074 n = -0x10000; 7075 } 7076 7077 aExp += n - 1; 7078 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7079 , status); 7080 7081 } 7082