1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 case float_class_dnan: 1346 case float_class_msnan: 1347 return max; 1348 case float_class_inf: 1349 return p.sign ? min : max; 1350 case float_class_zero: 1351 return 0; 1352 case float_class_normal: 1353 if (p.exp < DECOMPOSED_BINARY_POINT) { 1354 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1355 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1356 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1357 } else { 1358 r = UINT64_MAX; 1359 } 1360 if (p.sign) { 1361 if (r < -(uint64_t) min) { 1362 return -r; 1363 } else { 1364 s->float_exception_flags = orig_flags | float_flag_invalid; 1365 return min; 1366 } 1367 } else { 1368 if (r < max) { 1369 return r; 1370 } else { 1371 s->float_exception_flags = orig_flags | float_flag_invalid; 1372 return max; 1373 } 1374 } 1375 default: 1376 g_assert_not_reached(); 1377 } 1378 } 1379 1380 #define FLOAT_TO_INT(fsz, isz) \ 1381 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1382 float_status *s) \ 1383 { \ 1384 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1385 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1386 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1387 s); \ 1388 } \ 1389 \ 1390 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1391 (float ## fsz a, float_status *s) \ 1392 { \ 1393 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1394 return round_to_int_and_pack(p, float_round_to_zero, \ 1395 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1396 s); \ 1397 } 1398 1399 FLOAT_TO_INT(16, 16) 1400 FLOAT_TO_INT(16, 32) 1401 FLOAT_TO_INT(16, 64) 1402 1403 FLOAT_TO_INT(32, 16) 1404 FLOAT_TO_INT(32, 32) 1405 FLOAT_TO_INT(32, 64) 1406 1407 FLOAT_TO_INT(64, 16) 1408 FLOAT_TO_INT(64, 32) 1409 FLOAT_TO_INT(64, 64) 1410 1411 #undef FLOAT_TO_INT 1412 1413 /* 1414 * Returns the result of converting the floating-point value `a' to 1415 * the unsigned integer format. The conversion is performed according 1416 * to the IEC/IEEE Standard for Binary Floating-Point 1417 * Arithmetic---which means in particular that the conversion is 1418 * rounded according to the current rounding mode. If `a' is a NaN, 1419 * the largest unsigned integer is returned. Otherwise, if the 1420 * conversion overflows, the largest unsigned integer is returned. If 1421 * the 'a' is negative, the result is rounded and zero is returned; 1422 * values that do not round to zero will raise the inexact exception 1423 * flag. 1424 */ 1425 1426 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1427 float_status *s) 1428 { 1429 int orig_flags = get_float_exception_flags(s); 1430 FloatParts p = round_to_int(in, rmode, s); 1431 1432 switch (p.cls) { 1433 case float_class_snan: 1434 case float_class_qnan: 1435 case float_class_dnan: 1436 case float_class_msnan: 1437 s->float_exception_flags = orig_flags | float_flag_invalid; 1438 return max; 1439 case float_class_inf: 1440 return p.sign ? 0 : max; 1441 case float_class_zero: 1442 return 0; 1443 case float_class_normal: 1444 { 1445 uint64_t r; 1446 if (p.sign) { 1447 s->float_exception_flags = orig_flags | float_flag_invalid; 1448 return 0; 1449 } 1450 1451 if (p.exp < DECOMPOSED_BINARY_POINT) { 1452 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1453 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1454 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1455 } else { 1456 s->float_exception_flags = orig_flags | float_flag_invalid; 1457 return max; 1458 } 1459 1460 /* For uint64 this will never trip, but if p.exp is too large 1461 * to shift a decomposed fraction we shall have exited via the 1462 * 3rd leg above. 1463 */ 1464 if (r > max) { 1465 s->float_exception_flags = orig_flags | float_flag_invalid; 1466 return max; 1467 } else { 1468 return r; 1469 } 1470 } 1471 default: 1472 g_assert_not_reached(); 1473 } 1474 } 1475 1476 #define FLOAT_TO_UINT(fsz, isz) \ 1477 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1478 float_status *s) \ 1479 { \ 1480 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1481 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1482 UINT ## isz ## _MAX, s); \ 1483 } \ 1484 \ 1485 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1486 (float ## fsz a, float_status *s) \ 1487 { \ 1488 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1489 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1490 UINT ## isz ## _MAX, s); \ 1491 } 1492 1493 FLOAT_TO_UINT(16, 16) 1494 FLOAT_TO_UINT(16, 32) 1495 FLOAT_TO_UINT(16, 64) 1496 1497 FLOAT_TO_UINT(32, 16) 1498 FLOAT_TO_UINT(32, 32) 1499 FLOAT_TO_UINT(32, 64) 1500 1501 FLOAT_TO_UINT(64, 16) 1502 FLOAT_TO_UINT(64, 32) 1503 FLOAT_TO_UINT(64, 64) 1504 1505 #undef FLOAT_TO_UINT 1506 1507 /* 1508 * Integer to float conversions 1509 * 1510 * Returns the result of converting the two's complement integer `a' 1511 * to the floating-point format. The conversion is performed according 1512 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1513 */ 1514 1515 static FloatParts int_to_float(int64_t a, float_status *status) 1516 { 1517 FloatParts r; 1518 if (a == 0) { 1519 r.cls = float_class_zero; 1520 r.sign = false; 1521 } else if (a == (1ULL << 63)) { 1522 r.cls = float_class_normal; 1523 r.sign = true; 1524 r.frac = DECOMPOSED_IMPLICIT_BIT; 1525 r.exp = 63; 1526 } else { 1527 uint64_t f; 1528 if (a < 0) { 1529 f = -a; 1530 r.sign = true; 1531 } else { 1532 f = a; 1533 r.sign = false; 1534 } 1535 int shift = clz64(f) - 1; 1536 r.cls = float_class_normal; 1537 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1538 r.frac = f << shift; 1539 } 1540 1541 return r; 1542 } 1543 1544 float16 int64_to_float16(int64_t a, float_status *status) 1545 { 1546 FloatParts pa = int_to_float(a, status); 1547 return float16_round_pack_canonical(pa, status); 1548 } 1549 1550 float16 int32_to_float16(int32_t a, float_status *status) 1551 { 1552 return int64_to_float16(a, status); 1553 } 1554 1555 float16 int16_to_float16(int16_t a, float_status *status) 1556 { 1557 return int64_to_float16(a, status); 1558 } 1559 1560 float32 int64_to_float32(int64_t a, float_status *status) 1561 { 1562 FloatParts pa = int_to_float(a, status); 1563 return float32_round_pack_canonical(pa, status); 1564 } 1565 1566 float32 int32_to_float32(int32_t a, float_status *status) 1567 { 1568 return int64_to_float32(a, status); 1569 } 1570 1571 float32 int16_to_float32(int16_t a, float_status *status) 1572 { 1573 return int64_to_float32(a, status); 1574 } 1575 1576 float64 int64_to_float64(int64_t a, float_status *status) 1577 { 1578 FloatParts pa = int_to_float(a, status); 1579 return float64_round_pack_canonical(pa, status); 1580 } 1581 1582 float64 int32_to_float64(int32_t a, float_status *status) 1583 { 1584 return int64_to_float64(a, status); 1585 } 1586 1587 float64 int16_to_float64(int16_t a, float_status *status) 1588 { 1589 return int64_to_float64(a, status); 1590 } 1591 1592 1593 /* 1594 * Unsigned Integer to float conversions 1595 * 1596 * Returns the result of converting the unsigned integer `a' to the 1597 * floating-point format. The conversion is performed according to the 1598 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1599 */ 1600 1601 static FloatParts uint_to_float(uint64_t a, float_status *status) 1602 { 1603 FloatParts r = { .sign = false}; 1604 1605 if (a == 0) { 1606 r.cls = float_class_zero; 1607 } else { 1608 int spare_bits = clz64(a) - 1; 1609 r.cls = float_class_normal; 1610 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1611 if (spare_bits < 0) { 1612 shift64RightJamming(a, -spare_bits, &a); 1613 r.frac = a; 1614 } else { 1615 r.frac = a << spare_bits; 1616 } 1617 } 1618 1619 return r; 1620 } 1621 1622 float16 uint64_to_float16(uint64_t a, float_status *status) 1623 { 1624 FloatParts pa = uint_to_float(a, status); 1625 return float16_round_pack_canonical(pa, status); 1626 } 1627 1628 float16 uint32_to_float16(uint32_t a, float_status *status) 1629 { 1630 return uint64_to_float16(a, status); 1631 } 1632 1633 float16 uint16_to_float16(uint16_t a, float_status *status) 1634 { 1635 return uint64_to_float16(a, status); 1636 } 1637 1638 float32 uint64_to_float32(uint64_t a, float_status *status) 1639 { 1640 FloatParts pa = uint_to_float(a, status); 1641 return float32_round_pack_canonical(pa, status); 1642 } 1643 1644 float32 uint32_to_float32(uint32_t a, float_status *status) 1645 { 1646 return uint64_to_float32(a, status); 1647 } 1648 1649 float32 uint16_to_float32(uint16_t a, float_status *status) 1650 { 1651 return uint64_to_float32(a, status); 1652 } 1653 1654 float64 uint64_to_float64(uint64_t a, float_status *status) 1655 { 1656 FloatParts pa = uint_to_float(a, status); 1657 return float64_round_pack_canonical(pa, status); 1658 } 1659 1660 float64 uint32_to_float64(uint32_t a, float_status *status) 1661 { 1662 return uint64_to_float64(a, status); 1663 } 1664 1665 float64 uint16_to_float64(uint16_t a, float_status *status) 1666 { 1667 return uint64_to_float64(a, status); 1668 } 1669 1670 /* Float Min/Max */ 1671 /* min() and max() functions. These can't be implemented as 1672 * 'compare and pick one input' because that would mishandle 1673 * NaNs and +0 vs -0. 1674 * 1675 * minnum() and maxnum() functions. These are similar to the min() 1676 * and max() functions but if one of the arguments is a QNaN and 1677 * the other is numerical then the numerical argument is returned. 1678 * SNaNs will get quietened before being returned. 1679 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1680 * and maxNum() operations. min() and max() are the typical min/max 1681 * semantics provided by many CPUs which predate that specification. 1682 * 1683 * minnummag() and maxnummag() functions correspond to minNumMag() 1684 * and minNumMag() from the IEEE-754 2008. 1685 */ 1686 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1687 bool ieee, bool ismag, float_status *s) 1688 { 1689 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1690 if (ieee) { 1691 /* Takes two floating-point values `a' and `b', one of 1692 * which is a NaN, and returns the appropriate NaN 1693 * result. If either `a' or `b' is a signaling NaN, 1694 * the invalid exception is raised. 1695 */ 1696 if (is_snan(a.cls) || is_snan(b.cls)) { 1697 return pick_nan(a, b, s); 1698 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1699 return b; 1700 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1701 return a; 1702 } 1703 } 1704 return pick_nan(a, b, s); 1705 } else { 1706 int a_exp, b_exp; 1707 bool a_sign, b_sign; 1708 1709 switch (a.cls) { 1710 case float_class_normal: 1711 a_exp = a.exp; 1712 break; 1713 case float_class_inf: 1714 a_exp = INT_MAX; 1715 break; 1716 case float_class_zero: 1717 a_exp = INT_MIN; 1718 break; 1719 default: 1720 g_assert_not_reached(); 1721 break; 1722 } 1723 switch (b.cls) { 1724 case float_class_normal: 1725 b_exp = b.exp; 1726 break; 1727 case float_class_inf: 1728 b_exp = INT_MAX; 1729 break; 1730 case float_class_zero: 1731 b_exp = INT_MIN; 1732 break; 1733 default: 1734 g_assert_not_reached(); 1735 break; 1736 } 1737 1738 a_sign = a.sign; 1739 b_sign = b.sign; 1740 if (ismag) { 1741 a_sign = b_sign = 0; 1742 } 1743 1744 if (a_sign == b_sign) { 1745 bool a_less = a_exp < b_exp; 1746 if (a_exp == b_exp) { 1747 a_less = a.frac < b.frac; 1748 } 1749 return a_sign ^ a_less ^ ismin ? b : a; 1750 } else { 1751 return a_sign ^ ismin ? b : a; 1752 } 1753 } 1754 } 1755 1756 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1757 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1758 float_status *s) \ 1759 { \ 1760 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1761 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1762 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1763 \ 1764 return float ## sz ## _round_pack_canonical(pr, s); \ 1765 } 1766 1767 MINMAX(16, min, true, false, false) 1768 MINMAX(16, minnum, true, true, false) 1769 MINMAX(16, minnummag, true, true, true) 1770 MINMAX(16, max, false, false, false) 1771 MINMAX(16, maxnum, false, true, false) 1772 MINMAX(16, maxnummag, false, true, true) 1773 1774 MINMAX(32, min, true, false, false) 1775 MINMAX(32, minnum, true, true, false) 1776 MINMAX(32, minnummag, true, true, true) 1777 MINMAX(32, max, false, false, false) 1778 MINMAX(32, maxnum, false, true, false) 1779 MINMAX(32, maxnummag, false, true, true) 1780 1781 MINMAX(64, min, true, false, false) 1782 MINMAX(64, minnum, true, true, false) 1783 MINMAX(64, minnummag, true, true, true) 1784 MINMAX(64, max, false, false, false) 1785 MINMAX(64, maxnum, false, true, false) 1786 MINMAX(64, maxnummag, false, true, true) 1787 1788 #undef MINMAX 1789 1790 /* Floating point compare */ 1791 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1792 float_status *s) 1793 { 1794 if (is_nan(a.cls) || is_nan(b.cls)) { 1795 if (!is_quiet || 1796 a.cls == float_class_snan || 1797 b.cls == float_class_snan) { 1798 s->float_exception_flags |= float_flag_invalid; 1799 } 1800 return float_relation_unordered; 1801 } 1802 1803 if (a.cls == float_class_zero) { 1804 if (b.cls == float_class_zero) { 1805 return float_relation_equal; 1806 } 1807 return b.sign ? float_relation_greater : float_relation_less; 1808 } else if (b.cls == float_class_zero) { 1809 return a.sign ? float_relation_less : float_relation_greater; 1810 } 1811 1812 /* The only really important thing about infinity is its sign. If 1813 * both are infinities the sign marks the smallest of the two. 1814 */ 1815 if (a.cls == float_class_inf) { 1816 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1817 return float_relation_equal; 1818 } 1819 return a.sign ? float_relation_less : float_relation_greater; 1820 } else if (b.cls == float_class_inf) { 1821 return b.sign ? float_relation_greater : float_relation_less; 1822 } 1823 1824 if (a.sign != b.sign) { 1825 return a.sign ? float_relation_less : float_relation_greater; 1826 } 1827 1828 if (a.exp == b.exp) { 1829 if (a.frac == b.frac) { 1830 return float_relation_equal; 1831 } 1832 if (a.sign) { 1833 return a.frac > b.frac ? 1834 float_relation_less : float_relation_greater; 1835 } else { 1836 return a.frac > b.frac ? 1837 float_relation_greater : float_relation_less; 1838 } 1839 } else { 1840 if (a.sign) { 1841 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1842 } else { 1843 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1844 } 1845 } 1846 } 1847 1848 #define COMPARE(sz) \ 1849 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1850 float_status *s) \ 1851 { \ 1852 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1853 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1854 return compare_floats(pa, pb, false, s); \ 1855 } \ 1856 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1857 float_status *s) \ 1858 { \ 1859 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1860 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1861 return compare_floats(pa, pb, true, s); \ 1862 } 1863 1864 COMPARE(16) 1865 COMPARE(32) 1866 COMPARE(64) 1867 1868 #undef COMPARE 1869 1870 /* Multiply A by 2 raised to the power N. */ 1871 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1872 { 1873 if (unlikely(is_nan(a.cls))) { 1874 return return_nan(a, s); 1875 } 1876 if (a.cls == float_class_normal) { 1877 a.exp += n; 1878 } 1879 return a; 1880 } 1881 1882 float16 float16_scalbn(float16 a, int n, float_status *status) 1883 { 1884 FloatParts pa = float16_unpack_canonical(a, status); 1885 FloatParts pr = scalbn_decomposed(pa, n, status); 1886 return float16_round_pack_canonical(pr, status); 1887 } 1888 1889 float32 float32_scalbn(float32 a, int n, float_status *status) 1890 { 1891 FloatParts pa = float32_unpack_canonical(a, status); 1892 FloatParts pr = scalbn_decomposed(pa, n, status); 1893 return float32_round_pack_canonical(pr, status); 1894 } 1895 1896 float64 float64_scalbn(float64 a, int n, float_status *status) 1897 { 1898 FloatParts pa = float64_unpack_canonical(a, status); 1899 FloatParts pr = scalbn_decomposed(pa, n, status); 1900 return float64_round_pack_canonical(pr, status); 1901 } 1902 1903 /* 1904 * Square Root 1905 * 1906 * The old softfloat code did an approximation step before zeroing in 1907 * on the final result. However for simpleness we just compute the 1908 * square root by iterating down from the implicit bit to enough extra 1909 * bits to ensure we get a correctly rounded result. 1910 * 1911 * This does mean however the calculation is slower than before, 1912 * especially for 64 bit floats. 1913 */ 1914 1915 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 1916 { 1917 uint64_t a_frac, r_frac, s_frac; 1918 int bit, last_bit; 1919 1920 if (is_nan(a.cls)) { 1921 return return_nan(a, s); 1922 } 1923 if (a.cls == float_class_zero) { 1924 return a; /* sqrt(+-0) = +-0 */ 1925 } 1926 if (a.sign) { 1927 s->float_exception_flags |= float_flag_invalid; 1928 a.cls = float_class_dnan; 1929 return a; 1930 } 1931 if (a.cls == float_class_inf) { 1932 return a; /* sqrt(+inf) = +inf */ 1933 } 1934 1935 assert(a.cls == float_class_normal); 1936 1937 /* We need two overflow bits at the top. Adding room for that is a 1938 * right shift. If the exponent is odd, we can discard the low bit 1939 * by multiplying the fraction by 2; that's a left shift. Combine 1940 * those and we shift right if the exponent is even. 1941 */ 1942 a_frac = a.frac; 1943 if (!(a.exp & 1)) { 1944 a_frac >>= 1; 1945 } 1946 a.exp >>= 1; 1947 1948 /* Bit-by-bit computation of sqrt. */ 1949 r_frac = 0; 1950 s_frac = 0; 1951 1952 /* Iterate from implicit bit down to the 3 extra bits to compute a 1953 * properly rounded result. Remember we've inserted one more bit 1954 * at the top, so these positions are one less. 1955 */ 1956 bit = DECOMPOSED_BINARY_POINT - 1; 1957 last_bit = MAX(p->frac_shift - 4, 0); 1958 do { 1959 uint64_t q = 1ULL << bit; 1960 uint64_t t_frac = s_frac + q; 1961 if (t_frac <= a_frac) { 1962 s_frac = t_frac + q; 1963 a_frac -= t_frac; 1964 r_frac += q; 1965 } 1966 a_frac <<= 1; 1967 } while (--bit >= last_bit); 1968 1969 /* Undo the right shift done above. If there is any remaining 1970 * fraction, the result is inexact. Set the sticky bit. 1971 */ 1972 a.frac = (r_frac << 1) + (a_frac != 0); 1973 1974 return a; 1975 } 1976 1977 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 1978 { 1979 FloatParts pa = float16_unpack_canonical(a, status); 1980 FloatParts pr = sqrt_float(pa, status, &float16_params); 1981 return float16_round_pack_canonical(pr, status); 1982 } 1983 1984 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 1985 { 1986 FloatParts pa = float32_unpack_canonical(a, status); 1987 FloatParts pr = sqrt_float(pa, status, &float32_params); 1988 return float32_round_pack_canonical(pr, status); 1989 } 1990 1991 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 1992 { 1993 FloatParts pa = float64_unpack_canonical(a, status); 1994 FloatParts pr = sqrt_float(pa, status, &float64_params); 1995 return float64_round_pack_canonical(pr, status); 1996 } 1997 1998 1999 /*---------------------------------------------------------------------------- 2000 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2001 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2002 | input. If `zSign' is 1, the input is negated before being converted to an 2003 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2004 | is simply rounded to an integer, with the inexact exception raised if the 2005 | input cannot be represented exactly as an integer. However, if the fixed- 2006 | point input is too large, the invalid exception is raised and the largest 2007 | positive or negative integer is returned. 2008 *----------------------------------------------------------------------------*/ 2009 2010 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2011 { 2012 int8_t roundingMode; 2013 flag roundNearestEven; 2014 int8_t roundIncrement, roundBits; 2015 int32_t z; 2016 2017 roundingMode = status->float_rounding_mode; 2018 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2019 switch (roundingMode) { 2020 case float_round_nearest_even: 2021 case float_round_ties_away: 2022 roundIncrement = 0x40; 2023 break; 2024 case float_round_to_zero: 2025 roundIncrement = 0; 2026 break; 2027 case float_round_up: 2028 roundIncrement = zSign ? 0 : 0x7f; 2029 break; 2030 case float_round_down: 2031 roundIncrement = zSign ? 0x7f : 0; 2032 break; 2033 default: 2034 abort(); 2035 } 2036 roundBits = absZ & 0x7F; 2037 absZ = ( absZ + roundIncrement )>>7; 2038 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2039 z = absZ; 2040 if ( zSign ) z = - z; 2041 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2042 float_raise(float_flag_invalid, status); 2043 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2044 } 2045 if (roundBits) { 2046 status->float_exception_flags |= float_flag_inexact; 2047 } 2048 return z; 2049 2050 } 2051 2052 /*---------------------------------------------------------------------------- 2053 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2054 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2055 | and returns the properly rounded 64-bit integer corresponding to the input. 2056 | If `zSign' is 1, the input is negated before being converted to an integer. 2057 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2058 | the inexact exception raised if the input cannot be represented exactly as 2059 | an integer. However, if the fixed-point input is too large, the invalid 2060 | exception is raised and the largest positive or negative integer is 2061 | returned. 2062 *----------------------------------------------------------------------------*/ 2063 2064 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2065 float_status *status) 2066 { 2067 int8_t roundingMode; 2068 flag roundNearestEven, increment; 2069 int64_t z; 2070 2071 roundingMode = status->float_rounding_mode; 2072 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2073 switch (roundingMode) { 2074 case float_round_nearest_even: 2075 case float_round_ties_away: 2076 increment = ((int64_t) absZ1 < 0); 2077 break; 2078 case float_round_to_zero: 2079 increment = 0; 2080 break; 2081 case float_round_up: 2082 increment = !zSign && absZ1; 2083 break; 2084 case float_round_down: 2085 increment = zSign && absZ1; 2086 break; 2087 default: 2088 abort(); 2089 } 2090 if ( increment ) { 2091 ++absZ0; 2092 if ( absZ0 == 0 ) goto overflow; 2093 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2094 } 2095 z = absZ0; 2096 if ( zSign ) z = - z; 2097 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2098 overflow: 2099 float_raise(float_flag_invalid, status); 2100 return 2101 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2102 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2103 } 2104 if (absZ1) { 2105 status->float_exception_flags |= float_flag_inexact; 2106 } 2107 return z; 2108 2109 } 2110 2111 /*---------------------------------------------------------------------------- 2112 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2113 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2114 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2115 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2116 | with the inexact exception raised if the input cannot be represented exactly 2117 | as an integer. However, if the fixed-point input is too large, the invalid 2118 | exception is raised and the largest unsigned integer is returned. 2119 *----------------------------------------------------------------------------*/ 2120 2121 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2122 uint64_t absZ1, float_status *status) 2123 { 2124 int8_t roundingMode; 2125 flag roundNearestEven, increment; 2126 2127 roundingMode = status->float_rounding_mode; 2128 roundNearestEven = (roundingMode == float_round_nearest_even); 2129 switch (roundingMode) { 2130 case float_round_nearest_even: 2131 case float_round_ties_away: 2132 increment = ((int64_t)absZ1 < 0); 2133 break; 2134 case float_round_to_zero: 2135 increment = 0; 2136 break; 2137 case float_round_up: 2138 increment = !zSign && absZ1; 2139 break; 2140 case float_round_down: 2141 increment = zSign && absZ1; 2142 break; 2143 default: 2144 abort(); 2145 } 2146 if (increment) { 2147 ++absZ0; 2148 if (absZ0 == 0) { 2149 float_raise(float_flag_invalid, status); 2150 return LIT64(0xFFFFFFFFFFFFFFFF); 2151 } 2152 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2153 } 2154 2155 if (zSign && absZ0) { 2156 float_raise(float_flag_invalid, status); 2157 return 0; 2158 } 2159 2160 if (absZ1) { 2161 status->float_exception_flags |= float_flag_inexact; 2162 } 2163 return absZ0; 2164 } 2165 2166 /*---------------------------------------------------------------------------- 2167 | If `a' is denormal and we are in flush-to-zero mode then set the 2168 | input-denormal exception and return zero. Otherwise just return the value. 2169 *----------------------------------------------------------------------------*/ 2170 float32 float32_squash_input_denormal(float32 a, float_status *status) 2171 { 2172 if (status->flush_inputs_to_zero) { 2173 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2174 float_raise(float_flag_input_denormal, status); 2175 return make_float32(float32_val(a) & 0x80000000); 2176 } 2177 } 2178 return a; 2179 } 2180 2181 /*---------------------------------------------------------------------------- 2182 | Normalizes the subnormal single-precision floating-point value represented 2183 | by the denormalized significand `aSig'. The normalized exponent and 2184 | significand are stored at the locations pointed to by `zExpPtr' and 2185 | `zSigPtr', respectively. 2186 *----------------------------------------------------------------------------*/ 2187 2188 static void 2189 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2190 { 2191 int8_t shiftCount; 2192 2193 shiftCount = countLeadingZeros32( aSig ) - 8; 2194 *zSigPtr = aSig<<shiftCount; 2195 *zExpPtr = 1 - shiftCount; 2196 2197 } 2198 2199 /*---------------------------------------------------------------------------- 2200 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2201 | and significand `zSig', and returns the proper single-precision floating- 2202 | point value corresponding to the abstract input. Ordinarily, the abstract 2203 | value is simply rounded and packed into the single-precision format, with 2204 | the inexact exception raised if the abstract input cannot be represented 2205 | exactly. However, if the abstract value is too large, the overflow and 2206 | inexact exceptions are raised and an infinity or maximal finite value is 2207 | returned. If the abstract value is too small, the input value is rounded to 2208 | a subnormal number, and the underflow and inexact exceptions are raised if 2209 | the abstract input cannot be represented exactly as a subnormal single- 2210 | precision floating-point number. 2211 | The input significand `zSig' has its binary point between bits 30 2212 | and 29, which is 7 bits to the left of the usual location. This shifted 2213 | significand must be normalized or smaller. If `zSig' is not normalized, 2214 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2215 | and it must not require rounding. In the usual case that `zSig' is 2216 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2217 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2218 | Binary Floating-Point Arithmetic. 2219 *----------------------------------------------------------------------------*/ 2220 2221 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2222 float_status *status) 2223 { 2224 int8_t roundingMode; 2225 flag roundNearestEven; 2226 int8_t roundIncrement, roundBits; 2227 flag isTiny; 2228 2229 roundingMode = status->float_rounding_mode; 2230 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2231 switch (roundingMode) { 2232 case float_round_nearest_even: 2233 case float_round_ties_away: 2234 roundIncrement = 0x40; 2235 break; 2236 case float_round_to_zero: 2237 roundIncrement = 0; 2238 break; 2239 case float_round_up: 2240 roundIncrement = zSign ? 0 : 0x7f; 2241 break; 2242 case float_round_down: 2243 roundIncrement = zSign ? 0x7f : 0; 2244 break; 2245 default: 2246 abort(); 2247 break; 2248 } 2249 roundBits = zSig & 0x7F; 2250 if ( 0xFD <= (uint16_t) zExp ) { 2251 if ( ( 0xFD < zExp ) 2252 || ( ( zExp == 0xFD ) 2253 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2254 ) { 2255 float_raise(float_flag_overflow | float_flag_inexact, status); 2256 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2257 } 2258 if ( zExp < 0 ) { 2259 if (status->flush_to_zero) { 2260 float_raise(float_flag_output_denormal, status); 2261 return packFloat32(zSign, 0, 0); 2262 } 2263 isTiny = 2264 (status->float_detect_tininess 2265 == float_tininess_before_rounding) 2266 || ( zExp < -1 ) 2267 || ( zSig + roundIncrement < 0x80000000 ); 2268 shift32RightJamming( zSig, - zExp, &zSig ); 2269 zExp = 0; 2270 roundBits = zSig & 0x7F; 2271 if (isTiny && roundBits) { 2272 float_raise(float_flag_underflow, status); 2273 } 2274 } 2275 } 2276 if (roundBits) { 2277 status->float_exception_flags |= float_flag_inexact; 2278 } 2279 zSig = ( zSig + roundIncrement )>>7; 2280 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2281 if ( zSig == 0 ) zExp = 0; 2282 return packFloat32( zSign, zExp, zSig ); 2283 2284 } 2285 2286 /*---------------------------------------------------------------------------- 2287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2288 | and significand `zSig', and returns the proper single-precision floating- 2289 | point value corresponding to the abstract input. This routine is just like 2290 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2291 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2292 | floating-point exponent. 2293 *----------------------------------------------------------------------------*/ 2294 2295 static float32 2296 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2297 float_status *status) 2298 { 2299 int8_t shiftCount; 2300 2301 shiftCount = countLeadingZeros32( zSig ) - 1; 2302 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2303 status); 2304 2305 } 2306 2307 /*---------------------------------------------------------------------------- 2308 | If `a' is denormal and we are in flush-to-zero mode then set the 2309 | input-denormal exception and return zero. Otherwise just return the value. 2310 *----------------------------------------------------------------------------*/ 2311 float64 float64_squash_input_denormal(float64 a, float_status *status) 2312 { 2313 if (status->flush_inputs_to_zero) { 2314 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2315 float_raise(float_flag_input_denormal, status); 2316 return make_float64(float64_val(a) & (1ULL << 63)); 2317 } 2318 } 2319 return a; 2320 } 2321 2322 /*---------------------------------------------------------------------------- 2323 | Normalizes the subnormal double-precision floating-point value represented 2324 | by the denormalized significand `aSig'. The normalized exponent and 2325 | significand are stored at the locations pointed to by `zExpPtr' and 2326 | `zSigPtr', respectively. 2327 *----------------------------------------------------------------------------*/ 2328 2329 static void 2330 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2331 { 2332 int8_t shiftCount; 2333 2334 shiftCount = countLeadingZeros64( aSig ) - 11; 2335 *zSigPtr = aSig<<shiftCount; 2336 *zExpPtr = 1 - shiftCount; 2337 2338 } 2339 2340 /*---------------------------------------------------------------------------- 2341 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2342 | double-precision floating-point value, returning the result. After being 2343 | shifted into the proper positions, the three fields are simply added 2344 | together to form the result. This means that any integer portion of `zSig' 2345 | will be added into the exponent. Since a properly normalized significand 2346 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2347 | than the desired result exponent whenever `zSig' is a complete, normalized 2348 | significand. 2349 *----------------------------------------------------------------------------*/ 2350 2351 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2352 { 2353 2354 return make_float64( 2355 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2356 2357 } 2358 2359 /*---------------------------------------------------------------------------- 2360 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2361 | and significand `zSig', and returns the proper double-precision floating- 2362 | point value corresponding to the abstract input. Ordinarily, the abstract 2363 | value is simply rounded and packed into the double-precision format, with 2364 | the inexact exception raised if the abstract input cannot be represented 2365 | exactly. However, if the abstract value is too large, the overflow and 2366 | inexact exceptions are raised and an infinity or maximal finite value is 2367 | returned. If the abstract value is too small, the input value is rounded to 2368 | a subnormal number, and the underflow and inexact exceptions are raised if 2369 | the abstract input cannot be represented exactly as a subnormal double- 2370 | precision floating-point number. 2371 | The input significand `zSig' has its binary point between bits 62 2372 | and 61, which is 10 bits to the left of the usual location. This shifted 2373 | significand must be normalized or smaller. If `zSig' is not normalized, 2374 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2375 | and it must not require rounding. In the usual case that `zSig' is 2376 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2377 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2378 | Binary Floating-Point Arithmetic. 2379 *----------------------------------------------------------------------------*/ 2380 2381 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2382 float_status *status) 2383 { 2384 int8_t roundingMode; 2385 flag roundNearestEven; 2386 int roundIncrement, roundBits; 2387 flag isTiny; 2388 2389 roundingMode = status->float_rounding_mode; 2390 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2391 switch (roundingMode) { 2392 case float_round_nearest_even: 2393 case float_round_ties_away: 2394 roundIncrement = 0x200; 2395 break; 2396 case float_round_to_zero: 2397 roundIncrement = 0; 2398 break; 2399 case float_round_up: 2400 roundIncrement = zSign ? 0 : 0x3ff; 2401 break; 2402 case float_round_down: 2403 roundIncrement = zSign ? 0x3ff : 0; 2404 break; 2405 case float_round_to_odd: 2406 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2407 break; 2408 default: 2409 abort(); 2410 } 2411 roundBits = zSig & 0x3FF; 2412 if ( 0x7FD <= (uint16_t) zExp ) { 2413 if ( ( 0x7FD < zExp ) 2414 || ( ( zExp == 0x7FD ) 2415 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2416 ) { 2417 bool overflow_to_inf = roundingMode != float_round_to_odd && 2418 roundIncrement != 0; 2419 float_raise(float_flag_overflow | float_flag_inexact, status); 2420 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2421 } 2422 if ( zExp < 0 ) { 2423 if (status->flush_to_zero) { 2424 float_raise(float_flag_output_denormal, status); 2425 return packFloat64(zSign, 0, 0); 2426 } 2427 isTiny = 2428 (status->float_detect_tininess 2429 == float_tininess_before_rounding) 2430 || ( zExp < -1 ) 2431 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2432 shift64RightJamming( zSig, - zExp, &zSig ); 2433 zExp = 0; 2434 roundBits = zSig & 0x3FF; 2435 if (isTiny && roundBits) { 2436 float_raise(float_flag_underflow, status); 2437 } 2438 if (roundingMode == float_round_to_odd) { 2439 /* 2440 * For round-to-odd case, the roundIncrement depends on 2441 * zSig which just changed. 2442 */ 2443 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2444 } 2445 } 2446 } 2447 if (roundBits) { 2448 status->float_exception_flags |= float_flag_inexact; 2449 } 2450 zSig = ( zSig + roundIncrement )>>10; 2451 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2452 if ( zSig == 0 ) zExp = 0; 2453 return packFloat64( zSign, zExp, zSig ); 2454 2455 } 2456 2457 /*---------------------------------------------------------------------------- 2458 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2459 | and significand `zSig', and returns the proper double-precision floating- 2460 | point value corresponding to the abstract input. This routine is just like 2461 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2462 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2463 | floating-point exponent. 2464 *----------------------------------------------------------------------------*/ 2465 2466 static float64 2467 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2468 float_status *status) 2469 { 2470 int8_t shiftCount; 2471 2472 shiftCount = countLeadingZeros64( zSig ) - 1; 2473 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2474 status); 2475 2476 } 2477 2478 /*---------------------------------------------------------------------------- 2479 | Normalizes the subnormal extended double-precision floating-point value 2480 | represented by the denormalized significand `aSig'. The normalized exponent 2481 | and significand are stored at the locations pointed to by `zExpPtr' and 2482 | `zSigPtr', respectively. 2483 *----------------------------------------------------------------------------*/ 2484 2485 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2486 uint64_t *zSigPtr) 2487 { 2488 int8_t shiftCount; 2489 2490 shiftCount = countLeadingZeros64( aSig ); 2491 *zSigPtr = aSig<<shiftCount; 2492 *zExpPtr = 1 - shiftCount; 2493 } 2494 2495 /*---------------------------------------------------------------------------- 2496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2497 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2498 | and returns the proper extended double-precision floating-point value 2499 | corresponding to the abstract input. Ordinarily, the abstract value is 2500 | rounded and packed into the extended double-precision format, with the 2501 | inexact exception raised if the abstract input cannot be represented 2502 | exactly. However, if the abstract value is too large, the overflow and 2503 | inexact exceptions are raised and an infinity or maximal finite value is 2504 | returned. If the abstract value is too small, the input value is rounded to 2505 | a subnormal number, and the underflow and inexact exceptions are raised if 2506 | the abstract input cannot be represented exactly as a subnormal extended 2507 | double-precision floating-point number. 2508 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2509 | number of bits as single or double precision, respectively. Otherwise, the 2510 | result is rounded to the full precision of the extended double-precision 2511 | format. 2512 | The input significand must be normalized or smaller. If the input 2513 | significand is not normalized, `zExp' must be 0; in that case, the result 2514 | returned is a subnormal number, and it must not require rounding. The 2515 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2516 | Floating-Point Arithmetic. 2517 *----------------------------------------------------------------------------*/ 2518 2519 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2520 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2521 float_status *status) 2522 { 2523 int8_t roundingMode; 2524 flag roundNearestEven, increment, isTiny; 2525 int64_t roundIncrement, roundMask, roundBits; 2526 2527 roundingMode = status->float_rounding_mode; 2528 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2529 if ( roundingPrecision == 80 ) goto precision80; 2530 if ( roundingPrecision == 64 ) { 2531 roundIncrement = LIT64( 0x0000000000000400 ); 2532 roundMask = LIT64( 0x00000000000007FF ); 2533 } 2534 else if ( roundingPrecision == 32 ) { 2535 roundIncrement = LIT64( 0x0000008000000000 ); 2536 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2537 } 2538 else { 2539 goto precision80; 2540 } 2541 zSig0 |= ( zSig1 != 0 ); 2542 switch (roundingMode) { 2543 case float_round_nearest_even: 2544 case float_round_ties_away: 2545 break; 2546 case float_round_to_zero: 2547 roundIncrement = 0; 2548 break; 2549 case float_round_up: 2550 roundIncrement = zSign ? 0 : roundMask; 2551 break; 2552 case float_round_down: 2553 roundIncrement = zSign ? roundMask : 0; 2554 break; 2555 default: 2556 abort(); 2557 } 2558 roundBits = zSig0 & roundMask; 2559 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2560 if ( ( 0x7FFE < zExp ) 2561 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2562 ) { 2563 goto overflow; 2564 } 2565 if ( zExp <= 0 ) { 2566 if (status->flush_to_zero) { 2567 float_raise(float_flag_output_denormal, status); 2568 return packFloatx80(zSign, 0, 0); 2569 } 2570 isTiny = 2571 (status->float_detect_tininess 2572 == float_tininess_before_rounding) 2573 || ( zExp < 0 ) 2574 || ( zSig0 <= zSig0 + roundIncrement ); 2575 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2576 zExp = 0; 2577 roundBits = zSig0 & roundMask; 2578 if (isTiny && roundBits) { 2579 float_raise(float_flag_underflow, status); 2580 } 2581 if (roundBits) { 2582 status->float_exception_flags |= float_flag_inexact; 2583 } 2584 zSig0 += roundIncrement; 2585 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2586 roundIncrement = roundMask + 1; 2587 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2588 roundMask |= roundIncrement; 2589 } 2590 zSig0 &= ~ roundMask; 2591 return packFloatx80( zSign, zExp, zSig0 ); 2592 } 2593 } 2594 if (roundBits) { 2595 status->float_exception_flags |= float_flag_inexact; 2596 } 2597 zSig0 += roundIncrement; 2598 if ( zSig0 < roundIncrement ) { 2599 ++zExp; 2600 zSig0 = LIT64( 0x8000000000000000 ); 2601 } 2602 roundIncrement = roundMask + 1; 2603 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2604 roundMask |= roundIncrement; 2605 } 2606 zSig0 &= ~ roundMask; 2607 if ( zSig0 == 0 ) zExp = 0; 2608 return packFloatx80( zSign, zExp, zSig0 ); 2609 precision80: 2610 switch (roundingMode) { 2611 case float_round_nearest_even: 2612 case float_round_ties_away: 2613 increment = ((int64_t)zSig1 < 0); 2614 break; 2615 case float_round_to_zero: 2616 increment = 0; 2617 break; 2618 case float_round_up: 2619 increment = !zSign && zSig1; 2620 break; 2621 case float_round_down: 2622 increment = zSign && zSig1; 2623 break; 2624 default: 2625 abort(); 2626 } 2627 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2628 if ( ( 0x7FFE < zExp ) 2629 || ( ( zExp == 0x7FFE ) 2630 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2631 && increment 2632 ) 2633 ) { 2634 roundMask = 0; 2635 overflow: 2636 float_raise(float_flag_overflow | float_flag_inexact, status); 2637 if ( ( roundingMode == float_round_to_zero ) 2638 || ( zSign && ( roundingMode == float_round_up ) ) 2639 || ( ! zSign && ( roundingMode == float_round_down ) ) 2640 ) { 2641 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2642 } 2643 return packFloatx80(zSign, 2644 floatx80_infinity_high, 2645 floatx80_infinity_low); 2646 } 2647 if ( zExp <= 0 ) { 2648 isTiny = 2649 (status->float_detect_tininess 2650 == float_tininess_before_rounding) 2651 || ( zExp < 0 ) 2652 || ! increment 2653 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2654 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2655 zExp = 0; 2656 if (isTiny && zSig1) { 2657 float_raise(float_flag_underflow, status); 2658 } 2659 if (zSig1) { 2660 status->float_exception_flags |= float_flag_inexact; 2661 } 2662 switch (roundingMode) { 2663 case float_round_nearest_even: 2664 case float_round_ties_away: 2665 increment = ((int64_t)zSig1 < 0); 2666 break; 2667 case float_round_to_zero: 2668 increment = 0; 2669 break; 2670 case float_round_up: 2671 increment = !zSign && zSig1; 2672 break; 2673 case float_round_down: 2674 increment = zSign && zSig1; 2675 break; 2676 default: 2677 abort(); 2678 } 2679 if ( increment ) { 2680 ++zSig0; 2681 zSig0 &= 2682 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2683 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2684 } 2685 return packFloatx80( zSign, zExp, zSig0 ); 2686 } 2687 } 2688 if (zSig1) { 2689 status->float_exception_flags |= float_flag_inexact; 2690 } 2691 if ( increment ) { 2692 ++zSig0; 2693 if ( zSig0 == 0 ) { 2694 ++zExp; 2695 zSig0 = LIT64( 0x8000000000000000 ); 2696 } 2697 else { 2698 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2699 } 2700 } 2701 else { 2702 if ( zSig0 == 0 ) zExp = 0; 2703 } 2704 return packFloatx80( zSign, zExp, zSig0 ); 2705 2706 } 2707 2708 /*---------------------------------------------------------------------------- 2709 | Takes an abstract floating-point value having sign `zSign', exponent 2710 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2711 | and returns the proper extended double-precision floating-point value 2712 | corresponding to the abstract input. This routine is just like 2713 | `roundAndPackFloatx80' except that the input significand does not have to be 2714 | normalized. 2715 *----------------------------------------------------------------------------*/ 2716 2717 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2718 flag zSign, int32_t zExp, 2719 uint64_t zSig0, uint64_t zSig1, 2720 float_status *status) 2721 { 2722 int8_t shiftCount; 2723 2724 if ( zSig0 == 0 ) { 2725 zSig0 = zSig1; 2726 zSig1 = 0; 2727 zExp -= 64; 2728 } 2729 shiftCount = countLeadingZeros64( zSig0 ); 2730 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2731 zExp -= shiftCount; 2732 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2733 zSig0, zSig1, status); 2734 2735 } 2736 2737 /*---------------------------------------------------------------------------- 2738 | Returns the least-significant 64 fraction bits of the quadruple-precision 2739 | floating-point value `a'. 2740 *----------------------------------------------------------------------------*/ 2741 2742 static inline uint64_t extractFloat128Frac1( float128 a ) 2743 { 2744 2745 return a.low; 2746 2747 } 2748 2749 /*---------------------------------------------------------------------------- 2750 | Returns the most-significant 48 fraction bits of the quadruple-precision 2751 | floating-point value `a'. 2752 *----------------------------------------------------------------------------*/ 2753 2754 static inline uint64_t extractFloat128Frac0( float128 a ) 2755 { 2756 2757 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2758 2759 } 2760 2761 /*---------------------------------------------------------------------------- 2762 | Returns the exponent bits of the quadruple-precision floating-point value 2763 | `a'. 2764 *----------------------------------------------------------------------------*/ 2765 2766 static inline int32_t extractFloat128Exp( float128 a ) 2767 { 2768 2769 return ( a.high>>48 ) & 0x7FFF; 2770 2771 } 2772 2773 /*---------------------------------------------------------------------------- 2774 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2775 *----------------------------------------------------------------------------*/ 2776 2777 static inline flag extractFloat128Sign( float128 a ) 2778 { 2779 2780 return a.high>>63; 2781 2782 } 2783 2784 /*---------------------------------------------------------------------------- 2785 | Normalizes the subnormal quadruple-precision floating-point value 2786 | represented by the denormalized significand formed by the concatenation of 2787 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2788 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2789 | significand are stored at the location pointed to by `zSig0Ptr', and the 2790 | least significant 64 bits of the normalized significand are stored at the 2791 | location pointed to by `zSig1Ptr'. 2792 *----------------------------------------------------------------------------*/ 2793 2794 static void 2795 normalizeFloat128Subnormal( 2796 uint64_t aSig0, 2797 uint64_t aSig1, 2798 int32_t *zExpPtr, 2799 uint64_t *zSig0Ptr, 2800 uint64_t *zSig1Ptr 2801 ) 2802 { 2803 int8_t shiftCount; 2804 2805 if ( aSig0 == 0 ) { 2806 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2807 if ( shiftCount < 0 ) { 2808 *zSig0Ptr = aSig1>>( - shiftCount ); 2809 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2810 } 2811 else { 2812 *zSig0Ptr = aSig1<<shiftCount; 2813 *zSig1Ptr = 0; 2814 } 2815 *zExpPtr = - shiftCount - 63; 2816 } 2817 else { 2818 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2819 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2820 *zExpPtr = 1 - shiftCount; 2821 } 2822 2823 } 2824 2825 /*---------------------------------------------------------------------------- 2826 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2827 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2828 | floating-point value, returning the result. After being shifted into the 2829 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2830 | added together to form the most significant 32 bits of the result. This 2831 | means that any integer portion of `zSig0' will be added into the exponent. 2832 | Since a properly normalized significand will have an integer portion equal 2833 | to 1, the `zExp' input should be 1 less than the desired result exponent 2834 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2835 | significand. 2836 *----------------------------------------------------------------------------*/ 2837 2838 static inline float128 2839 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2840 { 2841 float128 z; 2842 2843 z.low = zSig1; 2844 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2845 return z; 2846 2847 } 2848 2849 /*---------------------------------------------------------------------------- 2850 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2851 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2852 | and `zSig2', and returns the proper quadruple-precision floating-point value 2853 | corresponding to the abstract input. Ordinarily, the abstract value is 2854 | simply rounded and packed into the quadruple-precision format, with the 2855 | inexact exception raised if the abstract input cannot be represented 2856 | exactly. However, if the abstract value is too large, the overflow and 2857 | inexact exceptions are raised and an infinity or maximal finite value is 2858 | returned. If the abstract value is too small, the input value is rounded to 2859 | a subnormal number, and the underflow and inexact exceptions are raised if 2860 | the abstract input cannot be represented exactly as a subnormal quadruple- 2861 | precision floating-point number. 2862 | The input significand must be normalized or smaller. If the input 2863 | significand is not normalized, `zExp' must be 0; in that case, the result 2864 | returned is a subnormal number, and it must not require rounding. In the 2865 | usual case that the input significand is normalized, `zExp' must be 1 less 2866 | than the ``true'' floating-point exponent. The handling of underflow and 2867 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2868 *----------------------------------------------------------------------------*/ 2869 2870 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2871 uint64_t zSig0, uint64_t zSig1, 2872 uint64_t zSig2, float_status *status) 2873 { 2874 int8_t roundingMode; 2875 flag roundNearestEven, increment, isTiny; 2876 2877 roundingMode = status->float_rounding_mode; 2878 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2879 switch (roundingMode) { 2880 case float_round_nearest_even: 2881 case float_round_ties_away: 2882 increment = ((int64_t)zSig2 < 0); 2883 break; 2884 case float_round_to_zero: 2885 increment = 0; 2886 break; 2887 case float_round_up: 2888 increment = !zSign && zSig2; 2889 break; 2890 case float_round_down: 2891 increment = zSign && zSig2; 2892 break; 2893 case float_round_to_odd: 2894 increment = !(zSig1 & 0x1) && zSig2; 2895 break; 2896 default: 2897 abort(); 2898 } 2899 if ( 0x7FFD <= (uint32_t) zExp ) { 2900 if ( ( 0x7FFD < zExp ) 2901 || ( ( zExp == 0x7FFD ) 2902 && eq128( 2903 LIT64( 0x0001FFFFFFFFFFFF ), 2904 LIT64( 0xFFFFFFFFFFFFFFFF ), 2905 zSig0, 2906 zSig1 2907 ) 2908 && increment 2909 ) 2910 ) { 2911 float_raise(float_flag_overflow | float_flag_inexact, status); 2912 if ( ( roundingMode == float_round_to_zero ) 2913 || ( zSign && ( roundingMode == float_round_up ) ) 2914 || ( ! zSign && ( roundingMode == float_round_down ) ) 2915 || (roundingMode == float_round_to_odd) 2916 ) { 2917 return 2918 packFloat128( 2919 zSign, 2920 0x7FFE, 2921 LIT64( 0x0000FFFFFFFFFFFF ), 2922 LIT64( 0xFFFFFFFFFFFFFFFF ) 2923 ); 2924 } 2925 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2926 } 2927 if ( zExp < 0 ) { 2928 if (status->flush_to_zero) { 2929 float_raise(float_flag_output_denormal, status); 2930 return packFloat128(zSign, 0, 0, 0); 2931 } 2932 isTiny = 2933 (status->float_detect_tininess 2934 == float_tininess_before_rounding) 2935 || ( zExp < -1 ) 2936 || ! increment 2937 || lt128( 2938 zSig0, 2939 zSig1, 2940 LIT64( 0x0001FFFFFFFFFFFF ), 2941 LIT64( 0xFFFFFFFFFFFFFFFF ) 2942 ); 2943 shift128ExtraRightJamming( 2944 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2945 zExp = 0; 2946 if (isTiny && zSig2) { 2947 float_raise(float_flag_underflow, status); 2948 } 2949 switch (roundingMode) { 2950 case float_round_nearest_even: 2951 case float_round_ties_away: 2952 increment = ((int64_t)zSig2 < 0); 2953 break; 2954 case float_round_to_zero: 2955 increment = 0; 2956 break; 2957 case float_round_up: 2958 increment = !zSign && zSig2; 2959 break; 2960 case float_round_down: 2961 increment = zSign && zSig2; 2962 break; 2963 case float_round_to_odd: 2964 increment = !(zSig1 & 0x1) && zSig2; 2965 break; 2966 default: 2967 abort(); 2968 } 2969 } 2970 } 2971 if (zSig2) { 2972 status->float_exception_flags |= float_flag_inexact; 2973 } 2974 if ( increment ) { 2975 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2976 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2977 } 2978 else { 2979 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2980 } 2981 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2982 2983 } 2984 2985 /*---------------------------------------------------------------------------- 2986 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2987 | and significand formed by the concatenation of `zSig0' and `zSig1', and 2988 | returns the proper quadruple-precision floating-point value corresponding 2989 | to the abstract input. This routine is just like `roundAndPackFloat128' 2990 | except that the input significand has fewer bits and does not have to be 2991 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 2992 | point exponent. 2993 *----------------------------------------------------------------------------*/ 2994 2995 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2996 uint64_t zSig0, uint64_t zSig1, 2997 float_status *status) 2998 { 2999 int8_t shiftCount; 3000 uint64_t zSig2; 3001 3002 if ( zSig0 == 0 ) { 3003 zSig0 = zSig1; 3004 zSig1 = 0; 3005 zExp -= 64; 3006 } 3007 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3008 if ( 0 <= shiftCount ) { 3009 zSig2 = 0; 3010 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3011 } 3012 else { 3013 shift128ExtraRightJamming( 3014 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3015 } 3016 zExp -= shiftCount; 3017 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3018 3019 } 3020 3021 3022 /*---------------------------------------------------------------------------- 3023 | Returns the result of converting the 32-bit two's complement integer `a' 3024 | to the extended double-precision floating-point format. The conversion 3025 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3026 | Arithmetic. 3027 *----------------------------------------------------------------------------*/ 3028 3029 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3030 { 3031 flag zSign; 3032 uint32_t absA; 3033 int8_t shiftCount; 3034 uint64_t zSig; 3035 3036 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3037 zSign = ( a < 0 ); 3038 absA = zSign ? - a : a; 3039 shiftCount = countLeadingZeros32( absA ) + 32; 3040 zSig = absA; 3041 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3042 3043 } 3044 3045 /*---------------------------------------------------------------------------- 3046 | Returns the result of converting the 32-bit two's complement integer `a' to 3047 | the quadruple-precision floating-point format. The conversion is performed 3048 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3049 *----------------------------------------------------------------------------*/ 3050 3051 float128 int32_to_float128(int32_t a, float_status *status) 3052 { 3053 flag zSign; 3054 uint32_t absA; 3055 int8_t shiftCount; 3056 uint64_t zSig0; 3057 3058 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3059 zSign = ( a < 0 ); 3060 absA = zSign ? - a : a; 3061 shiftCount = countLeadingZeros32( absA ) + 17; 3062 zSig0 = absA; 3063 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3064 3065 } 3066 3067 /*---------------------------------------------------------------------------- 3068 | Returns the result of converting the 64-bit two's complement integer `a' 3069 | to the extended double-precision floating-point format. The conversion 3070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3071 | Arithmetic. 3072 *----------------------------------------------------------------------------*/ 3073 3074 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3075 { 3076 flag zSign; 3077 uint64_t absA; 3078 int8_t shiftCount; 3079 3080 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3081 zSign = ( a < 0 ); 3082 absA = zSign ? - a : a; 3083 shiftCount = countLeadingZeros64( absA ); 3084 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3085 3086 } 3087 3088 /*---------------------------------------------------------------------------- 3089 | Returns the result of converting the 64-bit two's complement integer `a' to 3090 | the quadruple-precision floating-point format. The conversion is performed 3091 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3092 *----------------------------------------------------------------------------*/ 3093 3094 float128 int64_to_float128(int64_t a, float_status *status) 3095 { 3096 flag zSign; 3097 uint64_t absA; 3098 int8_t shiftCount; 3099 int32_t zExp; 3100 uint64_t zSig0, zSig1; 3101 3102 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3103 zSign = ( a < 0 ); 3104 absA = zSign ? - a : a; 3105 shiftCount = countLeadingZeros64( absA ) + 49; 3106 zExp = 0x406E - shiftCount; 3107 if ( 64 <= shiftCount ) { 3108 zSig1 = 0; 3109 zSig0 = absA; 3110 shiftCount -= 64; 3111 } 3112 else { 3113 zSig1 = absA; 3114 zSig0 = 0; 3115 } 3116 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3117 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3118 3119 } 3120 3121 /*---------------------------------------------------------------------------- 3122 | Returns the result of converting the 64-bit unsigned integer `a' 3123 | to the quadruple-precision floating-point format. The conversion is performed 3124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3125 *----------------------------------------------------------------------------*/ 3126 3127 float128 uint64_to_float128(uint64_t a, float_status *status) 3128 { 3129 if (a == 0) { 3130 return float128_zero; 3131 } 3132 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3133 } 3134 3135 3136 3137 3138 /*---------------------------------------------------------------------------- 3139 | Returns the result of converting the single-precision floating-point value 3140 | `a' to the double-precision floating-point format. The conversion is 3141 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3142 | Arithmetic. 3143 *----------------------------------------------------------------------------*/ 3144 3145 float64 float32_to_float64(float32 a, float_status *status) 3146 { 3147 flag aSign; 3148 int aExp; 3149 uint32_t aSig; 3150 a = float32_squash_input_denormal(a, status); 3151 3152 aSig = extractFloat32Frac( a ); 3153 aExp = extractFloat32Exp( a ); 3154 aSign = extractFloat32Sign( a ); 3155 if ( aExp == 0xFF ) { 3156 if (aSig) { 3157 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3158 } 3159 return packFloat64( aSign, 0x7FF, 0 ); 3160 } 3161 if ( aExp == 0 ) { 3162 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3163 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3164 --aExp; 3165 } 3166 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3167 3168 } 3169 3170 /*---------------------------------------------------------------------------- 3171 | Returns the result of converting the single-precision floating-point value 3172 | `a' to the extended double-precision floating-point format. The conversion 3173 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3174 | Arithmetic. 3175 *----------------------------------------------------------------------------*/ 3176 3177 floatx80 float32_to_floatx80(float32 a, float_status *status) 3178 { 3179 flag aSign; 3180 int aExp; 3181 uint32_t aSig; 3182 3183 a = float32_squash_input_denormal(a, status); 3184 aSig = extractFloat32Frac( a ); 3185 aExp = extractFloat32Exp( a ); 3186 aSign = extractFloat32Sign( a ); 3187 if ( aExp == 0xFF ) { 3188 if (aSig) { 3189 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3190 } 3191 return packFloatx80(aSign, 3192 floatx80_infinity_high, 3193 floatx80_infinity_low); 3194 } 3195 if ( aExp == 0 ) { 3196 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3197 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3198 } 3199 aSig |= 0x00800000; 3200 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3201 3202 } 3203 3204 /*---------------------------------------------------------------------------- 3205 | Returns the result of converting the single-precision floating-point value 3206 | `a' to the double-precision floating-point format. The conversion is 3207 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3208 | Arithmetic. 3209 *----------------------------------------------------------------------------*/ 3210 3211 float128 float32_to_float128(float32 a, float_status *status) 3212 { 3213 flag aSign; 3214 int aExp; 3215 uint32_t aSig; 3216 3217 a = float32_squash_input_denormal(a, status); 3218 aSig = extractFloat32Frac( a ); 3219 aExp = extractFloat32Exp( a ); 3220 aSign = extractFloat32Sign( a ); 3221 if ( aExp == 0xFF ) { 3222 if (aSig) { 3223 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3224 } 3225 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3226 } 3227 if ( aExp == 0 ) { 3228 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3229 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3230 --aExp; 3231 } 3232 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3233 3234 } 3235 3236 /*---------------------------------------------------------------------------- 3237 | Returns the remainder of the single-precision floating-point value `a' 3238 | with respect to the corresponding value `b'. The operation is performed 3239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3240 *----------------------------------------------------------------------------*/ 3241 3242 float32 float32_rem(float32 a, float32 b, float_status *status) 3243 { 3244 flag aSign, zSign; 3245 int aExp, bExp, expDiff; 3246 uint32_t aSig, bSig; 3247 uint32_t q; 3248 uint64_t aSig64, bSig64, q64; 3249 uint32_t alternateASig; 3250 int32_t sigMean; 3251 a = float32_squash_input_denormal(a, status); 3252 b = float32_squash_input_denormal(b, status); 3253 3254 aSig = extractFloat32Frac( a ); 3255 aExp = extractFloat32Exp( a ); 3256 aSign = extractFloat32Sign( a ); 3257 bSig = extractFloat32Frac( b ); 3258 bExp = extractFloat32Exp( b ); 3259 if ( aExp == 0xFF ) { 3260 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3261 return propagateFloat32NaN(a, b, status); 3262 } 3263 float_raise(float_flag_invalid, status); 3264 return float32_default_nan(status); 3265 } 3266 if ( bExp == 0xFF ) { 3267 if (bSig) { 3268 return propagateFloat32NaN(a, b, status); 3269 } 3270 return a; 3271 } 3272 if ( bExp == 0 ) { 3273 if ( bSig == 0 ) { 3274 float_raise(float_flag_invalid, status); 3275 return float32_default_nan(status); 3276 } 3277 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3278 } 3279 if ( aExp == 0 ) { 3280 if ( aSig == 0 ) return a; 3281 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3282 } 3283 expDiff = aExp - bExp; 3284 aSig |= 0x00800000; 3285 bSig |= 0x00800000; 3286 if ( expDiff < 32 ) { 3287 aSig <<= 8; 3288 bSig <<= 8; 3289 if ( expDiff < 0 ) { 3290 if ( expDiff < -1 ) return a; 3291 aSig >>= 1; 3292 } 3293 q = ( bSig <= aSig ); 3294 if ( q ) aSig -= bSig; 3295 if ( 0 < expDiff ) { 3296 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3297 q >>= 32 - expDiff; 3298 bSig >>= 2; 3299 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3300 } 3301 else { 3302 aSig >>= 2; 3303 bSig >>= 2; 3304 } 3305 } 3306 else { 3307 if ( bSig <= aSig ) aSig -= bSig; 3308 aSig64 = ( (uint64_t) aSig )<<40; 3309 bSig64 = ( (uint64_t) bSig )<<40; 3310 expDiff -= 64; 3311 while ( 0 < expDiff ) { 3312 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3313 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3314 aSig64 = - ( ( bSig * q64 )<<38 ); 3315 expDiff -= 62; 3316 } 3317 expDiff += 64; 3318 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3319 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3320 q = q64>>( 64 - expDiff ); 3321 bSig <<= 6; 3322 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3323 } 3324 do { 3325 alternateASig = aSig; 3326 ++q; 3327 aSig -= bSig; 3328 } while ( 0 <= (int32_t) aSig ); 3329 sigMean = aSig + alternateASig; 3330 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3331 aSig = alternateASig; 3332 } 3333 zSign = ( (int32_t) aSig < 0 ); 3334 if ( zSign ) aSig = - aSig; 3335 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3336 } 3337 3338 3339 3340 /*---------------------------------------------------------------------------- 3341 | Returns the binary exponential of the single-precision floating-point value 3342 | `a'. The operation is performed according to the IEC/IEEE Standard for 3343 | Binary Floating-Point Arithmetic. 3344 | 3345 | Uses the following identities: 3346 | 3347 | 1. ------------------------------------------------------------------------- 3348 | x x*ln(2) 3349 | 2 = e 3350 | 3351 | 2. ------------------------------------------------------------------------- 3352 | 2 3 4 5 n 3353 | x x x x x x x 3354 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3355 | 1! 2! 3! 4! 5! n! 3356 *----------------------------------------------------------------------------*/ 3357 3358 static const float64 float32_exp2_coefficients[15] = 3359 { 3360 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3361 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3362 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3363 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3364 const_float64( 0x3f81111111111111ll ), /* 5 */ 3365 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3366 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3367 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3368 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3369 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3370 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3371 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3372 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3373 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3374 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3375 }; 3376 3377 float32 float32_exp2(float32 a, float_status *status) 3378 { 3379 flag aSign; 3380 int aExp; 3381 uint32_t aSig; 3382 float64 r, x, xn; 3383 int i; 3384 a = float32_squash_input_denormal(a, status); 3385 3386 aSig = extractFloat32Frac( a ); 3387 aExp = extractFloat32Exp( a ); 3388 aSign = extractFloat32Sign( a ); 3389 3390 if ( aExp == 0xFF) { 3391 if (aSig) { 3392 return propagateFloat32NaN(a, float32_zero, status); 3393 } 3394 return (aSign) ? float32_zero : a; 3395 } 3396 if (aExp == 0) { 3397 if (aSig == 0) return float32_one; 3398 } 3399 3400 float_raise(float_flag_inexact, status); 3401 3402 /* ******************************* */ 3403 /* using float64 for approximation */ 3404 /* ******************************* */ 3405 x = float32_to_float64(a, status); 3406 x = float64_mul(x, float64_ln2, status); 3407 3408 xn = x; 3409 r = float64_one; 3410 for (i = 0 ; i < 15 ; i++) { 3411 float64 f; 3412 3413 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3414 r = float64_add(r, f, status); 3415 3416 xn = float64_mul(xn, x, status); 3417 } 3418 3419 return float64_to_float32(r, status); 3420 } 3421 3422 /*---------------------------------------------------------------------------- 3423 | Returns the binary log of the single-precision floating-point value `a'. 3424 | The operation is performed according to the IEC/IEEE Standard for Binary 3425 | Floating-Point Arithmetic. 3426 *----------------------------------------------------------------------------*/ 3427 float32 float32_log2(float32 a, float_status *status) 3428 { 3429 flag aSign, zSign; 3430 int aExp; 3431 uint32_t aSig, zSig, i; 3432 3433 a = float32_squash_input_denormal(a, status); 3434 aSig = extractFloat32Frac( a ); 3435 aExp = extractFloat32Exp( a ); 3436 aSign = extractFloat32Sign( a ); 3437 3438 if ( aExp == 0 ) { 3439 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3440 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3441 } 3442 if ( aSign ) { 3443 float_raise(float_flag_invalid, status); 3444 return float32_default_nan(status); 3445 } 3446 if ( aExp == 0xFF ) { 3447 if (aSig) { 3448 return propagateFloat32NaN(a, float32_zero, status); 3449 } 3450 return a; 3451 } 3452 3453 aExp -= 0x7F; 3454 aSig |= 0x00800000; 3455 zSign = aExp < 0; 3456 zSig = aExp << 23; 3457 3458 for (i = 1 << 22; i > 0; i >>= 1) { 3459 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3460 if ( aSig & 0x01000000 ) { 3461 aSig >>= 1; 3462 zSig |= i; 3463 } 3464 } 3465 3466 if ( zSign ) 3467 zSig = -zSig; 3468 3469 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3470 } 3471 3472 /*---------------------------------------------------------------------------- 3473 | Returns 1 if the single-precision floating-point value `a' is equal to 3474 | the corresponding value `b', and 0 otherwise. The invalid exception is 3475 | raised if either operand is a NaN. Otherwise, the comparison is performed 3476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3477 *----------------------------------------------------------------------------*/ 3478 3479 int float32_eq(float32 a, float32 b, float_status *status) 3480 { 3481 uint32_t av, bv; 3482 a = float32_squash_input_denormal(a, status); 3483 b = float32_squash_input_denormal(b, status); 3484 3485 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3486 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3487 ) { 3488 float_raise(float_flag_invalid, status); 3489 return 0; 3490 } 3491 av = float32_val(a); 3492 bv = float32_val(b); 3493 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3494 } 3495 3496 /*---------------------------------------------------------------------------- 3497 | Returns 1 if the single-precision floating-point value `a' is less than 3498 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3499 | exception is raised if either operand is a NaN. The comparison is performed 3500 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3501 *----------------------------------------------------------------------------*/ 3502 3503 int float32_le(float32 a, float32 b, float_status *status) 3504 { 3505 flag aSign, bSign; 3506 uint32_t av, bv; 3507 a = float32_squash_input_denormal(a, status); 3508 b = float32_squash_input_denormal(b, status); 3509 3510 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3511 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3512 ) { 3513 float_raise(float_flag_invalid, status); 3514 return 0; 3515 } 3516 aSign = extractFloat32Sign( a ); 3517 bSign = extractFloat32Sign( b ); 3518 av = float32_val(a); 3519 bv = float32_val(b); 3520 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3521 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3522 3523 } 3524 3525 /*---------------------------------------------------------------------------- 3526 | Returns 1 if the single-precision floating-point value `a' is less than 3527 | the corresponding value `b', and 0 otherwise. The invalid exception is 3528 | raised if either operand is a NaN. The comparison is performed according 3529 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3530 *----------------------------------------------------------------------------*/ 3531 3532 int float32_lt(float32 a, float32 b, float_status *status) 3533 { 3534 flag aSign, bSign; 3535 uint32_t av, bv; 3536 a = float32_squash_input_denormal(a, status); 3537 b = float32_squash_input_denormal(b, status); 3538 3539 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3540 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3541 ) { 3542 float_raise(float_flag_invalid, status); 3543 return 0; 3544 } 3545 aSign = extractFloat32Sign( a ); 3546 bSign = extractFloat32Sign( b ); 3547 av = float32_val(a); 3548 bv = float32_val(b); 3549 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3550 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3551 3552 } 3553 3554 /*---------------------------------------------------------------------------- 3555 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3556 | be compared, and 0 otherwise. The invalid exception is raised if either 3557 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3558 | Standard for Binary Floating-Point Arithmetic. 3559 *----------------------------------------------------------------------------*/ 3560 3561 int float32_unordered(float32 a, float32 b, float_status *status) 3562 { 3563 a = float32_squash_input_denormal(a, status); 3564 b = float32_squash_input_denormal(b, status); 3565 3566 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3567 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3568 ) { 3569 float_raise(float_flag_invalid, status); 3570 return 1; 3571 } 3572 return 0; 3573 } 3574 3575 /*---------------------------------------------------------------------------- 3576 | Returns 1 if the single-precision floating-point value `a' is equal to 3577 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3578 | exception. The comparison is performed according to the IEC/IEEE Standard 3579 | for Binary Floating-Point Arithmetic. 3580 *----------------------------------------------------------------------------*/ 3581 3582 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3583 { 3584 a = float32_squash_input_denormal(a, status); 3585 b = float32_squash_input_denormal(b, status); 3586 3587 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3588 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3589 ) { 3590 if (float32_is_signaling_nan(a, status) 3591 || float32_is_signaling_nan(b, status)) { 3592 float_raise(float_flag_invalid, status); 3593 } 3594 return 0; 3595 } 3596 return ( float32_val(a) == float32_val(b) ) || 3597 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3598 } 3599 3600 /*---------------------------------------------------------------------------- 3601 | Returns 1 if the single-precision floating-point value `a' is less than or 3602 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3603 | cause an exception. Otherwise, the comparison is performed according to the 3604 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3605 *----------------------------------------------------------------------------*/ 3606 3607 int float32_le_quiet(float32 a, float32 b, float_status *status) 3608 { 3609 flag aSign, bSign; 3610 uint32_t av, bv; 3611 a = float32_squash_input_denormal(a, status); 3612 b = float32_squash_input_denormal(b, status); 3613 3614 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3615 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3616 ) { 3617 if (float32_is_signaling_nan(a, status) 3618 || float32_is_signaling_nan(b, status)) { 3619 float_raise(float_flag_invalid, status); 3620 } 3621 return 0; 3622 } 3623 aSign = extractFloat32Sign( a ); 3624 bSign = extractFloat32Sign( b ); 3625 av = float32_val(a); 3626 bv = float32_val(b); 3627 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3628 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3629 3630 } 3631 3632 /*---------------------------------------------------------------------------- 3633 | Returns 1 if the single-precision floating-point value `a' is less than 3634 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3635 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3636 | Standard for Binary Floating-Point Arithmetic. 3637 *----------------------------------------------------------------------------*/ 3638 3639 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3640 { 3641 flag aSign, bSign; 3642 uint32_t av, bv; 3643 a = float32_squash_input_denormal(a, status); 3644 b = float32_squash_input_denormal(b, status); 3645 3646 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3647 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3648 ) { 3649 if (float32_is_signaling_nan(a, status) 3650 || float32_is_signaling_nan(b, status)) { 3651 float_raise(float_flag_invalid, status); 3652 } 3653 return 0; 3654 } 3655 aSign = extractFloat32Sign( a ); 3656 bSign = extractFloat32Sign( b ); 3657 av = float32_val(a); 3658 bv = float32_val(b); 3659 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3660 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3661 3662 } 3663 3664 /*---------------------------------------------------------------------------- 3665 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3666 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3667 | comparison is performed according to the IEC/IEEE Standard for Binary 3668 | Floating-Point Arithmetic. 3669 *----------------------------------------------------------------------------*/ 3670 3671 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3672 { 3673 a = float32_squash_input_denormal(a, status); 3674 b = float32_squash_input_denormal(b, status); 3675 3676 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3677 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3678 ) { 3679 if (float32_is_signaling_nan(a, status) 3680 || float32_is_signaling_nan(b, status)) { 3681 float_raise(float_flag_invalid, status); 3682 } 3683 return 1; 3684 } 3685 return 0; 3686 } 3687 3688 3689 /*---------------------------------------------------------------------------- 3690 | Returns the result of converting the double-precision floating-point value 3691 | `a' to the single-precision floating-point format. The conversion is 3692 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3693 | Arithmetic. 3694 *----------------------------------------------------------------------------*/ 3695 3696 float32 float64_to_float32(float64 a, float_status *status) 3697 { 3698 flag aSign; 3699 int aExp; 3700 uint64_t aSig; 3701 uint32_t zSig; 3702 a = float64_squash_input_denormal(a, status); 3703 3704 aSig = extractFloat64Frac( a ); 3705 aExp = extractFloat64Exp( a ); 3706 aSign = extractFloat64Sign( a ); 3707 if ( aExp == 0x7FF ) { 3708 if (aSig) { 3709 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3710 } 3711 return packFloat32( aSign, 0xFF, 0 ); 3712 } 3713 shift64RightJamming( aSig, 22, &aSig ); 3714 zSig = aSig; 3715 if ( aExp || zSig ) { 3716 zSig |= 0x40000000; 3717 aExp -= 0x381; 3718 } 3719 return roundAndPackFloat32(aSign, aExp, zSig, status); 3720 3721 } 3722 3723 3724 /*---------------------------------------------------------------------------- 3725 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3726 | half-precision floating-point value, returning the result. After being 3727 | shifted into the proper positions, the three fields are simply added 3728 | together to form the result. This means that any integer portion of `zSig' 3729 | will be added into the exponent. Since a properly normalized significand 3730 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3731 | than the desired result exponent whenever `zSig' is a complete, normalized 3732 | significand. 3733 *----------------------------------------------------------------------------*/ 3734 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3735 { 3736 return make_float16( 3737 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3738 } 3739 3740 /*---------------------------------------------------------------------------- 3741 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3742 | and significand `zSig', and returns the proper half-precision floating- 3743 | point value corresponding to the abstract input. Ordinarily, the abstract 3744 | value is simply rounded and packed into the half-precision format, with 3745 | the inexact exception raised if the abstract input cannot be represented 3746 | exactly. However, if the abstract value is too large, the overflow and 3747 | inexact exceptions are raised and an infinity or maximal finite value is 3748 | returned. If the abstract value is too small, the input value is rounded to 3749 | a subnormal number, and the underflow and inexact exceptions are raised if 3750 | the abstract input cannot be represented exactly as a subnormal half- 3751 | precision floating-point number. 3752 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3753 | ARM-style "alternative representation", which omits the NaN and Inf 3754 | encodings in order to raise the maximum representable exponent by one. 3755 | The input significand `zSig' has its binary point between bits 22 3756 | and 23, which is 13 bits to the left of the usual location. This shifted 3757 | significand must be normalized or smaller. If `zSig' is not normalized, 3758 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3759 | and it must not require rounding. In the usual case that `zSig' is 3760 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3761 | Note the slightly odd position of the binary point in zSig compared with the 3762 | other roundAndPackFloat functions. This should probably be fixed if we 3763 | need to implement more float16 routines than just conversion. 3764 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3765 | Binary Floating-Point Arithmetic. 3766 *----------------------------------------------------------------------------*/ 3767 3768 static float16 roundAndPackFloat16(flag zSign, int zExp, 3769 uint32_t zSig, flag ieee, 3770 float_status *status) 3771 { 3772 int maxexp = ieee ? 29 : 30; 3773 uint32_t mask; 3774 uint32_t increment; 3775 bool rounding_bumps_exp; 3776 bool is_tiny = false; 3777 3778 /* Calculate the mask of bits of the mantissa which are not 3779 * representable in half-precision and will be lost. 3780 */ 3781 if (zExp < 1) { 3782 /* Will be denormal in halfprec */ 3783 mask = 0x00ffffff; 3784 if (zExp >= -11) { 3785 mask >>= 11 + zExp; 3786 } 3787 } else { 3788 /* Normal number in halfprec */ 3789 mask = 0x00001fff; 3790 } 3791 3792 switch (status->float_rounding_mode) { 3793 case float_round_nearest_even: 3794 increment = (mask + 1) >> 1; 3795 if ((zSig & mask) == increment) { 3796 increment = zSig & (increment << 1); 3797 } 3798 break; 3799 case float_round_ties_away: 3800 increment = (mask + 1) >> 1; 3801 break; 3802 case float_round_up: 3803 increment = zSign ? 0 : mask; 3804 break; 3805 case float_round_down: 3806 increment = zSign ? mask : 0; 3807 break; 3808 default: /* round_to_zero */ 3809 increment = 0; 3810 break; 3811 } 3812 3813 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3814 3815 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3816 if (ieee) { 3817 float_raise(float_flag_overflow | float_flag_inexact, status); 3818 return packFloat16(zSign, 0x1f, 0); 3819 } else { 3820 float_raise(float_flag_invalid, status); 3821 return packFloat16(zSign, 0x1f, 0x3ff); 3822 } 3823 } 3824 3825 if (zExp < 0) { 3826 /* Note that flush-to-zero does not affect half-precision results */ 3827 is_tiny = 3828 (status->float_detect_tininess == float_tininess_before_rounding) 3829 || (zExp < -1) 3830 || (!rounding_bumps_exp); 3831 } 3832 if (zSig & mask) { 3833 float_raise(float_flag_inexact, status); 3834 if (is_tiny) { 3835 float_raise(float_flag_underflow, status); 3836 } 3837 } 3838 3839 zSig += increment; 3840 if (rounding_bumps_exp) { 3841 zSig >>= 1; 3842 zExp++; 3843 } 3844 3845 if (zExp < -10) { 3846 return packFloat16(zSign, 0, 0); 3847 } 3848 if (zExp < 0) { 3849 zSig >>= -zExp; 3850 zExp = 0; 3851 } 3852 return packFloat16(zSign, zExp, zSig >> 13); 3853 } 3854 3855 /*---------------------------------------------------------------------------- 3856 | If `a' is denormal and we are in flush-to-zero mode then set the 3857 | input-denormal exception and return zero. Otherwise just return the value. 3858 *----------------------------------------------------------------------------*/ 3859 float16 float16_squash_input_denormal(float16 a, float_status *status) 3860 { 3861 if (status->flush_inputs_to_zero) { 3862 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3863 float_raise(float_flag_input_denormal, status); 3864 return make_float16(float16_val(a) & 0x8000); 3865 } 3866 } 3867 return a; 3868 } 3869 3870 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3871 uint32_t *zSigPtr) 3872 { 3873 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3874 *zSigPtr = aSig << shiftCount; 3875 *zExpPtr = 1 - shiftCount; 3876 } 3877 3878 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3879 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3880 3881 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3882 { 3883 flag aSign; 3884 int aExp; 3885 uint32_t aSig; 3886 3887 aSign = extractFloat16Sign(a); 3888 aExp = extractFloat16Exp(a); 3889 aSig = extractFloat16Frac(a); 3890 3891 if (aExp == 0x1f && ieee) { 3892 if (aSig) { 3893 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3894 } 3895 return packFloat32(aSign, 0xff, 0); 3896 } 3897 if (aExp == 0) { 3898 if (aSig == 0) { 3899 return packFloat32(aSign, 0, 0); 3900 } 3901 3902 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3903 aExp--; 3904 } 3905 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3906 } 3907 3908 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3909 { 3910 flag aSign; 3911 int aExp; 3912 uint32_t aSig; 3913 3914 a = float32_squash_input_denormal(a, status); 3915 3916 aSig = extractFloat32Frac( a ); 3917 aExp = extractFloat32Exp( a ); 3918 aSign = extractFloat32Sign( a ); 3919 if ( aExp == 0xFF ) { 3920 if (aSig) { 3921 /* Input is a NaN */ 3922 if (!ieee) { 3923 float_raise(float_flag_invalid, status); 3924 return packFloat16(aSign, 0, 0); 3925 } 3926 return commonNaNToFloat16( 3927 float32ToCommonNaN(a, status), status); 3928 } 3929 /* Infinity */ 3930 if (!ieee) { 3931 float_raise(float_flag_invalid, status); 3932 return packFloat16(aSign, 0x1f, 0x3ff); 3933 } 3934 return packFloat16(aSign, 0x1f, 0); 3935 } 3936 if (aExp == 0 && aSig == 0) { 3937 return packFloat16(aSign, 0, 0); 3938 } 3939 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3940 * even if the input is denormal; however this is harmless because 3941 * the largest possible single-precision denormal is still smaller 3942 * than the smallest representable half-precision denormal, and so we 3943 * will end up ignoring aSig and returning via the "always return zero" 3944 * codepath. 3945 */ 3946 aSig |= 0x00800000; 3947 aExp -= 0x71; 3948 3949 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3950 } 3951 3952 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3953 { 3954 flag aSign; 3955 int aExp; 3956 uint32_t aSig; 3957 3958 aSign = extractFloat16Sign(a); 3959 aExp = extractFloat16Exp(a); 3960 aSig = extractFloat16Frac(a); 3961 3962 if (aExp == 0x1f && ieee) { 3963 if (aSig) { 3964 return commonNaNToFloat64( 3965 float16ToCommonNaN(a, status), status); 3966 } 3967 return packFloat64(aSign, 0x7ff, 0); 3968 } 3969 if (aExp == 0) { 3970 if (aSig == 0) { 3971 return packFloat64(aSign, 0, 0); 3972 } 3973 3974 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3975 aExp--; 3976 } 3977 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3978 } 3979 3980 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3981 { 3982 flag aSign; 3983 int aExp; 3984 uint64_t aSig; 3985 uint32_t zSig; 3986 3987 a = float64_squash_input_denormal(a, status); 3988 3989 aSig = extractFloat64Frac(a); 3990 aExp = extractFloat64Exp(a); 3991 aSign = extractFloat64Sign(a); 3992 if (aExp == 0x7FF) { 3993 if (aSig) { 3994 /* Input is a NaN */ 3995 if (!ieee) { 3996 float_raise(float_flag_invalid, status); 3997 return packFloat16(aSign, 0, 0); 3998 } 3999 return commonNaNToFloat16( 4000 float64ToCommonNaN(a, status), status); 4001 } 4002 /* Infinity */ 4003 if (!ieee) { 4004 float_raise(float_flag_invalid, status); 4005 return packFloat16(aSign, 0x1f, 0x3ff); 4006 } 4007 return packFloat16(aSign, 0x1f, 0); 4008 } 4009 shift64RightJamming(aSig, 29, &aSig); 4010 zSig = aSig; 4011 if (aExp == 0 && zSig == 0) { 4012 return packFloat16(aSign, 0, 0); 4013 } 4014 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4015 * even if the input is denormal; however this is harmless because 4016 * the largest possible single-precision denormal is still smaller 4017 * than the smallest representable half-precision denormal, and so we 4018 * will end up ignoring aSig and returning via the "always return zero" 4019 * codepath. 4020 */ 4021 zSig |= 0x00800000; 4022 aExp -= 0x3F1; 4023 4024 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4025 } 4026 4027 /*---------------------------------------------------------------------------- 4028 | Returns the result of converting the double-precision floating-point value 4029 | `a' to the extended double-precision floating-point format. The conversion 4030 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4031 | Arithmetic. 4032 *----------------------------------------------------------------------------*/ 4033 4034 floatx80 float64_to_floatx80(float64 a, float_status *status) 4035 { 4036 flag aSign; 4037 int aExp; 4038 uint64_t aSig; 4039 4040 a = float64_squash_input_denormal(a, status); 4041 aSig = extractFloat64Frac( a ); 4042 aExp = extractFloat64Exp( a ); 4043 aSign = extractFloat64Sign( a ); 4044 if ( aExp == 0x7FF ) { 4045 if (aSig) { 4046 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4047 } 4048 return packFloatx80(aSign, 4049 floatx80_infinity_high, 4050 floatx80_infinity_low); 4051 } 4052 if ( aExp == 0 ) { 4053 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4054 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4055 } 4056 return 4057 packFloatx80( 4058 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4059 4060 } 4061 4062 /*---------------------------------------------------------------------------- 4063 | Returns the result of converting the double-precision floating-point value 4064 | `a' to the quadruple-precision floating-point format. The conversion is 4065 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4066 | Arithmetic. 4067 *----------------------------------------------------------------------------*/ 4068 4069 float128 float64_to_float128(float64 a, float_status *status) 4070 { 4071 flag aSign; 4072 int aExp; 4073 uint64_t aSig, zSig0, zSig1; 4074 4075 a = float64_squash_input_denormal(a, status); 4076 aSig = extractFloat64Frac( a ); 4077 aExp = extractFloat64Exp( a ); 4078 aSign = extractFloat64Sign( a ); 4079 if ( aExp == 0x7FF ) { 4080 if (aSig) { 4081 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4082 } 4083 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4084 } 4085 if ( aExp == 0 ) { 4086 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4087 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4088 --aExp; 4089 } 4090 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4091 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4092 4093 } 4094 4095 4096 /*---------------------------------------------------------------------------- 4097 | Returns the remainder of the double-precision floating-point value `a' 4098 | with respect to the corresponding value `b'. The operation is performed 4099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4100 *----------------------------------------------------------------------------*/ 4101 4102 float64 float64_rem(float64 a, float64 b, float_status *status) 4103 { 4104 flag aSign, zSign; 4105 int aExp, bExp, expDiff; 4106 uint64_t aSig, bSig; 4107 uint64_t q, alternateASig; 4108 int64_t sigMean; 4109 4110 a = float64_squash_input_denormal(a, status); 4111 b = float64_squash_input_denormal(b, status); 4112 aSig = extractFloat64Frac( a ); 4113 aExp = extractFloat64Exp( a ); 4114 aSign = extractFloat64Sign( a ); 4115 bSig = extractFloat64Frac( b ); 4116 bExp = extractFloat64Exp( b ); 4117 if ( aExp == 0x7FF ) { 4118 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4119 return propagateFloat64NaN(a, b, status); 4120 } 4121 float_raise(float_flag_invalid, status); 4122 return float64_default_nan(status); 4123 } 4124 if ( bExp == 0x7FF ) { 4125 if (bSig) { 4126 return propagateFloat64NaN(a, b, status); 4127 } 4128 return a; 4129 } 4130 if ( bExp == 0 ) { 4131 if ( bSig == 0 ) { 4132 float_raise(float_flag_invalid, status); 4133 return float64_default_nan(status); 4134 } 4135 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4136 } 4137 if ( aExp == 0 ) { 4138 if ( aSig == 0 ) return a; 4139 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4140 } 4141 expDiff = aExp - bExp; 4142 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4143 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4144 if ( expDiff < 0 ) { 4145 if ( expDiff < -1 ) return a; 4146 aSig >>= 1; 4147 } 4148 q = ( bSig <= aSig ); 4149 if ( q ) aSig -= bSig; 4150 expDiff -= 64; 4151 while ( 0 < expDiff ) { 4152 q = estimateDiv128To64( aSig, 0, bSig ); 4153 q = ( 2 < q ) ? q - 2 : 0; 4154 aSig = - ( ( bSig>>2 ) * q ); 4155 expDiff -= 62; 4156 } 4157 expDiff += 64; 4158 if ( 0 < expDiff ) { 4159 q = estimateDiv128To64( aSig, 0, bSig ); 4160 q = ( 2 < q ) ? q - 2 : 0; 4161 q >>= 64 - expDiff; 4162 bSig >>= 2; 4163 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4164 } 4165 else { 4166 aSig >>= 2; 4167 bSig >>= 2; 4168 } 4169 do { 4170 alternateASig = aSig; 4171 ++q; 4172 aSig -= bSig; 4173 } while ( 0 <= (int64_t) aSig ); 4174 sigMean = aSig + alternateASig; 4175 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4176 aSig = alternateASig; 4177 } 4178 zSign = ( (int64_t) aSig < 0 ); 4179 if ( zSign ) aSig = - aSig; 4180 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4181 4182 } 4183 4184 /*---------------------------------------------------------------------------- 4185 | Returns the binary log of the double-precision floating-point value `a'. 4186 | The operation is performed according to the IEC/IEEE Standard for Binary 4187 | Floating-Point Arithmetic. 4188 *----------------------------------------------------------------------------*/ 4189 float64 float64_log2(float64 a, float_status *status) 4190 { 4191 flag aSign, zSign; 4192 int aExp; 4193 uint64_t aSig, aSig0, aSig1, zSig, i; 4194 a = float64_squash_input_denormal(a, status); 4195 4196 aSig = extractFloat64Frac( a ); 4197 aExp = extractFloat64Exp( a ); 4198 aSign = extractFloat64Sign( a ); 4199 4200 if ( aExp == 0 ) { 4201 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4202 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4203 } 4204 if ( aSign ) { 4205 float_raise(float_flag_invalid, status); 4206 return float64_default_nan(status); 4207 } 4208 if ( aExp == 0x7FF ) { 4209 if (aSig) { 4210 return propagateFloat64NaN(a, float64_zero, status); 4211 } 4212 return a; 4213 } 4214 4215 aExp -= 0x3FF; 4216 aSig |= LIT64( 0x0010000000000000 ); 4217 zSign = aExp < 0; 4218 zSig = (uint64_t)aExp << 52; 4219 for (i = 1LL << 51; i > 0; i >>= 1) { 4220 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4221 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4222 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4223 aSig >>= 1; 4224 zSig |= i; 4225 } 4226 } 4227 4228 if ( zSign ) 4229 zSig = -zSig; 4230 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4231 } 4232 4233 /*---------------------------------------------------------------------------- 4234 | Returns 1 if the double-precision floating-point value `a' is equal to the 4235 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4236 | if either operand is a NaN. Otherwise, the comparison is performed 4237 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4238 *----------------------------------------------------------------------------*/ 4239 4240 int float64_eq(float64 a, float64 b, float_status *status) 4241 { 4242 uint64_t av, bv; 4243 a = float64_squash_input_denormal(a, status); 4244 b = float64_squash_input_denormal(b, status); 4245 4246 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4247 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4248 ) { 4249 float_raise(float_flag_invalid, status); 4250 return 0; 4251 } 4252 av = float64_val(a); 4253 bv = float64_val(b); 4254 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4255 4256 } 4257 4258 /*---------------------------------------------------------------------------- 4259 | Returns 1 if the double-precision floating-point value `a' is less than or 4260 | equal to the corresponding value `b', and 0 otherwise. The invalid 4261 | exception is raised if either operand is a NaN. The comparison is performed 4262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4263 *----------------------------------------------------------------------------*/ 4264 4265 int float64_le(float64 a, float64 b, float_status *status) 4266 { 4267 flag aSign, bSign; 4268 uint64_t av, bv; 4269 a = float64_squash_input_denormal(a, status); 4270 b = float64_squash_input_denormal(b, status); 4271 4272 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4273 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4274 ) { 4275 float_raise(float_flag_invalid, status); 4276 return 0; 4277 } 4278 aSign = extractFloat64Sign( a ); 4279 bSign = extractFloat64Sign( b ); 4280 av = float64_val(a); 4281 bv = float64_val(b); 4282 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4283 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4284 4285 } 4286 4287 /*---------------------------------------------------------------------------- 4288 | Returns 1 if the double-precision floating-point value `a' is less than 4289 | the corresponding value `b', and 0 otherwise. The invalid exception is 4290 | raised if either operand is a NaN. The comparison is performed according 4291 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4292 *----------------------------------------------------------------------------*/ 4293 4294 int float64_lt(float64 a, float64 b, float_status *status) 4295 { 4296 flag aSign, bSign; 4297 uint64_t av, bv; 4298 4299 a = float64_squash_input_denormal(a, status); 4300 b = float64_squash_input_denormal(b, status); 4301 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4302 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4303 ) { 4304 float_raise(float_flag_invalid, status); 4305 return 0; 4306 } 4307 aSign = extractFloat64Sign( a ); 4308 bSign = extractFloat64Sign( b ); 4309 av = float64_val(a); 4310 bv = float64_val(b); 4311 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4312 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4313 4314 } 4315 4316 /*---------------------------------------------------------------------------- 4317 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4318 | be compared, and 0 otherwise. The invalid exception is raised if either 4319 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4320 | Standard for Binary Floating-Point Arithmetic. 4321 *----------------------------------------------------------------------------*/ 4322 4323 int float64_unordered(float64 a, float64 b, float_status *status) 4324 { 4325 a = float64_squash_input_denormal(a, status); 4326 b = float64_squash_input_denormal(b, status); 4327 4328 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4329 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4330 ) { 4331 float_raise(float_flag_invalid, status); 4332 return 1; 4333 } 4334 return 0; 4335 } 4336 4337 /*---------------------------------------------------------------------------- 4338 | Returns 1 if the double-precision floating-point value `a' is equal to the 4339 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4340 | exception.The comparison is performed according to the IEC/IEEE Standard 4341 | for Binary Floating-Point Arithmetic. 4342 *----------------------------------------------------------------------------*/ 4343 4344 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4345 { 4346 uint64_t av, bv; 4347 a = float64_squash_input_denormal(a, status); 4348 b = float64_squash_input_denormal(b, status); 4349 4350 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4351 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4352 ) { 4353 if (float64_is_signaling_nan(a, status) 4354 || float64_is_signaling_nan(b, status)) { 4355 float_raise(float_flag_invalid, status); 4356 } 4357 return 0; 4358 } 4359 av = float64_val(a); 4360 bv = float64_val(b); 4361 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4362 4363 } 4364 4365 /*---------------------------------------------------------------------------- 4366 | Returns 1 if the double-precision floating-point value `a' is less than or 4367 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4368 | cause an exception. Otherwise, the comparison is performed according to the 4369 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4370 *----------------------------------------------------------------------------*/ 4371 4372 int float64_le_quiet(float64 a, float64 b, float_status *status) 4373 { 4374 flag aSign, bSign; 4375 uint64_t av, bv; 4376 a = float64_squash_input_denormal(a, status); 4377 b = float64_squash_input_denormal(b, status); 4378 4379 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4380 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4381 ) { 4382 if (float64_is_signaling_nan(a, status) 4383 || float64_is_signaling_nan(b, status)) { 4384 float_raise(float_flag_invalid, status); 4385 } 4386 return 0; 4387 } 4388 aSign = extractFloat64Sign( a ); 4389 bSign = extractFloat64Sign( b ); 4390 av = float64_val(a); 4391 bv = float64_val(b); 4392 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4393 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4394 4395 } 4396 4397 /*---------------------------------------------------------------------------- 4398 | Returns 1 if the double-precision floating-point value `a' is less than 4399 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4400 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4401 | Standard for Binary Floating-Point Arithmetic. 4402 *----------------------------------------------------------------------------*/ 4403 4404 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4405 { 4406 flag aSign, bSign; 4407 uint64_t av, bv; 4408 a = float64_squash_input_denormal(a, status); 4409 b = float64_squash_input_denormal(b, status); 4410 4411 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4412 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4413 ) { 4414 if (float64_is_signaling_nan(a, status) 4415 || float64_is_signaling_nan(b, status)) { 4416 float_raise(float_flag_invalid, status); 4417 } 4418 return 0; 4419 } 4420 aSign = extractFloat64Sign( a ); 4421 bSign = extractFloat64Sign( b ); 4422 av = float64_val(a); 4423 bv = float64_val(b); 4424 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4425 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4426 4427 } 4428 4429 /*---------------------------------------------------------------------------- 4430 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4431 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4432 | comparison is performed according to the IEC/IEEE Standard for Binary 4433 | Floating-Point Arithmetic. 4434 *----------------------------------------------------------------------------*/ 4435 4436 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4437 { 4438 a = float64_squash_input_denormal(a, status); 4439 b = float64_squash_input_denormal(b, status); 4440 4441 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4442 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4443 ) { 4444 if (float64_is_signaling_nan(a, status) 4445 || float64_is_signaling_nan(b, status)) { 4446 float_raise(float_flag_invalid, status); 4447 } 4448 return 1; 4449 } 4450 return 0; 4451 } 4452 4453 /*---------------------------------------------------------------------------- 4454 | Returns the result of converting the extended double-precision floating- 4455 | point value `a' to the 32-bit two's complement integer format. The 4456 | conversion is performed according to the IEC/IEEE Standard for Binary 4457 | Floating-Point Arithmetic---which means in particular that the conversion 4458 | is rounded according to the current rounding mode. If `a' is a NaN, the 4459 | largest positive integer is returned. Otherwise, if the conversion 4460 | overflows, the largest integer with the same sign as `a' is returned. 4461 *----------------------------------------------------------------------------*/ 4462 4463 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4464 { 4465 flag aSign; 4466 int32_t aExp, shiftCount; 4467 uint64_t aSig; 4468 4469 if (floatx80_invalid_encoding(a)) { 4470 float_raise(float_flag_invalid, status); 4471 return 1 << 31; 4472 } 4473 aSig = extractFloatx80Frac( a ); 4474 aExp = extractFloatx80Exp( a ); 4475 aSign = extractFloatx80Sign( a ); 4476 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4477 shiftCount = 0x4037 - aExp; 4478 if ( shiftCount <= 0 ) shiftCount = 1; 4479 shift64RightJamming( aSig, shiftCount, &aSig ); 4480 return roundAndPackInt32(aSign, aSig, status); 4481 4482 } 4483 4484 /*---------------------------------------------------------------------------- 4485 | Returns the result of converting the extended double-precision floating- 4486 | point value `a' to the 32-bit two's complement integer format. The 4487 | conversion is performed according to the IEC/IEEE Standard for Binary 4488 | Floating-Point Arithmetic, except that the conversion is always rounded 4489 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4490 | Otherwise, if the conversion overflows, the largest integer with the same 4491 | sign as `a' is returned. 4492 *----------------------------------------------------------------------------*/ 4493 4494 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4495 { 4496 flag aSign; 4497 int32_t aExp, shiftCount; 4498 uint64_t aSig, savedASig; 4499 int32_t z; 4500 4501 if (floatx80_invalid_encoding(a)) { 4502 float_raise(float_flag_invalid, status); 4503 return 1 << 31; 4504 } 4505 aSig = extractFloatx80Frac( a ); 4506 aExp = extractFloatx80Exp( a ); 4507 aSign = extractFloatx80Sign( a ); 4508 if ( 0x401E < aExp ) { 4509 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4510 goto invalid; 4511 } 4512 else if ( aExp < 0x3FFF ) { 4513 if (aExp || aSig) { 4514 status->float_exception_flags |= float_flag_inexact; 4515 } 4516 return 0; 4517 } 4518 shiftCount = 0x403E - aExp; 4519 savedASig = aSig; 4520 aSig >>= shiftCount; 4521 z = aSig; 4522 if ( aSign ) z = - z; 4523 if ( ( z < 0 ) ^ aSign ) { 4524 invalid: 4525 float_raise(float_flag_invalid, status); 4526 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4527 } 4528 if ( ( aSig<<shiftCount ) != savedASig ) { 4529 status->float_exception_flags |= float_flag_inexact; 4530 } 4531 return z; 4532 4533 } 4534 4535 /*---------------------------------------------------------------------------- 4536 | Returns the result of converting the extended double-precision floating- 4537 | point value `a' to the 64-bit two's complement integer format. The 4538 | conversion is performed according to the IEC/IEEE Standard for Binary 4539 | Floating-Point Arithmetic---which means in particular that the conversion 4540 | is rounded according to the current rounding mode. If `a' is a NaN, 4541 | the largest positive integer is returned. Otherwise, if the conversion 4542 | overflows, the largest integer with the same sign as `a' is returned. 4543 *----------------------------------------------------------------------------*/ 4544 4545 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4546 { 4547 flag aSign; 4548 int32_t aExp, shiftCount; 4549 uint64_t aSig, aSigExtra; 4550 4551 if (floatx80_invalid_encoding(a)) { 4552 float_raise(float_flag_invalid, status); 4553 return 1ULL << 63; 4554 } 4555 aSig = extractFloatx80Frac( a ); 4556 aExp = extractFloatx80Exp( a ); 4557 aSign = extractFloatx80Sign( a ); 4558 shiftCount = 0x403E - aExp; 4559 if ( shiftCount <= 0 ) { 4560 if ( shiftCount ) { 4561 float_raise(float_flag_invalid, status); 4562 if (!aSign || floatx80_is_any_nan(a)) { 4563 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4564 } 4565 return (int64_t) LIT64( 0x8000000000000000 ); 4566 } 4567 aSigExtra = 0; 4568 } 4569 else { 4570 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4571 } 4572 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4573 4574 } 4575 4576 /*---------------------------------------------------------------------------- 4577 | Returns the result of converting the extended double-precision floating- 4578 | point value `a' to the 64-bit two's complement integer format. The 4579 | conversion is performed according to the IEC/IEEE Standard for Binary 4580 | Floating-Point Arithmetic, except that the conversion is always rounded 4581 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4582 | Otherwise, if the conversion overflows, the largest integer with the same 4583 | sign as `a' is returned. 4584 *----------------------------------------------------------------------------*/ 4585 4586 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4587 { 4588 flag aSign; 4589 int32_t aExp, shiftCount; 4590 uint64_t aSig; 4591 int64_t z; 4592 4593 if (floatx80_invalid_encoding(a)) { 4594 float_raise(float_flag_invalid, status); 4595 return 1ULL << 63; 4596 } 4597 aSig = extractFloatx80Frac( a ); 4598 aExp = extractFloatx80Exp( a ); 4599 aSign = extractFloatx80Sign( a ); 4600 shiftCount = aExp - 0x403E; 4601 if ( 0 <= shiftCount ) { 4602 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4603 if ( ( a.high != 0xC03E ) || aSig ) { 4604 float_raise(float_flag_invalid, status); 4605 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4606 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4607 } 4608 } 4609 return (int64_t) LIT64( 0x8000000000000000 ); 4610 } 4611 else if ( aExp < 0x3FFF ) { 4612 if (aExp | aSig) { 4613 status->float_exception_flags |= float_flag_inexact; 4614 } 4615 return 0; 4616 } 4617 z = aSig>>( - shiftCount ); 4618 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4619 status->float_exception_flags |= float_flag_inexact; 4620 } 4621 if ( aSign ) z = - z; 4622 return z; 4623 4624 } 4625 4626 /*---------------------------------------------------------------------------- 4627 | Returns the result of converting the extended double-precision floating- 4628 | point value `a' to the single-precision floating-point format. The 4629 | conversion is performed according to the IEC/IEEE Standard for Binary 4630 | Floating-Point Arithmetic. 4631 *----------------------------------------------------------------------------*/ 4632 4633 float32 floatx80_to_float32(floatx80 a, float_status *status) 4634 { 4635 flag aSign; 4636 int32_t aExp; 4637 uint64_t aSig; 4638 4639 if (floatx80_invalid_encoding(a)) { 4640 float_raise(float_flag_invalid, status); 4641 return float32_default_nan(status); 4642 } 4643 aSig = extractFloatx80Frac( a ); 4644 aExp = extractFloatx80Exp( a ); 4645 aSign = extractFloatx80Sign( a ); 4646 if ( aExp == 0x7FFF ) { 4647 if ( (uint64_t) ( aSig<<1 ) ) { 4648 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4649 } 4650 return packFloat32( aSign, 0xFF, 0 ); 4651 } 4652 shift64RightJamming( aSig, 33, &aSig ); 4653 if ( aExp || aSig ) aExp -= 0x3F81; 4654 return roundAndPackFloat32(aSign, aExp, aSig, status); 4655 4656 } 4657 4658 /*---------------------------------------------------------------------------- 4659 | Returns the result of converting the extended double-precision floating- 4660 | point value `a' to the double-precision floating-point format. The 4661 | conversion is performed according to the IEC/IEEE Standard for Binary 4662 | Floating-Point Arithmetic. 4663 *----------------------------------------------------------------------------*/ 4664 4665 float64 floatx80_to_float64(floatx80 a, float_status *status) 4666 { 4667 flag aSign; 4668 int32_t aExp; 4669 uint64_t aSig, zSig; 4670 4671 if (floatx80_invalid_encoding(a)) { 4672 float_raise(float_flag_invalid, status); 4673 return float64_default_nan(status); 4674 } 4675 aSig = extractFloatx80Frac( a ); 4676 aExp = extractFloatx80Exp( a ); 4677 aSign = extractFloatx80Sign( a ); 4678 if ( aExp == 0x7FFF ) { 4679 if ( (uint64_t) ( aSig<<1 ) ) { 4680 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4681 } 4682 return packFloat64( aSign, 0x7FF, 0 ); 4683 } 4684 shift64RightJamming( aSig, 1, &zSig ); 4685 if ( aExp || aSig ) aExp -= 0x3C01; 4686 return roundAndPackFloat64(aSign, aExp, zSig, status); 4687 4688 } 4689 4690 /*---------------------------------------------------------------------------- 4691 | Returns the result of converting the extended double-precision floating- 4692 | point value `a' to the quadruple-precision floating-point format. The 4693 | conversion is performed according to the IEC/IEEE Standard for Binary 4694 | Floating-Point Arithmetic. 4695 *----------------------------------------------------------------------------*/ 4696 4697 float128 floatx80_to_float128(floatx80 a, float_status *status) 4698 { 4699 flag aSign; 4700 int aExp; 4701 uint64_t aSig, zSig0, zSig1; 4702 4703 if (floatx80_invalid_encoding(a)) { 4704 float_raise(float_flag_invalid, status); 4705 return float128_default_nan(status); 4706 } 4707 aSig = extractFloatx80Frac( a ); 4708 aExp = extractFloatx80Exp( a ); 4709 aSign = extractFloatx80Sign( a ); 4710 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4711 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4712 } 4713 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4714 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4715 4716 } 4717 4718 /*---------------------------------------------------------------------------- 4719 | Rounds the extended double-precision floating-point value `a' 4720 | to the precision provided by floatx80_rounding_precision and returns the 4721 | result as an extended double-precision floating-point value. 4722 | The operation is performed according to the IEC/IEEE Standard for Binary 4723 | Floating-Point Arithmetic. 4724 *----------------------------------------------------------------------------*/ 4725 4726 floatx80 floatx80_round(floatx80 a, float_status *status) 4727 { 4728 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4729 extractFloatx80Sign(a), 4730 extractFloatx80Exp(a), 4731 extractFloatx80Frac(a), 0, status); 4732 } 4733 4734 /*---------------------------------------------------------------------------- 4735 | Rounds the extended double-precision floating-point value `a' to an integer, 4736 | and returns the result as an extended quadruple-precision floating-point 4737 | value. The operation is performed according to the IEC/IEEE Standard for 4738 | Binary Floating-Point Arithmetic. 4739 *----------------------------------------------------------------------------*/ 4740 4741 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4742 { 4743 flag aSign; 4744 int32_t aExp; 4745 uint64_t lastBitMask, roundBitsMask; 4746 floatx80 z; 4747 4748 if (floatx80_invalid_encoding(a)) { 4749 float_raise(float_flag_invalid, status); 4750 return floatx80_default_nan(status); 4751 } 4752 aExp = extractFloatx80Exp( a ); 4753 if ( 0x403E <= aExp ) { 4754 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4755 return propagateFloatx80NaN(a, a, status); 4756 } 4757 return a; 4758 } 4759 if ( aExp < 0x3FFF ) { 4760 if ( ( aExp == 0 ) 4761 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4762 return a; 4763 } 4764 status->float_exception_flags |= float_flag_inexact; 4765 aSign = extractFloatx80Sign( a ); 4766 switch (status->float_rounding_mode) { 4767 case float_round_nearest_even: 4768 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4769 ) { 4770 return 4771 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4772 } 4773 break; 4774 case float_round_ties_away: 4775 if (aExp == 0x3FFE) { 4776 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4777 } 4778 break; 4779 case float_round_down: 4780 return 4781 aSign ? 4782 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4783 : packFloatx80( 0, 0, 0 ); 4784 case float_round_up: 4785 return 4786 aSign ? packFloatx80( 1, 0, 0 ) 4787 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4788 } 4789 return packFloatx80( aSign, 0, 0 ); 4790 } 4791 lastBitMask = 1; 4792 lastBitMask <<= 0x403E - aExp; 4793 roundBitsMask = lastBitMask - 1; 4794 z = a; 4795 switch (status->float_rounding_mode) { 4796 case float_round_nearest_even: 4797 z.low += lastBitMask>>1; 4798 if ((z.low & roundBitsMask) == 0) { 4799 z.low &= ~lastBitMask; 4800 } 4801 break; 4802 case float_round_ties_away: 4803 z.low += lastBitMask >> 1; 4804 break; 4805 case float_round_to_zero: 4806 break; 4807 case float_round_up: 4808 if (!extractFloatx80Sign(z)) { 4809 z.low += roundBitsMask; 4810 } 4811 break; 4812 case float_round_down: 4813 if (extractFloatx80Sign(z)) { 4814 z.low += roundBitsMask; 4815 } 4816 break; 4817 default: 4818 abort(); 4819 } 4820 z.low &= ~ roundBitsMask; 4821 if ( z.low == 0 ) { 4822 ++z.high; 4823 z.low = LIT64( 0x8000000000000000 ); 4824 } 4825 if (z.low != a.low) { 4826 status->float_exception_flags |= float_flag_inexact; 4827 } 4828 return z; 4829 4830 } 4831 4832 /*---------------------------------------------------------------------------- 4833 | Returns the result of adding the absolute values of the extended double- 4834 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4835 | negated before being returned. `zSign' is ignored if the result is a NaN. 4836 | The addition is performed according to the IEC/IEEE Standard for Binary 4837 | Floating-Point Arithmetic. 4838 *----------------------------------------------------------------------------*/ 4839 4840 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4841 float_status *status) 4842 { 4843 int32_t aExp, bExp, zExp; 4844 uint64_t aSig, bSig, zSig0, zSig1; 4845 int32_t expDiff; 4846 4847 aSig = extractFloatx80Frac( a ); 4848 aExp = extractFloatx80Exp( a ); 4849 bSig = extractFloatx80Frac( b ); 4850 bExp = extractFloatx80Exp( b ); 4851 expDiff = aExp - bExp; 4852 if ( 0 < expDiff ) { 4853 if ( aExp == 0x7FFF ) { 4854 if ((uint64_t)(aSig << 1)) { 4855 return propagateFloatx80NaN(a, b, status); 4856 } 4857 return a; 4858 } 4859 if ( bExp == 0 ) --expDiff; 4860 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4861 zExp = aExp; 4862 } 4863 else if ( expDiff < 0 ) { 4864 if ( bExp == 0x7FFF ) { 4865 if ((uint64_t)(bSig << 1)) { 4866 return propagateFloatx80NaN(a, b, status); 4867 } 4868 return packFloatx80(zSign, 4869 floatx80_infinity_high, 4870 floatx80_infinity_low); 4871 } 4872 if ( aExp == 0 ) ++expDiff; 4873 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4874 zExp = bExp; 4875 } 4876 else { 4877 if ( aExp == 0x7FFF ) { 4878 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4879 return propagateFloatx80NaN(a, b, status); 4880 } 4881 return a; 4882 } 4883 zSig1 = 0; 4884 zSig0 = aSig + bSig; 4885 if ( aExp == 0 ) { 4886 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4887 goto roundAndPack; 4888 } 4889 zExp = aExp; 4890 goto shiftRight1; 4891 } 4892 zSig0 = aSig + bSig; 4893 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4894 shiftRight1: 4895 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4896 zSig0 |= LIT64( 0x8000000000000000 ); 4897 ++zExp; 4898 roundAndPack: 4899 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4900 zSign, zExp, zSig0, zSig1, status); 4901 } 4902 4903 /*---------------------------------------------------------------------------- 4904 | Returns the result of subtracting the absolute values of the extended 4905 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4906 | difference is negated before being returned. `zSign' is ignored if the 4907 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4908 | Standard for Binary Floating-Point Arithmetic. 4909 *----------------------------------------------------------------------------*/ 4910 4911 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4912 float_status *status) 4913 { 4914 int32_t aExp, bExp, zExp; 4915 uint64_t aSig, bSig, zSig0, zSig1; 4916 int32_t expDiff; 4917 4918 aSig = extractFloatx80Frac( a ); 4919 aExp = extractFloatx80Exp( a ); 4920 bSig = extractFloatx80Frac( b ); 4921 bExp = extractFloatx80Exp( b ); 4922 expDiff = aExp - bExp; 4923 if ( 0 < expDiff ) goto aExpBigger; 4924 if ( expDiff < 0 ) goto bExpBigger; 4925 if ( aExp == 0x7FFF ) { 4926 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4927 return propagateFloatx80NaN(a, b, status); 4928 } 4929 float_raise(float_flag_invalid, status); 4930 return floatx80_default_nan(status); 4931 } 4932 if ( aExp == 0 ) { 4933 aExp = 1; 4934 bExp = 1; 4935 } 4936 zSig1 = 0; 4937 if ( bSig < aSig ) goto aBigger; 4938 if ( aSig < bSig ) goto bBigger; 4939 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4940 bExpBigger: 4941 if ( bExp == 0x7FFF ) { 4942 if ((uint64_t)(bSig << 1)) { 4943 return propagateFloatx80NaN(a, b, status); 4944 } 4945 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 4946 floatx80_infinity_low); 4947 } 4948 if ( aExp == 0 ) ++expDiff; 4949 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4950 bBigger: 4951 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4952 zExp = bExp; 4953 zSign ^= 1; 4954 goto normalizeRoundAndPack; 4955 aExpBigger: 4956 if ( aExp == 0x7FFF ) { 4957 if ((uint64_t)(aSig << 1)) { 4958 return propagateFloatx80NaN(a, b, status); 4959 } 4960 return a; 4961 } 4962 if ( bExp == 0 ) --expDiff; 4963 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4964 aBigger: 4965 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4966 zExp = aExp; 4967 normalizeRoundAndPack: 4968 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4969 zSign, zExp, zSig0, zSig1, status); 4970 } 4971 4972 /*---------------------------------------------------------------------------- 4973 | Returns the result of adding the extended double-precision floating-point 4974 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4975 | Standard for Binary Floating-Point Arithmetic. 4976 *----------------------------------------------------------------------------*/ 4977 4978 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4979 { 4980 flag aSign, bSign; 4981 4982 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4983 float_raise(float_flag_invalid, status); 4984 return floatx80_default_nan(status); 4985 } 4986 aSign = extractFloatx80Sign( a ); 4987 bSign = extractFloatx80Sign( b ); 4988 if ( aSign == bSign ) { 4989 return addFloatx80Sigs(a, b, aSign, status); 4990 } 4991 else { 4992 return subFloatx80Sigs(a, b, aSign, status); 4993 } 4994 4995 } 4996 4997 /*---------------------------------------------------------------------------- 4998 | Returns the result of subtracting the extended double-precision floating- 4999 | point values `a' and `b'. The operation is performed according to the 5000 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5001 *----------------------------------------------------------------------------*/ 5002 5003 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5004 { 5005 flag aSign, bSign; 5006 5007 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5008 float_raise(float_flag_invalid, status); 5009 return floatx80_default_nan(status); 5010 } 5011 aSign = extractFloatx80Sign( a ); 5012 bSign = extractFloatx80Sign( b ); 5013 if ( aSign == bSign ) { 5014 return subFloatx80Sigs(a, b, aSign, status); 5015 } 5016 else { 5017 return addFloatx80Sigs(a, b, aSign, status); 5018 } 5019 5020 } 5021 5022 /*---------------------------------------------------------------------------- 5023 | Returns the result of multiplying the extended double-precision floating- 5024 | point values `a' and `b'. The operation is performed according to the 5025 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5026 *----------------------------------------------------------------------------*/ 5027 5028 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5029 { 5030 flag aSign, bSign, zSign; 5031 int32_t aExp, bExp, zExp; 5032 uint64_t aSig, bSig, zSig0, zSig1; 5033 5034 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5035 float_raise(float_flag_invalid, status); 5036 return floatx80_default_nan(status); 5037 } 5038 aSig = extractFloatx80Frac( a ); 5039 aExp = extractFloatx80Exp( a ); 5040 aSign = extractFloatx80Sign( a ); 5041 bSig = extractFloatx80Frac( b ); 5042 bExp = extractFloatx80Exp( b ); 5043 bSign = extractFloatx80Sign( b ); 5044 zSign = aSign ^ bSign; 5045 if ( aExp == 0x7FFF ) { 5046 if ( (uint64_t) ( aSig<<1 ) 5047 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5048 return propagateFloatx80NaN(a, b, status); 5049 } 5050 if ( ( bExp | bSig ) == 0 ) goto invalid; 5051 return packFloatx80(zSign, floatx80_infinity_high, 5052 floatx80_infinity_low); 5053 } 5054 if ( bExp == 0x7FFF ) { 5055 if ((uint64_t)(bSig << 1)) { 5056 return propagateFloatx80NaN(a, b, status); 5057 } 5058 if ( ( aExp | aSig ) == 0 ) { 5059 invalid: 5060 float_raise(float_flag_invalid, status); 5061 return floatx80_default_nan(status); 5062 } 5063 return packFloatx80(zSign, floatx80_infinity_high, 5064 floatx80_infinity_low); 5065 } 5066 if ( aExp == 0 ) { 5067 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5068 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5069 } 5070 if ( bExp == 0 ) { 5071 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5072 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5073 } 5074 zExp = aExp + bExp - 0x3FFE; 5075 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5076 if ( 0 < (int64_t) zSig0 ) { 5077 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5078 --zExp; 5079 } 5080 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5081 zSign, zExp, zSig0, zSig1, status); 5082 } 5083 5084 /*---------------------------------------------------------------------------- 5085 | Returns the result of dividing the extended double-precision floating-point 5086 | value `a' by the corresponding value `b'. The operation is performed 5087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5088 *----------------------------------------------------------------------------*/ 5089 5090 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5091 { 5092 flag aSign, bSign, zSign; 5093 int32_t aExp, bExp, zExp; 5094 uint64_t aSig, bSig, zSig0, zSig1; 5095 uint64_t rem0, rem1, rem2, term0, term1, term2; 5096 5097 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5098 float_raise(float_flag_invalid, status); 5099 return floatx80_default_nan(status); 5100 } 5101 aSig = extractFloatx80Frac( a ); 5102 aExp = extractFloatx80Exp( a ); 5103 aSign = extractFloatx80Sign( a ); 5104 bSig = extractFloatx80Frac( b ); 5105 bExp = extractFloatx80Exp( b ); 5106 bSign = extractFloatx80Sign( b ); 5107 zSign = aSign ^ bSign; 5108 if ( aExp == 0x7FFF ) { 5109 if ((uint64_t)(aSig << 1)) { 5110 return propagateFloatx80NaN(a, b, status); 5111 } 5112 if ( bExp == 0x7FFF ) { 5113 if ((uint64_t)(bSig << 1)) { 5114 return propagateFloatx80NaN(a, b, status); 5115 } 5116 goto invalid; 5117 } 5118 return packFloatx80(zSign, floatx80_infinity_high, 5119 floatx80_infinity_low); 5120 } 5121 if ( bExp == 0x7FFF ) { 5122 if ((uint64_t)(bSig << 1)) { 5123 return propagateFloatx80NaN(a, b, status); 5124 } 5125 return packFloatx80( zSign, 0, 0 ); 5126 } 5127 if ( bExp == 0 ) { 5128 if ( bSig == 0 ) { 5129 if ( ( aExp | aSig ) == 0 ) { 5130 invalid: 5131 float_raise(float_flag_invalid, status); 5132 return floatx80_default_nan(status); 5133 } 5134 float_raise(float_flag_divbyzero, status); 5135 return packFloatx80(zSign, floatx80_infinity_high, 5136 floatx80_infinity_low); 5137 } 5138 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5139 } 5140 if ( aExp == 0 ) { 5141 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5142 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5143 } 5144 zExp = aExp - bExp + 0x3FFE; 5145 rem1 = 0; 5146 if ( bSig <= aSig ) { 5147 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5148 ++zExp; 5149 } 5150 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5151 mul64To128( bSig, zSig0, &term0, &term1 ); 5152 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5153 while ( (int64_t) rem0 < 0 ) { 5154 --zSig0; 5155 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5156 } 5157 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5158 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5159 mul64To128( bSig, zSig1, &term1, &term2 ); 5160 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5161 while ( (int64_t) rem1 < 0 ) { 5162 --zSig1; 5163 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5164 } 5165 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5166 } 5167 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5168 zSign, zExp, zSig0, zSig1, status); 5169 } 5170 5171 /*---------------------------------------------------------------------------- 5172 | Returns the remainder of the extended double-precision floating-point value 5173 | `a' with respect to the corresponding value `b'. The operation is performed 5174 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5175 *----------------------------------------------------------------------------*/ 5176 5177 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5178 { 5179 flag aSign, zSign; 5180 int32_t aExp, bExp, expDiff; 5181 uint64_t aSig0, aSig1, bSig; 5182 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5183 5184 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5185 float_raise(float_flag_invalid, status); 5186 return floatx80_default_nan(status); 5187 } 5188 aSig0 = extractFloatx80Frac( a ); 5189 aExp = extractFloatx80Exp( a ); 5190 aSign = extractFloatx80Sign( a ); 5191 bSig = extractFloatx80Frac( b ); 5192 bExp = extractFloatx80Exp( b ); 5193 if ( aExp == 0x7FFF ) { 5194 if ( (uint64_t) ( aSig0<<1 ) 5195 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5196 return propagateFloatx80NaN(a, b, status); 5197 } 5198 goto invalid; 5199 } 5200 if ( bExp == 0x7FFF ) { 5201 if ((uint64_t)(bSig << 1)) { 5202 return propagateFloatx80NaN(a, b, status); 5203 } 5204 return a; 5205 } 5206 if ( bExp == 0 ) { 5207 if ( bSig == 0 ) { 5208 invalid: 5209 float_raise(float_flag_invalid, status); 5210 return floatx80_default_nan(status); 5211 } 5212 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5213 } 5214 if ( aExp == 0 ) { 5215 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5216 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5217 } 5218 bSig |= LIT64( 0x8000000000000000 ); 5219 zSign = aSign; 5220 expDiff = aExp - bExp; 5221 aSig1 = 0; 5222 if ( expDiff < 0 ) { 5223 if ( expDiff < -1 ) return a; 5224 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5225 expDiff = 0; 5226 } 5227 q = ( bSig <= aSig0 ); 5228 if ( q ) aSig0 -= bSig; 5229 expDiff -= 64; 5230 while ( 0 < expDiff ) { 5231 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5232 q = ( 2 < q ) ? q - 2 : 0; 5233 mul64To128( bSig, q, &term0, &term1 ); 5234 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5235 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5236 expDiff -= 62; 5237 } 5238 expDiff += 64; 5239 if ( 0 < expDiff ) { 5240 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5241 q = ( 2 < q ) ? q - 2 : 0; 5242 q >>= 64 - expDiff; 5243 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5244 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5245 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5246 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5247 ++q; 5248 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5249 } 5250 } 5251 else { 5252 term1 = 0; 5253 term0 = bSig; 5254 } 5255 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5256 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5257 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5258 && ( q & 1 ) ) 5259 ) { 5260 aSig0 = alternateASig0; 5261 aSig1 = alternateASig1; 5262 zSign = ! zSign; 5263 } 5264 return 5265 normalizeRoundAndPackFloatx80( 5266 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5267 5268 } 5269 5270 /*---------------------------------------------------------------------------- 5271 | Returns the square root of the extended double-precision floating-point 5272 | value `a'. The operation is performed according to the IEC/IEEE Standard 5273 | for Binary Floating-Point Arithmetic. 5274 *----------------------------------------------------------------------------*/ 5275 5276 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5277 { 5278 flag aSign; 5279 int32_t aExp, zExp; 5280 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5281 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5282 5283 if (floatx80_invalid_encoding(a)) { 5284 float_raise(float_flag_invalid, status); 5285 return floatx80_default_nan(status); 5286 } 5287 aSig0 = extractFloatx80Frac( a ); 5288 aExp = extractFloatx80Exp( a ); 5289 aSign = extractFloatx80Sign( a ); 5290 if ( aExp == 0x7FFF ) { 5291 if ((uint64_t)(aSig0 << 1)) { 5292 return propagateFloatx80NaN(a, a, status); 5293 } 5294 if ( ! aSign ) return a; 5295 goto invalid; 5296 } 5297 if ( aSign ) { 5298 if ( ( aExp | aSig0 ) == 0 ) return a; 5299 invalid: 5300 float_raise(float_flag_invalid, status); 5301 return floatx80_default_nan(status); 5302 } 5303 if ( aExp == 0 ) { 5304 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5305 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5306 } 5307 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5308 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5309 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5310 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5311 doubleZSig0 = zSig0<<1; 5312 mul64To128( zSig0, zSig0, &term0, &term1 ); 5313 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5314 while ( (int64_t) rem0 < 0 ) { 5315 --zSig0; 5316 doubleZSig0 -= 2; 5317 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5318 } 5319 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5320 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5321 if ( zSig1 == 0 ) zSig1 = 1; 5322 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5323 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5324 mul64To128( zSig1, zSig1, &term2, &term3 ); 5325 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5326 while ( (int64_t) rem1 < 0 ) { 5327 --zSig1; 5328 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5329 term3 |= 1; 5330 term2 |= doubleZSig0; 5331 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5332 } 5333 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5334 } 5335 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5336 zSig0 |= doubleZSig0; 5337 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5338 0, zExp, zSig0, zSig1, status); 5339 } 5340 5341 /*---------------------------------------------------------------------------- 5342 | Returns 1 if the extended double-precision floating-point value `a' is equal 5343 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5344 | raised if either operand is a NaN. Otherwise, the comparison is performed 5345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5346 *----------------------------------------------------------------------------*/ 5347 5348 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5349 { 5350 5351 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5352 || (extractFloatx80Exp(a) == 0x7FFF 5353 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5354 || (extractFloatx80Exp(b) == 0x7FFF 5355 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5356 ) { 5357 float_raise(float_flag_invalid, status); 5358 return 0; 5359 } 5360 return 5361 ( a.low == b.low ) 5362 && ( ( a.high == b.high ) 5363 || ( ( a.low == 0 ) 5364 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5365 ); 5366 5367 } 5368 5369 /*---------------------------------------------------------------------------- 5370 | Returns 1 if the extended double-precision floating-point value `a' is 5371 | less than or equal to the corresponding value `b', and 0 otherwise. The 5372 | invalid exception is raised if either operand is a NaN. The comparison is 5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5374 | Arithmetic. 5375 *----------------------------------------------------------------------------*/ 5376 5377 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5378 { 5379 flag aSign, bSign; 5380 5381 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5382 || (extractFloatx80Exp(a) == 0x7FFF 5383 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5384 || (extractFloatx80Exp(b) == 0x7FFF 5385 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5386 ) { 5387 float_raise(float_flag_invalid, status); 5388 return 0; 5389 } 5390 aSign = extractFloatx80Sign( a ); 5391 bSign = extractFloatx80Sign( b ); 5392 if ( aSign != bSign ) { 5393 return 5394 aSign 5395 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5396 == 0 ); 5397 } 5398 return 5399 aSign ? le128( b.high, b.low, a.high, a.low ) 5400 : le128( a.high, a.low, b.high, b.low ); 5401 5402 } 5403 5404 /*---------------------------------------------------------------------------- 5405 | Returns 1 if the extended double-precision floating-point value `a' is 5406 | less than the corresponding value `b', and 0 otherwise. The invalid 5407 | exception is raised if either operand is a NaN. The comparison is performed 5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5409 *----------------------------------------------------------------------------*/ 5410 5411 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5412 { 5413 flag aSign, bSign; 5414 5415 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5416 || (extractFloatx80Exp(a) == 0x7FFF 5417 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5418 || (extractFloatx80Exp(b) == 0x7FFF 5419 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5420 ) { 5421 float_raise(float_flag_invalid, status); 5422 return 0; 5423 } 5424 aSign = extractFloatx80Sign( a ); 5425 bSign = extractFloatx80Sign( b ); 5426 if ( aSign != bSign ) { 5427 return 5428 aSign 5429 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5430 != 0 ); 5431 } 5432 return 5433 aSign ? lt128( b.high, b.low, a.high, a.low ) 5434 : lt128( a.high, a.low, b.high, b.low ); 5435 5436 } 5437 5438 /*---------------------------------------------------------------------------- 5439 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5440 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5441 | either operand is a NaN. The comparison is performed according to the 5442 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5443 *----------------------------------------------------------------------------*/ 5444 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5445 { 5446 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5447 || (extractFloatx80Exp(a) == 0x7FFF 5448 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5449 || (extractFloatx80Exp(b) == 0x7FFF 5450 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5451 ) { 5452 float_raise(float_flag_invalid, status); 5453 return 1; 5454 } 5455 return 0; 5456 } 5457 5458 /*---------------------------------------------------------------------------- 5459 | Returns 1 if the extended double-precision floating-point value `a' is 5460 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5461 | cause an exception. The comparison is performed according to the IEC/IEEE 5462 | Standard for Binary Floating-Point Arithmetic. 5463 *----------------------------------------------------------------------------*/ 5464 5465 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5466 { 5467 5468 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5469 float_raise(float_flag_invalid, status); 5470 return 0; 5471 } 5472 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5473 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5474 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5475 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5476 ) { 5477 if (floatx80_is_signaling_nan(a, status) 5478 || floatx80_is_signaling_nan(b, status)) { 5479 float_raise(float_flag_invalid, status); 5480 } 5481 return 0; 5482 } 5483 return 5484 ( a.low == b.low ) 5485 && ( ( a.high == b.high ) 5486 || ( ( a.low == 0 ) 5487 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5488 ); 5489 5490 } 5491 5492 /*---------------------------------------------------------------------------- 5493 | Returns 1 if the extended double-precision floating-point value `a' is less 5494 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5495 | do not cause an exception. Otherwise, the comparison is performed according 5496 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5497 *----------------------------------------------------------------------------*/ 5498 5499 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5500 { 5501 flag aSign, bSign; 5502 5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5504 float_raise(float_flag_invalid, status); 5505 return 0; 5506 } 5507 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5508 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5509 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5510 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5511 ) { 5512 if (floatx80_is_signaling_nan(a, status) 5513 || floatx80_is_signaling_nan(b, status)) { 5514 float_raise(float_flag_invalid, status); 5515 } 5516 return 0; 5517 } 5518 aSign = extractFloatx80Sign( a ); 5519 bSign = extractFloatx80Sign( b ); 5520 if ( aSign != bSign ) { 5521 return 5522 aSign 5523 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5524 == 0 ); 5525 } 5526 return 5527 aSign ? le128( b.high, b.low, a.high, a.low ) 5528 : le128( a.high, a.low, b.high, b.low ); 5529 5530 } 5531 5532 /*---------------------------------------------------------------------------- 5533 | Returns 1 if the extended double-precision floating-point value `a' is less 5534 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5535 | an exception. Otherwise, the comparison is performed according to the 5536 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5537 *----------------------------------------------------------------------------*/ 5538 5539 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5540 { 5541 flag aSign, bSign; 5542 5543 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5544 float_raise(float_flag_invalid, status); 5545 return 0; 5546 } 5547 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5548 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5549 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5550 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5551 ) { 5552 if (floatx80_is_signaling_nan(a, status) 5553 || floatx80_is_signaling_nan(b, status)) { 5554 float_raise(float_flag_invalid, status); 5555 } 5556 return 0; 5557 } 5558 aSign = extractFloatx80Sign( a ); 5559 bSign = extractFloatx80Sign( b ); 5560 if ( aSign != bSign ) { 5561 return 5562 aSign 5563 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5564 != 0 ); 5565 } 5566 return 5567 aSign ? lt128( b.high, b.low, a.high, a.low ) 5568 : lt128( a.high, a.low, b.high, b.low ); 5569 5570 } 5571 5572 /*---------------------------------------------------------------------------- 5573 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5574 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5575 | The comparison is performed according to the IEC/IEEE Standard for Binary 5576 | Floating-Point Arithmetic. 5577 *----------------------------------------------------------------------------*/ 5578 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5579 { 5580 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5581 float_raise(float_flag_invalid, status); 5582 return 1; 5583 } 5584 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5585 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5586 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5587 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5588 ) { 5589 if (floatx80_is_signaling_nan(a, status) 5590 || floatx80_is_signaling_nan(b, status)) { 5591 float_raise(float_flag_invalid, status); 5592 } 5593 return 1; 5594 } 5595 return 0; 5596 } 5597 5598 /*---------------------------------------------------------------------------- 5599 | Returns the result of converting the quadruple-precision floating-point 5600 | value `a' to the 32-bit two's complement integer format. The conversion 5601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5602 | Arithmetic---which means in particular that the conversion is rounded 5603 | according to the current rounding mode. If `a' is a NaN, the largest 5604 | positive integer is returned. Otherwise, if the conversion overflows, the 5605 | largest integer with the same sign as `a' is returned. 5606 *----------------------------------------------------------------------------*/ 5607 5608 int32_t float128_to_int32(float128 a, float_status *status) 5609 { 5610 flag aSign; 5611 int32_t aExp, shiftCount; 5612 uint64_t aSig0, aSig1; 5613 5614 aSig1 = extractFloat128Frac1( a ); 5615 aSig0 = extractFloat128Frac0( a ); 5616 aExp = extractFloat128Exp( a ); 5617 aSign = extractFloat128Sign( a ); 5618 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5619 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5620 aSig0 |= ( aSig1 != 0 ); 5621 shiftCount = 0x4028 - aExp; 5622 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5623 return roundAndPackInt32(aSign, aSig0, status); 5624 5625 } 5626 5627 /*---------------------------------------------------------------------------- 5628 | Returns the result of converting the quadruple-precision floating-point 5629 | value `a' to the 32-bit two's complement integer format. The conversion 5630 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5631 | Arithmetic, except that the conversion is always rounded toward zero. If 5632 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5633 | conversion overflows, the largest integer with the same sign as `a' is 5634 | returned. 5635 *----------------------------------------------------------------------------*/ 5636 5637 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5638 { 5639 flag aSign; 5640 int32_t aExp, shiftCount; 5641 uint64_t aSig0, aSig1, savedASig; 5642 int32_t z; 5643 5644 aSig1 = extractFloat128Frac1( a ); 5645 aSig0 = extractFloat128Frac0( a ); 5646 aExp = extractFloat128Exp( a ); 5647 aSign = extractFloat128Sign( a ); 5648 aSig0 |= ( aSig1 != 0 ); 5649 if ( 0x401E < aExp ) { 5650 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5651 goto invalid; 5652 } 5653 else if ( aExp < 0x3FFF ) { 5654 if (aExp || aSig0) { 5655 status->float_exception_flags |= float_flag_inexact; 5656 } 5657 return 0; 5658 } 5659 aSig0 |= LIT64( 0x0001000000000000 ); 5660 shiftCount = 0x402F - aExp; 5661 savedASig = aSig0; 5662 aSig0 >>= shiftCount; 5663 z = aSig0; 5664 if ( aSign ) z = - z; 5665 if ( ( z < 0 ) ^ aSign ) { 5666 invalid: 5667 float_raise(float_flag_invalid, status); 5668 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5669 } 5670 if ( ( aSig0<<shiftCount ) != savedASig ) { 5671 status->float_exception_flags |= float_flag_inexact; 5672 } 5673 return z; 5674 5675 } 5676 5677 /*---------------------------------------------------------------------------- 5678 | Returns the result of converting the quadruple-precision floating-point 5679 | value `a' to the 64-bit two's complement integer format. The conversion 5680 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5681 | Arithmetic---which means in particular that the conversion is rounded 5682 | according to the current rounding mode. If `a' is a NaN, the largest 5683 | positive integer is returned. Otherwise, if the conversion overflows, the 5684 | largest integer with the same sign as `a' is returned. 5685 *----------------------------------------------------------------------------*/ 5686 5687 int64_t float128_to_int64(float128 a, float_status *status) 5688 { 5689 flag aSign; 5690 int32_t aExp, shiftCount; 5691 uint64_t aSig0, aSig1; 5692 5693 aSig1 = extractFloat128Frac1( a ); 5694 aSig0 = extractFloat128Frac0( a ); 5695 aExp = extractFloat128Exp( a ); 5696 aSign = extractFloat128Sign( a ); 5697 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5698 shiftCount = 0x402F - aExp; 5699 if ( shiftCount <= 0 ) { 5700 if ( 0x403E < aExp ) { 5701 float_raise(float_flag_invalid, status); 5702 if ( ! aSign 5703 || ( ( aExp == 0x7FFF ) 5704 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5705 ) 5706 ) { 5707 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5708 } 5709 return (int64_t) LIT64( 0x8000000000000000 ); 5710 } 5711 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5712 } 5713 else { 5714 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5715 } 5716 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5717 5718 } 5719 5720 /*---------------------------------------------------------------------------- 5721 | Returns the result of converting the quadruple-precision floating-point 5722 | value `a' to the 64-bit two's complement integer format. The conversion 5723 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5724 | Arithmetic, except that the conversion is always rounded toward zero. 5725 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5726 | the conversion overflows, the largest integer with the same sign as `a' is 5727 | returned. 5728 *----------------------------------------------------------------------------*/ 5729 5730 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5731 { 5732 flag aSign; 5733 int32_t aExp, shiftCount; 5734 uint64_t aSig0, aSig1; 5735 int64_t z; 5736 5737 aSig1 = extractFloat128Frac1( a ); 5738 aSig0 = extractFloat128Frac0( a ); 5739 aExp = extractFloat128Exp( a ); 5740 aSign = extractFloat128Sign( a ); 5741 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5742 shiftCount = aExp - 0x402F; 5743 if ( 0 < shiftCount ) { 5744 if ( 0x403E <= aExp ) { 5745 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5746 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5747 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5748 if (aSig1) { 5749 status->float_exception_flags |= float_flag_inexact; 5750 } 5751 } 5752 else { 5753 float_raise(float_flag_invalid, status); 5754 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5755 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5756 } 5757 } 5758 return (int64_t) LIT64( 0x8000000000000000 ); 5759 } 5760 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5761 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5762 status->float_exception_flags |= float_flag_inexact; 5763 } 5764 } 5765 else { 5766 if ( aExp < 0x3FFF ) { 5767 if ( aExp | aSig0 | aSig1 ) { 5768 status->float_exception_flags |= float_flag_inexact; 5769 } 5770 return 0; 5771 } 5772 z = aSig0>>( - shiftCount ); 5773 if ( aSig1 5774 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5775 status->float_exception_flags |= float_flag_inexact; 5776 } 5777 } 5778 if ( aSign ) z = - z; 5779 return z; 5780 5781 } 5782 5783 /*---------------------------------------------------------------------------- 5784 | Returns the result of converting the quadruple-precision floating-point value 5785 | `a' to the 64-bit unsigned integer format. The conversion is 5786 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5787 | Arithmetic---which means in particular that the conversion is rounded 5788 | according to the current rounding mode. If `a' is a NaN, the largest 5789 | positive integer is returned. If the conversion overflows, the 5790 | largest unsigned integer is returned. If 'a' is negative, the value is 5791 | rounded and zero is returned; negative values that do not round to zero 5792 | will raise the inexact exception. 5793 *----------------------------------------------------------------------------*/ 5794 5795 uint64_t float128_to_uint64(float128 a, float_status *status) 5796 { 5797 flag aSign; 5798 int aExp; 5799 int shiftCount; 5800 uint64_t aSig0, aSig1; 5801 5802 aSig0 = extractFloat128Frac0(a); 5803 aSig1 = extractFloat128Frac1(a); 5804 aExp = extractFloat128Exp(a); 5805 aSign = extractFloat128Sign(a); 5806 if (aSign && (aExp > 0x3FFE)) { 5807 float_raise(float_flag_invalid, status); 5808 if (float128_is_any_nan(a)) { 5809 return LIT64(0xFFFFFFFFFFFFFFFF); 5810 } else { 5811 return 0; 5812 } 5813 } 5814 if (aExp) { 5815 aSig0 |= LIT64(0x0001000000000000); 5816 } 5817 shiftCount = 0x402F - aExp; 5818 if (shiftCount <= 0) { 5819 if (0x403E < aExp) { 5820 float_raise(float_flag_invalid, status); 5821 return LIT64(0xFFFFFFFFFFFFFFFF); 5822 } 5823 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5824 } else { 5825 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5826 } 5827 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5828 } 5829 5830 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5831 { 5832 uint64_t v; 5833 signed char current_rounding_mode = status->float_rounding_mode; 5834 5835 set_float_rounding_mode(float_round_to_zero, status); 5836 v = float128_to_uint64(a, status); 5837 set_float_rounding_mode(current_rounding_mode, status); 5838 5839 return v; 5840 } 5841 5842 /*---------------------------------------------------------------------------- 5843 | Returns the result of converting the quadruple-precision floating-point 5844 | value `a' to the 32-bit unsigned integer format. The conversion 5845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5846 | Arithmetic except that the conversion is always rounded toward zero. 5847 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5848 | if the conversion overflows, the largest unsigned integer is returned. 5849 | If 'a' is negative, the value is rounded and zero is returned; negative 5850 | values that do not round to zero will raise the inexact exception. 5851 *----------------------------------------------------------------------------*/ 5852 5853 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5854 { 5855 uint64_t v; 5856 uint32_t res; 5857 int old_exc_flags = get_float_exception_flags(status); 5858 5859 v = float128_to_uint64_round_to_zero(a, status); 5860 if (v > 0xffffffff) { 5861 res = 0xffffffff; 5862 } else { 5863 return v; 5864 } 5865 set_float_exception_flags(old_exc_flags, status); 5866 float_raise(float_flag_invalid, status); 5867 return res; 5868 } 5869 5870 /*---------------------------------------------------------------------------- 5871 | Returns the result of converting the quadruple-precision floating-point 5872 | value `a' to the single-precision floating-point format. The conversion 5873 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5874 | Arithmetic. 5875 *----------------------------------------------------------------------------*/ 5876 5877 float32 float128_to_float32(float128 a, float_status *status) 5878 { 5879 flag aSign; 5880 int32_t aExp; 5881 uint64_t aSig0, aSig1; 5882 uint32_t zSig; 5883 5884 aSig1 = extractFloat128Frac1( a ); 5885 aSig0 = extractFloat128Frac0( a ); 5886 aExp = extractFloat128Exp( a ); 5887 aSign = extractFloat128Sign( a ); 5888 if ( aExp == 0x7FFF ) { 5889 if ( aSig0 | aSig1 ) { 5890 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5891 } 5892 return packFloat32( aSign, 0xFF, 0 ); 5893 } 5894 aSig0 |= ( aSig1 != 0 ); 5895 shift64RightJamming( aSig0, 18, &aSig0 ); 5896 zSig = aSig0; 5897 if ( aExp || zSig ) { 5898 zSig |= 0x40000000; 5899 aExp -= 0x3F81; 5900 } 5901 return roundAndPackFloat32(aSign, aExp, zSig, status); 5902 5903 } 5904 5905 /*---------------------------------------------------------------------------- 5906 | Returns the result of converting the quadruple-precision floating-point 5907 | value `a' to the double-precision floating-point format. The conversion 5908 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5909 | Arithmetic. 5910 *----------------------------------------------------------------------------*/ 5911 5912 float64 float128_to_float64(float128 a, float_status *status) 5913 { 5914 flag aSign; 5915 int32_t aExp; 5916 uint64_t aSig0, aSig1; 5917 5918 aSig1 = extractFloat128Frac1( a ); 5919 aSig0 = extractFloat128Frac0( a ); 5920 aExp = extractFloat128Exp( a ); 5921 aSign = extractFloat128Sign( a ); 5922 if ( aExp == 0x7FFF ) { 5923 if ( aSig0 | aSig1 ) { 5924 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5925 } 5926 return packFloat64( aSign, 0x7FF, 0 ); 5927 } 5928 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5929 aSig0 |= ( aSig1 != 0 ); 5930 if ( aExp || aSig0 ) { 5931 aSig0 |= LIT64( 0x4000000000000000 ); 5932 aExp -= 0x3C01; 5933 } 5934 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5935 5936 } 5937 5938 /*---------------------------------------------------------------------------- 5939 | Returns the result of converting the quadruple-precision floating-point 5940 | value `a' to the extended double-precision floating-point format. The 5941 | conversion is performed according to the IEC/IEEE Standard for Binary 5942 | Floating-Point Arithmetic. 5943 *----------------------------------------------------------------------------*/ 5944 5945 floatx80 float128_to_floatx80(float128 a, float_status *status) 5946 { 5947 flag aSign; 5948 int32_t aExp; 5949 uint64_t aSig0, aSig1; 5950 5951 aSig1 = extractFloat128Frac1( a ); 5952 aSig0 = extractFloat128Frac0( a ); 5953 aExp = extractFloat128Exp( a ); 5954 aSign = extractFloat128Sign( a ); 5955 if ( aExp == 0x7FFF ) { 5956 if ( aSig0 | aSig1 ) { 5957 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5958 } 5959 return packFloatx80(aSign, floatx80_infinity_high, 5960 floatx80_infinity_low); 5961 } 5962 if ( aExp == 0 ) { 5963 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5964 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5965 } 5966 else { 5967 aSig0 |= LIT64( 0x0001000000000000 ); 5968 } 5969 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5970 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5971 5972 } 5973 5974 /*---------------------------------------------------------------------------- 5975 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5976 | returns the result as a quadruple-precision floating-point value. The 5977 | operation is performed according to the IEC/IEEE Standard for Binary 5978 | Floating-Point Arithmetic. 5979 *----------------------------------------------------------------------------*/ 5980 5981 float128 float128_round_to_int(float128 a, float_status *status) 5982 { 5983 flag aSign; 5984 int32_t aExp; 5985 uint64_t lastBitMask, roundBitsMask; 5986 float128 z; 5987 5988 aExp = extractFloat128Exp( a ); 5989 if ( 0x402F <= aExp ) { 5990 if ( 0x406F <= aExp ) { 5991 if ( ( aExp == 0x7FFF ) 5992 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5993 ) { 5994 return propagateFloat128NaN(a, a, status); 5995 } 5996 return a; 5997 } 5998 lastBitMask = 1; 5999 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6000 roundBitsMask = lastBitMask - 1; 6001 z = a; 6002 switch (status->float_rounding_mode) { 6003 case float_round_nearest_even: 6004 if ( lastBitMask ) { 6005 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6006 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6007 } 6008 else { 6009 if ( (int64_t) z.low < 0 ) { 6010 ++z.high; 6011 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6012 } 6013 } 6014 break; 6015 case float_round_ties_away: 6016 if (lastBitMask) { 6017 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6018 } else { 6019 if ((int64_t) z.low < 0) { 6020 ++z.high; 6021 } 6022 } 6023 break; 6024 case float_round_to_zero: 6025 break; 6026 case float_round_up: 6027 if (!extractFloat128Sign(z)) { 6028 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6029 } 6030 break; 6031 case float_round_down: 6032 if (extractFloat128Sign(z)) { 6033 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6034 } 6035 break; 6036 default: 6037 abort(); 6038 } 6039 z.low &= ~ roundBitsMask; 6040 } 6041 else { 6042 if ( aExp < 0x3FFF ) { 6043 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6044 status->float_exception_flags |= float_flag_inexact; 6045 aSign = extractFloat128Sign( a ); 6046 switch (status->float_rounding_mode) { 6047 case float_round_nearest_even: 6048 if ( ( aExp == 0x3FFE ) 6049 && ( extractFloat128Frac0( a ) 6050 | extractFloat128Frac1( a ) ) 6051 ) { 6052 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6053 } 6054 break; 6055 case float_round_ties_away: 6056 if (aExp == 0x3FFE) { 6057 return packFloat128(aSign, 0x3FFF, 0, 0); 6058 } 6059 break; 6060 case float_round_down: 6061 return 6062 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6063 : packFloat128( 0, 0, 0, 0 ); 6064 case float_round_up: 6065 return 6066 aSign ? packFloat128( 1, 0, 0, 0 ) 6067 : packFloat128( 0, 0x3FFF, 0, 0 ); 6068 } 6069 return packFloat128( aSign, 0, 0, 0 ); 6070 } 6071 lastBitMask = 1; 6072 lastBitMask <<= 0x402F - aExp; 6073 roundBitsMask = lastBitMask - 1; 6074 z.low = 0; 6075 z.high = a.high; 6076 switch (status->float_rounding_mode) { 6077 case float_round_nearest_even: 6078 z.high += lastBitMask>>1; 6079 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6080 z.high &= ~ lastBitMask; 6081 } 6082 break; 6083 case float_round_ties_away: 6084 z.high += lastBitMask>>1; 6085 break; 6086 case float_round_to_zero: 6087 break; 6088 case float_round_up: 6089 if (!extractFloat128Sign(z)) { 6090 z.high |= ( a.low != 0 ); 6091 z.high += roundBitsMask; 6092 } 6093 break; 6094 case float_round_down: 6095 if (extractFloat128Sign(z)) { 6096 z.high |= (a.low != 0); 6097 z.high += roundBitsMask; 6098 } 6099 break; 6100 default: 6101 abort(); 6102 } 6103 z.high &= ~ roundBitsMask; 6104 } 6105 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6106 status->float_exception_flags |= float_flag_inexact; 6107 } 6108 return z; 6109 6110 } 6111 6112 /*---------------------------------------------------------------------------- 6113 | Returns the result of adding the absolute values of the quadruple-precision 6114 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6115 | before being returned. `zSign' is ignored if the result is a NaN. 6116 | The addition is performed according to the IEC/IEEE Standard for Binary 6117 | Floating-Point Arithmetic. 6118 *----------------------------------------------------------------------------*/ 6119 6120 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6121 float_status *status) 6122 { 6123 int32_t aExp, bExp, zExp; 6124 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6125 int32_t expDiff; 6126 6127 aSig1 = extractFloat128Frac1( a ); 6128 aSig0 = extractFloat128Frac0( a ); 6129 aExp = extractFloat128Exp( a ); 6130 bSig1 = extractFloat128Frac1( b ); 6131 bSig0 = extractFloat128Frac0( b ); 6132 bExp = extractFloat128Exp( b ); 6133 expDiff = aExp - bExp; 6134 if ( 0 < expDiff ) { 6135 if ( aExp == 0x7FFF ) { 6136 if (aSig0 | aSig1) { 6137 return propagateFloat128NaN(a, b, status); 6138 } 6139 return a; 6140 } 6141 if ( bExp == 0 ) { 6142 --expDiff; 6143 } 6144 else { 6145 bSig0 |= LIT64( 0x0001000000000000 ); 6146 } 6147 shift128ExtraRightJamming( 6148 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6149 zExp = aExp; 6150 } 6151 else if ( expDiff < 0 ) { 6152 if ( bExp == 0x7FFF ) { 6153 if (bSig0 | bSig1) { 6154 return propagateFloat128NaN(a, b, status); 6155 } 6156 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6157 } 6158 if ( aExp == 0 ) { 6159 ++expDiff; 6160 } 6161 else { 6162 aSig0 |= LIT64( 0x0001000000000000 ); 6163 } 6164 shift128ExtraRightJamming( 6165 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6166 zExp = bExp; 6167 } 6168 else { 6169 if ( aExp == 0x7FFF ) { 6170 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6171 return propagateFloat128NaN(a, b, status); 6172 } 6173 return a; 6174 } 6175 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6176 if ( aExp == 0 ) { 6177 if (status->flush_to_zero) { 6178 if (zSig0 | zSig1) { 6179 float_raise(float_flag_output_denormal, status); 6180 } 6181 return packFloat128(zSign, 0, 0, 0); 6182 } 6183 return packFloat128( zSign, 0, zSig0, zSig1 ); 6184 } 6185 zSig2 = 0; 6186 zSig0 |= LIT64( 0x0002000000000000 ); 6187 zExp = aExp; 6188 goto shiftRight1; 6189 } 6190 aSig0 |= LIT64( 0x0001000000000000 ); 6191 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6192 --zExp; 6193 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6194 ++zExp; 6195 shiftRight1: 6196 shift128ExtraRightJamming( 6197 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6198 roundAndPack: 6199 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6200 6201 } 6202 6203 /*---------------------------------------------------------------------------- 6204 | Returns the result of subtracting the absolute values of the quadruple- 6205 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6206 | difference is negated before being returned. `zSign' is ignored if the 6207 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6208 | Standard for Binary Floating-Point Arithmetic. 6209 *----------------------------------------------------------------------------*/ 6210 6211 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6212 float_status *status) 6213 { 6214 int32_t aExp, bExp, zExp; 6215 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6216 int32_t expDiff; 6217 6218 aSig1 = extractFloat128Frac1( a ); 6219 aSig0 = extractFloat128Frac0( a ); 6220 aExp = extractFloat128Exp( a ); 6221 bSig1 = extractFloat128Frac1( b ); 6222 bSig0 = extractFloat128Frac0( b ); 6223 bExp = extractFloat128Exp( b ); 6224 expDiff = aExp - bExp; 6225 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6226 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6227 if ( 0 < expDiff ) goto aExpBigger; 6228 if ( expDiff < 0 ) goto bExpBigger; 6229 if ( aExp == 0x7FFF ) { 6230 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6231 return propagateFloat128NaN(a, b, status); 6232 } 6233 float_raise(float_flag_invalid, status); 6234 return float128_default_nan(status); 6235 } 6236 if ( aExp == 0 ) { 6237 aExp = 1; 6238 bExp = 1; 6239 } 6240 if ( bSig0 < aSig0 ) goto aBigger; 6241 if ( aSig0 < bSig0 ) goto bBigger; 6242 if ( bSig1 < aSig1 ) goto aBigger; 6243 if ( aSig1 < bSig1 ) goto bBigger; 6244 return packFloat128(status->float_rounding_mode == float_round_down, 6245 0, 0, 0); 6246 bExpBigger: 6247 if ( bExp == 0x7FFF ) { 6248 if (bSig0 | bSig1) { 6249 return propagateFloat128NaN(a, b, status); 6250 } 6251 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6252 } 6253 if ( aExp == 0 ) { 6254 ++expDiff; 6255 } 6256 else { 6257 aSig0 |= LIT64( 0x4000000000000000 ); 6258 } 6259 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6260 bSig0 |= LIT64( 0x4000000000000000 ); 6261 bBigger: 6262 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6263 zExp = bExp; 6264 zSign ^= 1; 6265 goto normalizeRoundAndPack; 6266 aExpBigger: 6267 if ( aExp == 0x7FFF ) { 6268 if (aSig0 | aSig1) { 6269 return propagateFloat128NaN(a, b, status); 6270 } 6271 return a; 6272 } 6273 if ( bExp == 0 ) { 6274 --expDiff; 6275 } 6276 else { 6277 bSig0 |= LIT64( 0x4000000000000000 ); 6278 } 6279 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6280 aSig0 |= LIT64( 0x4000000000000000 ); 6281 aBigger: 6282 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6283 zExp = aExp; 6284 normalizeRoundAndPack: 6285 --zExp; 6286 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6287 status); 6288 6289 } 6290 6291 /*---------------------------------------------------------------------------- 6292 | Returns the result of adding the quadruple-precision floating-point values 6293 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6294 | for Binary Floating-Point Arithmetic. 6295 *----------------------------------------------------------------------------*/ 6296 6297 float128 float128_add(float128 a, float128 b, float_status *status) 6298 { 6299 flag aSign, bSign; 6300 6301 aSign = extractFloat128Sign( a ); 6302 bSign = extractFloat128Sign( b ); 6303 if ( aSign == bSign ) { 6304 return addFloat128Sigs(a, b, aSign, status); 6305 } 6306 else { 6307 return subFloat128Sigs(a, b, aSign, status); 6308 } 6309 6310 } 6311 6312 /*---------------------------------------------------------------------------- 6313 | Returns the result of subtracting the quadruple-precision floating-point 6314 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6315 | Standard for Binary Floating-Point Arithmetic. 6316 *----------------------------------------------------------------------------*/ 6317 6318 float128 float128_sub(float128 a, float128 b, float_status *status) 6319 { 6320 flag aSign, bSign; 6321 6322 aSign = extractFloat128Sign( a ); 6323 bSign = extractFloat128Sign( b ); 6324 if ( aSign == bSign ) { 6325 return subFloat128Sigs(a, b, aSign, status); 6326 } 6327 else { 6328 return addFloat128Sigs(a, b, aSign, status); 6329 } 6330 6331 } 6332 6333 /*---------------------------------------------------------------------------- 6334 | Returns the result of multiplying the quadruple-precision floating-point 6335 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6336 | Standard for Binary Floating-Point Arithmetic. 6337 *----------------------------------------------------------------------------*/ 6338 6339 float128 float128_mul(float128 a, float128 b, float_status *status) 6340 { 6341 flag aSign, bSign, zSign; 6342 int32_t aExp, bExp, zExp; 6343 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6344 6345 aSig1 = extractFloat128Frac1( a ); 6346 aSig0 = extractFloat128Frac0( a ); 6347 aExp = extractFloat128Exp( a ); 6348 aSign = extractFloat128Sign( a ); 6349 bSig1 = extractFloat128Frac1( b ); 6350 bSig0 = extractFloat128Frac0( b ); 6351 bExp = extractFloat128Exp( b ); 6352 bSign = extractFloat128Sign( b ); 6353 zSign = aSign ^ bSign; 6354 if ( aExp == 0x7FFF ) { 6355 if ( ( aSig0 | aSig1 ) 6356 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6357 return propagateFloat128NaN(a, b, status); 6358 } 6359 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6360 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6361 } 6362 if ( bExp == 0x7FFF ) { 6363 if (bSig0 | bSig1) { 6364 return propagateFloat128NaN(a, b, status); 6365 } 6366 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6367 invalid: 6368 float_raise(float_flag_invalid, status); 6369 return float128_default_nan(status); 6370 } 6371 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6372 } 6373 if ( aExp == 0 ) { 6374 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6375 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6376 } 6377 if ( bExp == 0 ) { 6378 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6379 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6380 } 6381 zExp = aExp + bExp - 0x4000; 6382 aSig0 |= LIT64( 0x0001000000000000 ); 6383 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6384 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6385 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6386 zSig2 |= ( zSig3 != 0 ); 6387 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6388 shift128ExtraRightJamming( 6389 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6390 ++zExp; 6391 } 6392 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6393 6394 } 6395 6396 /*---------------------------------------------------------------------------- 6397 | Returns the result of dividing the quadruple-precision floating-point value 6398 | `a' by the corresponding value `b'. The operation is performed according to 6399 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6400 *----------------------------------------------------------------------------*/ 6401 6402 float128 float128_div(float128 a, float128 b, float_status *status) 6403 { 6404 flag aSign, bSign, zSign; 6405 int32_t aExp, bExp, zExp; 6406 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6407 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6408 6409 aSig1 = extractFloat128Frac1( a ); 6410 aSig0 = extractFloat128Frac0( a ); 6411 aExp = extractFloat128Exp( a ); 6412 aSign = extractFloat128Sign( a ); 6413 bSig1 = extractFloat128Frac1( b ); 6414 bSig0 = extractFloat128Frac0( b ); 6415 bExp = extractFloat128Exp( b ); 6416 bSign = extractFloat128Sign( b ); 6417 zSign = aSign ^ bSign; 6418 if ( aExp == 0x7FFF ) { 6419 if (aSig0 | aSig1) { 6420 return propagateFloat128NaN(a, b, status); 6421 } 6422 if ( bExp == 0x7FFF ) { 6423 if (bSig0 | bSig1) { 6424 return propagateFloat128NaN(a, b, status); 6425 } 6426 goto invalid; 6427 } 6428 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6429 } 6430 if ( bExp == 0x7FFF ) { 6431 if (bSig0 | bSig1) { 6432 return propagateFloat128NaN(a, b, status); 6433 } 6434 return packFloat128( zSign, 0, 0, 0 ); 6435 } 6436 if ( bExp == 0 ) { 6437 if ( ( bSig0 | bSig1 ) == 0 ) { 6438 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6439 invalid: 6440 float_raise(float_flag_invalid, status); 6441 return float128_default_nan(status); 6442 } 6443 float_raise(float_flag_divbyzero, status); 6444 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6445 } 6446 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6447 } 6448 if ( aExp == 0 ) { 6449 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6450 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6451 } 6452 zExp = aExp - bExp + 0x3FFD; 6453 shortShift128Left( 6454 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6455 shortShift128Left( 6456 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6457 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6458 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6459 ++zExp; 6460 } 6461 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6462 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6463 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6464 while ( (int64_t) rem0 < 0 ) { 6465 --zSig0; 6466 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6467 } 6468 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6469 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6470 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6471 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6472 while ( (int64_t) rem1 < 0 ) { 6473 --zSig1; 6474 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6475 } 6476 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6477 } 6478 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6479 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6480 6481 } 6482 6483 /*---------------------------------------------------------------------------- 6484 | Returns the remainder of the quadruple-precision floating-point value `a' 6485 | with respect to the corresponding value `b'. The operation is performed 6486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6487 *----------------------------------------------------------------------------*/ 6488 6489 float128 float128_rem(float128 a, float128 b, float_status *status) 6490 { 6491 flag aSign, zSign; 6492 int32_t aExp, bExp, expDiff; 6493 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6494 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6495 int64_t sigMean0; 6496 6497 aSig1 = extractFloat128Frac1( a ); 6498 aSig0 = extractFloat128Frac0( a ); 6499 aExp = extractFloat128Exp( a ); 6500 aSign = extractFloat128Sign( a ); 6501 bSig1 = extractFloat128Frac1( b ); 6502 bSig0 = extractFloat128Frac0( b ); 6503 bExp = extractFloat128Exp( b ); 6504 if ( aExp == 0x7FFF ) { 6505 if ( ( aSig0 | aSig1 ) 6506 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6507 return propagateFloat128NaN(a, b, status); 6508 } 6509 goto invalid; 6510 } 6511 if ( bExp == 0x7FFF ) { 6512 if (bSig0 | bSig1) { 6513 return propagateFloat128NaN(a, b, status); 6514 } 6515 return a; 6516 } 6517 if ( bExp == 0 ) { 6518 if ( ( bSig0 | bSig1 ) == 0 ) { 6519 invalid: 6520 float_raise(float_flag_invalid, status); 6521 return float128_default_nan(status); 6522 } 6523 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6524 } 6525 if ( aExp == 0 ) { 6526 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6527 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6528 } 6529 expDiff = aExp - bExp; 6530 if ( expDiff < -1 ) return a; 6531 shortShift128Left( 6532 aSig0 | LIT64( 0x0001000000000000 ), 6533 aSig1, 6534 15 - ( expDiff < 0 ), 6535 &aSig0, 6536 &aSig1 6537 ); 6538 shortShift128Left( 6539 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6540 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6541 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6542 expDiff -= 64; 6543 while ( 0 < expDiff ) { 6544 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6545 q = ( 4 < q ) ? q - 4 : 0; 6546 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6547 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6548 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6549 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6550 expDiff -= 61; 6551 } 6552 if ( -64 < expDiff ) { 6553 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6554 q = ( 4 < q ) ? q - 4 : 0; 6555 q >>= - expDiff; 6556 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6557 expDiff += 52; 6558 if ( expDiff < 0 ) { 6559 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6560 } 6561 else { 6562 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6563 } 6564 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6565 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6566 } 6567 else { 6568 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6569 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6570 } 6571 do { 6572 alternateASig0 = aSig0; 6573 alternateASig1 = aSig1; 6574 ++q; 6575 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6576 } while ( 0 <= (int64_t) aSig0 ); 6577 add128( 6578 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6579 if ( ( sigMean0 < 0 ) 6580 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6581 aSig0 = alternateASig0; 6582 aSig1 = alternateASig1; 6583 } 6584 zSign = ( (int64_t) aSig0 < 0 ); 6585 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6586 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6587 status); 6588 } 6589 6590 /*---------------------------------------------------------------------------- 6591 | Returns the square root of the quadruple-precision floating-point value `a'. 6592 | The operation is performed according to the IEC/IEEE Standard for Binary 6593 | Floating-Point Arithmetic. 6594 *----------------------------------------------------------------------------*/ 6595 6596 float128 float128_sqrt(float128 a, float_status *status) 6597 { 6598 flag aSign; 6599 int32_t aExp, zExp; 6600 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6601 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6602 6603 aSig1 = extractFloat128Frac1( a ); 6604 aSig0 = extractFloat128Frac0( a ); 6605 aExp = extractFloat128Exp( a ); 6606 aSign = extractFloat128Sign( a ); 6607 if ( aExp == 0x7FFF ) { 6608 if (aSig0 | aSig1) { 6609 return propagateFloat128NaN(a, a, status); 6610 } 6611 if ( ! aSign ) return a; 6612 goto invalid; 6613 } 6614 if ( aSign ) { 6615 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6616 invalid: 6617 float_raise(float_flag_invalid, status); 6618 return float128_default_nan(status); 6619 } 6620 if ( aExp == 0 ) { 6621 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6622 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6623 } 6624 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6625 aSig0 |= LIT64( 0x0001000000000000 ); 6626 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6627 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6628 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6629 doubleZSig0 = zSig0<<1; 6630 mul64To128( zSig0, zSig0, &term0, &term1 ); 6631 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6632 while ( (int64_t) rem0 < 0 ) { 6633 --zSig0; 6634 doubleZSig0 -= 2; 6635 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6636 } 6637 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6638 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6639 if ( zSig1 == 0 ) zSig1 = 1; 6640 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6641 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6642 mul64To128( zSig1, zSig1, &term2, &term3 ); 6643 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6644 while ( (int64_t) rem1 < 0 ) { 6645 --zSig1; 6646 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6647 term3 |= 1; 6648 term2 |= doubleZSig0; 6649 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6650 } 6651 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6652 } 6653 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6654 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6655 6656 } 6657 6658 /*---------------------------------------------------------------------------- 6659 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6660 | the corresponding value `b', and 0 otherwise. The invalid exception is 6661 | raised if either operand is a NaN. Otherwise, the comparison is performed 6662 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6663 *----------------------------------------------------------------------------*/ 6664 6665 int float128_eq(float128 a, float128 b, float_status *status) 6666 { 6667 6668 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6669 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6670 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6671 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6672 ) { 6673 float_raise(float_flag_invalid, status); 6674 return 0; 6675 } 6676 return 6677 ( a.low == b.low ) 6678 && ( ( a.high == b.high ) 6679 || ( ( a.low == 0 ) 6680 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6681 ); 6682 6683 } 6684 6685 /*---------------------------------------------------------------------------- 6686 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6687 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6688 | exception is raised if either operand is a NaN. The comparison is performed 6689 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6690 *----------------------------------------------------------------------------*/ 6691 6692 int float128_le(float128 a, float128 b, float_status *status) 6693 { 6694 flag aSign, bSign; 6695 6696 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6697 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6698 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6699 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6700 ) { 6701 float_raise(float_flag_invalid, status); 6702 return 0; 6703 } 6704 aSign = extractFloat128Sign( a ); 6705 bSign = extractFloat128Sign( b ); 6706 if ( aSign != bSign ) { 6707 return 6708 aSign 6709 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6710 == 0 ); 6711 } 6712 return 6713 aSign ? le128( b.high, b.low, a.high, a.low ) 6714 : le128( a.high, a.low, b.high, b.low ); 6715 6716 } 6717 6718 /*---------------------------------------------------------------------------- 6719 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6720 | the corresponding value `b', and 0 otherwise. The invalid exception is 6721 | raised if either operand is a NaN. The comparison is performed according 6722 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6723 *----------------------------------------------------------------------------*/ 6724 6725 int float128_lt(float128 a, float128 b, float_status *status) 6726 { 6727 flag aSign, bSign; 6728 6729 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6730 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6731 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6732 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6733 ) { 6734 float_raise(float_flag_invalid, status); 6735 return 0; 6736 } 6737 aSign = extractFloat128Sign( a ); 6738 bSign = extractFloat128Sign( b ); 6739 if ( aSign != bSign ) { 6740 return 6741 aSign 6742 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6743 != 0 ); 6744 } 6745 return 6746 aSign ? lt128( b.high, b.low, a.high, a.low ) 6747 : lt128( a.high, a.low, b.high, b.low ); 6748 6749 } 6750 6751 /*---------------------------------------------------------------------------- 6752 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6753 | be compared, and 0 otherwise. The invalid exception is raised if either 6754 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6755 | Standard for Binary Floating-Point Arithmetic. 6756 *----------------------------------------------------------------------------*/ 6757 6758 int float128_unordered(float128 a, float128 b, float_status *status) 6759 { 6760 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6761 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6762 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6763 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6764 ) { 6765 float_raise(float_flag_invalid, status); 6766 return 1; 6767 } 6768 return 0; 6769 } 6770 6771 /*---------------------------------------------------------------------------- 6772 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6773 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6774 | exception. The comparison is performed according to the IEC/IEEE Standard 6775 | for Binary Floating-Point Arithmetic. 6776 *----------------------------------------------------------------------------*/ 6777 6778 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6779 { 6780 6781 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6782 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6783 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6784 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6785 ) { 6786 if (float128_is_signaling_nan(a, status) 6787 || float128_is_signaling_nan(b, status)) { 6788 float_raise(float_flag_invalid, status); 6789 } 6790 return 0; 6791 } 6792 return 6793 ( a.low == b.low ) 6794 && ( ( a.high == b.high ) 6795 || ( ( a.low == 0 ) 6796 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6797 ); 6798 6799 } 6800 6801 /*---------------------------------------------------------------------------- 6802 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6803 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6804 | cause an exception. Otherwise, the comparison is performed according to the 6805 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6806 *----------------------------------------------------------------------------*/ 6807 6808 int float128_le_quiet(float128 a, float128 b, float_status *status) 6809 { 6810 flag aSign, bSign; 6811 6812 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6813 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6814 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6815 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6816 ) { 6817 if (float128_is_signaling_nan(a, status) 6818 || float128_is_signaling_nan(b, status)) { 6819 float_raise(float_flag_invalid, status); 6820 } 6821 return 0; 6822 } 6823 aSign = extractFloat128Sign( a ); 6824 bSign = extractFloat128Sign( b ); 6825 if ( aSign != bSign ) { 6826 return 6827 aSign 6828 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6829 == 0 ); 6830 } 6831 return 6832 aSign ? le128( b.high, b.low, a.high, a.low ) 6833 : le128( a.high, a.low, b.high, b.low ); 6834 6835 } 6836 6837 /*---------------------------------------------------------------------------- 6838 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6839 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6840 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6841 | Standard for Binary Floating-Point Arithmetic. 6842 *----------------------------------------------------------------------------*/ 6843 6844 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6845 { 6846 flag aSign, bSign; 6847 6848 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6849 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6850 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6851 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6852 ) { 6853 if (float128_is_signaling_nan(a, status) 6854 || float128_is_signaling_nan(b, status)) { 6855 float_raise(float_flag_invalid, status); 6856 } 6857 return 0; 6858 } 6859 aSign = extractFloat128Sign( a ); 6860 bSign = extractFloat128Sign( b ); 6861 if ( aSign != bSign ) { 6862 return 6863 aSign 6864 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6865 != 0 ); 6866 } 6867 return 6868 aSign ? lt128( b.high, b.low, a.high, a.low ) 6869 : lt128( a.high, a.low, b.high, b.low ); 6870 6871 } 6872 6873 /*---------------------------------------------------------------------------- 6874 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6875 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6876 | comparison is performed according to the IEC/IEEE Standard for Binary 6877 | Floating-Point Arithmetic. 6878 *----------------------------------------------------------------------------*/ 6879 6880 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6881 { 6882 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6883 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6884 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6885 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6886 ) { 6887 if (float128_is_signaling_nan(a, status) 6888 || float128_is_signaling_nan(b, status)) { 6889 float_raise(float_flag_invalid, status); 6890 } 6891 return 1; 6892 } 6893 return 0; 6894 } 6895 6896 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6897 int is_quiet, float_status *status) 6898 { 6899 flag aSign, bSign; 6900 6901 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6902 float_raise(float_flag_invalid, status); 6903 return float_relation_unordered; 6904 } 6905 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6906 ( extractFloatx80Frac( a )<<1 ) ) || 6907 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6908 ( extractFloatx80Frac( b )<<1 ) )) { 6909 if (!is_quiet || 6910 floatx80_is_signaling_nan(a, status) || 6911 floatx80_is_signaling_nan(b, status)) { 6912 float_raise(float_flag_invalid, status); 6913 } 6914 return float_relation_unordered; 6915 } 6916 aSign = extractFloatx80Sign( a ); 6917 bSign = extractFloatx80Sign( b ); 6918 if ( aSign != bSign ) { 6919 6920 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6921 ( ( a.low | b.low ) == 0 ) ) { 6922 /* zero case */ 6923 return float_relation_equal; 6924 } else { 6925 return 1 - (2 * aSign); 6926 } 6927 } else { 6928 if (a.low == b.low && a.high == b.high) { 6929 return float_relation_equal; 6930 } else { 6931 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6932 } 6933 } 6934 } 6935 6936 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6937 { 6938 return floatx80_compare_internal(a, b, 0, status); 6939 } 6940 6941 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6942 { 6943 return floatx80_compare_internal(a, b, 1, status); 6944 } 6945 6946 static inline int float128_compare_internal(float128 a, float128 b, 6947 int is_quiet, float_status *status) 6948 { 6949 flag aSign, bSign; 6950 6951 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6952 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6953 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6954 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6955 if (!is_quiet || 6956 float128_is_signaling_nan(a, status) || 6957 float128_is_signaling_nan(b, status)) { 6958 float_raise(float_flag_invalid, status); 6959 } 6960 return float_relation_unordered; 6961 } 6962 aSign = extractFloat128Sign( a ); 6963 bSign = extractFloat128Sign( b ); 6964 if ( aSign != bSign ) { 6965 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6966 /* zero case */ 6967 return float_relation_equal; 6968 } else { 6969 return 1 - (2 * aSign); 6970 } 6971 } else { 6972 if (a.low == b.low && a.high == b.high) { 6973 return float_relation_equal; 6974 } else { 6975 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6976 } 6977 } 6978 } 6979 6980 int float128_compare(float128 a, float128 b, float_status *status) 6981 { 6982 return float128_compare_internal(a, b, 0, status); 6983 } 6984 6985 int float128_compare_quiet(float128 a, float128 b, float_status *status) 6986 { 6987 return float128_compare_internal(a, b, 1, status); 6988 } 6989 6990 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 6991 { 6992 flag aSign; 6993 int32_t aExp; 6994 uint64_t aSig; 6995 6996 if (floatx80_invalid_encoding(a)) { 6997 float_raise(float_flag_invalid, status); 6998 return floatx80_default_nan(status); 6999 } 7000 aSig = extractFloatx80Frac( a ); 7001 aExp = extractFloatx80Exp( a ); 7002 aSign = extractFloatx80Sign( a ); 7003 7004 if ( aExp == 0x7FFF ) { 7005 if ( aSig<<1 ) { 7006 return propagateFloatx80NaN(a, a, status); 7007 } 7008 return a; 7009 } 7010 7011 if (aExp == 0) { 7012 if (aSig == 0) { 7013 return a; 7014 } 7015 aExp++; 7016 } 7017 7018 if (n > 0x10000) { 7019 n = 0x10000; 7020 } else if (n < -0x10000) { 7021 n = -0x10000; 7022 } 7023 7024 aExp += n; 7025 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7026 aSign, aExp, aSig, 0, status); 7027 } 7028 7029 float128 float128_scalbn(float128 a, int n, float_status *status) 7030 { 7031 flag aSign; 7032 int32_t aExp; 7033 uint64_t aSig0, aSig1; 7034 7035 aSig1 = extractFloat128Frac1( a ); 7036 aSig0 = extractFloat128Frac0( a ); 7037 aExp = extractFloat128Exp( a ); 7038 aSign = extractFloat128Sign( a ); 7039 if ( aExp == 0x7FFF ) { 7040 if ( aSig0 | aSig1 ) { 7041 return propagateFloat128NaN(a, a, status); 7042 } 7043 return a; 7044 } 7045 if (aExp != 0) { 7046 aSig0 |= LIT64( 0x0001000000000000 ); 7047 } else if (aSig0 == 0 && aSig1 == 0) { 7048 return a; 7049 } else { 7050 aExp++; 7051 } 7052 7053 if (n > 0x10000) { 7054 n = 0x10000; 7055 } else if (n < -0x10000) { 7056 n = -0x10000; 7057 } 7058 7059 aExp += n - 1; 7060 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7061 , status); 7062 7063 } 7064