1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * Returns the result of converting the unsigned integer `a' to the 1593 * floating-point format. The conversion is performed according to the 1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1595 */ 1596 1597 static FloatParts uint_to_float(uint64_t a, float_status *status) 1598 { 1599 FloatParts r = { .sign = false}; 1600 1601 if (a == 0) { 1602 r.cls = float_class_zero; 1603 } else { 1604 int spare_bits = clz64(a) - 1; 1605 r.cls = float_class_normal; 1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1607 if (spare_bits < 0) { 1608 shift64RightJamming(a, -spare_bits, &a); 1609 r.frac = a; 1610 } else { 1611 r.frac = a << spare_bits; 1612 } 1613 } 1614 1615 return r; 1616 } 1617 1618 float16 uint64_to_float16(uint64_t a, float_status *status) 1619 { 1620 FloatParts pa = uint_to_float(a, status); 1621 return float16_round_pack_canonical(pa, status); 1622 } 1623 1624 float16 uint32_to_float16(uint32_t a, float_status *status) 1625 { 1626 return uint64_to_float16(a, status); 1627 } 1628 1629 float16 uint16_to_float16(uint16_t a, float_status *status) 1630 { 1631 return uint64_to_float16(a, status); 1632 } 1633 1634 float32 uint64_to_float32(uint64_t a, float_status *status) 1635 { 1636 FloatParts pa = uint_to_float(a, status); 1637 return float32_round_pack_canonical(pa, status); 1638 } 1639 1640 float32 uint32_to_float32(uint32_t a, float_status *status) 1641 { 1642 return uint64_to_float32(a, status); 1643 } 1644 1645 float32 uint16_to_float32(uint16_t a, float_status *status) 1646 { 1647 return uint64_to_float32(a, status); 1648 } 1649 1650 float64 uint64_to_float64(uint64_t a, float_status *status) 1651 { 1652 FloatParts pa = uint_to_float(a, status); 1653 return float64_round_pack_canonical(pa, status); 1654 } 1655 1656 float64 uint32_to_float64(uint32_t a, float_status *status) 1657 { 1658 return uint64_to_float64(a, status); 1659 } 1660 1661 float64 uint16_to_float64(uint16_t a, float_status *status) 1662 { 1663 return uint64_to_float64(a, status); 1664 } 1665 1666 /* Float Min/Max */ 1667 /* min() and max() functions. These can't be implemented as 1668 * 'compare and pick one input' because that would mishandle 1669 * NaNs and +0 vs -0. 1670 * 1671 * minnum() and maxnum() functions. These are similar to the min() 1672 * and max() functions but if one of the arguments is a QNaN and 1673 * the other is numerical then the numerical argument is returned. 1674 * SNaNs will get quietened before being returned. 1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1676 * and maxNum() operations. min() and max() are the typical min/max 1677 * semantics provided by many CPUs which predate that specification. 1678 * 1679 * minnummag() and maxnummag() functions correspond to minNumMag() 1680 * and minNumMag() from the IEEE-754 2008. 1681 */ 1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1683 bool ieee, bool ismag, float_status *s) 1684 { 1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1686 if (ieee) { 1687 /* Takes two floating-point values `a' and `b', one of 1688 * which is a NaN, and returns the appropriate NaN 1689 * result. If either `a' or `b' is a signaling NaN, 1690 * the invalid exception is raised. 1691 */ 1692 if (is_snan(a.cls) || is_snan(b.cls)) { 1693 return pick_nan(a, b, s); 1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1695 return b; 1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1697 return a; 1698 } 1699 } 1700 return pick_nan(a, b, s); 1701 } else { 1702 int a_exp, b_exp; 1703 bool a_sign, b_sign; 1704 1705 switch (a.cls) { 1706 case float_class_normal: 1707 a_exp = a.exp; 1708 break; 1709 case float_class_inf: 1710 a_exp = INT_MAX; 1711 break; 1712 case float_class_zero: 1713 a_exp = INT_MIN; 1714 break; 1715 default: 1716 g_assert_not_reached(); 1717 break; 1718 } 1719 switch (b.cls) { 1720 case float_class_normal: 1721 b_exp = b.exp; 1722 break; 1723 case float_class_inf: 1724 b_exp = INT_MAX; 1725 break; 1726 case float_class_zero: 1727 b_exp = INT_MIN; 1728 break; 1729 default: 1730 g_assert_not_reached(); 1731 break; 1732 } 1733 1734 a_sign = a.sign; 1735 b_sign = b.sign; 1736 if (ismag) { 1737 a_sign = b_sign = 0; 1738 } 1739 1740 if (a_sign == b_sign) { 1741 bool a_less = a_exp < b_exp; 1742 if (a_exp == b_exp) { 1743 a_less = a.frac < b.frac; 1744 } 1745 return a_sign ^ a_less ^ ismin ? b : a; 1746 } else { 1747 return a_sign ^ ismin ? b : a; 1748 } 1749 } 1750 } 1751 1752 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1754 float_status *s) \ 1755 { \ 1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1759 \ 1760 return float ## sz ## _round_pack_canonical(pr, s); \ 1761 } 1762 1763 MINMAX(16, min, true, false, false) 1764 MINMAX(16, minnum, true, true, false) 1765 MINMAX(16, minnummag, true, true, true) 1766 MINMAX(16, max, false, false, false) 1767 MINMAX(16, maxnum, false, true, false) 1768 MINMAX(16, maxnummag, false, true, true) 1769 1770 MINMAX(32, min, true, false, false) 1771 MINMAX(32, minnum, true, true, false) 1772 MINMAX(32, minnummag, true, true, true) 1773 MINMAX(32, max, false, false, false) 1774 MINMAX(32, maxnum, false, true, false) 1775 MINMAX(32, maxnummag, false, true, true) 1776 1777 MINMAX(64, min, true, false, false) 1778 MINMAX(64, minnum, true, true, false) 1779 MINMAX(64, minnummag, true, true, true) 1780 MINMAX(64, max, false, false, false) 1781 MINMAX(64, maxnum, false, true, false) 1782 MINMAX(64, maxnummag, false, true, true) 1783 1784 #undef MINMAX 1785 1786 /* Floating point compare */ 1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1788 float_status *s) 1789 { 1790 if (is_nan(a.cls) || is_nan(b.cls)) { 1791 if (!is_quiet || 1792 a.cls == float_class_snan || 1793 b.cls == float_class_snan) { 1794 s->float_exception_flags |= float_flag_invalid; 1795 } 1796 return float_relation_unordered; 1797 } 1798 1799 if (a.cls == float_class_zero) { 1800 if (b.cls == float_class_zero) { 1801 return float_relation_equal; 1802 } 1803 return b.sign ? float_relation_greater : float_relation_less; 1804 } else if (b.cls == float_class_zero) { 1805 return a.sign ? float_relation_less : float_relation_greater; 1806 } 1807 1808 /* The only really important thing about infinity is its sign. If 1809 * both are infinities the sign marks the smallest of the two. 1810 */ 1811 if (a.cls == float_class_inf) { 1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1813 return float_relation_equal; 1814 } 1815 return a.sign ? float_relation_less : float_relation_greater; 1816 } else if (b.cls == float_class_inf) { 1817 return b.sign ? float_relation_greater : float_relation_less; 1818 } 1819 1820 if (a.sign != b.sign) { 1821 return a.sign ? float_relation_less : float_relation_greater; 1822 } 1823 1824 if (a.exp == b.exp) { 1825 if (a.frac == b.frac) { 1826 return float_relation_equal; 1827 } 1828 if (a.sign) { 1829 return a.frac > b.frac ? 1830 float_relation_less : float_relation_greater; 1831 } else { 1832 return a.frac > b.frac ? 1833 float_relation_greater : float_relation_less; 1834 } 1835 } else { 1836 if (a.sign) { 1837 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1838 } else { 1839 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1840 } 1841 } 1842 } 1843 1844 #define COMPARE(sz) \ 1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1846 float_status *s) \ 1847 { \ 1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1850 return compare_floats(pa, pb, false, s); \ 1851 } \ 1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1853 float_status *s) \ 1854 { \ 1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1857 return compare_floats(pa, pb, true, s); \ 1858 } 1859 1860 COMPARE(16) 1861 COMPARE(32) 1862 COMPARE(64) 1863 1864 #undef COMPARE 1865 1866 /* Multiply A by 2 raised to the power N. */ 1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1868 { 1869 if (unlikely(is_nan(a.cls))) { 1870 return return_nan(a, s); 1871 } 1872 if (a.cls == float_class_normal) { 1873 a.exp += n; 1874 } 1875 return a; 1876 } 1877 1878 float16 float16_scalbn(float16 a, int n, float_status *status) 1879 { 1880 FloatParts pa = float16_unpack_canonical(a, status); 1881 FloatParts pr = scalbn_decomposed(pa, n, status); 1882 return float16_round_pack_canonical(pr, status); 1883 } 1884 1885 float32 float32_scalbn(float32 a, int n, float_status *status) 1886 { 1887 FloatParts pa = float32_unpack_canonical(a, status); 1888 FloatParts pr = scalbn_decomposed(pa, n, status); 1889 return float32_round_pack_canonical(pr, status); 1890 } 1891 1892 float64 float64_scalbn(float64 a, int n, float_status *status) 1893 { 1894 FloatParts pa = float64_unpack_canonical(a, status); 1895 FloatParts pr = scalbn_decomposed(pa, n, status); 1896 return float64_round_pack_canonical(pr, status); 1897 } 1898 1899 /* 1900 * Square Root 1901 * 1902 * The old softfloat code did an approximation step before zeroing in 1903 * on the final result. However for simpleness we just compute the 1904 * square root by iterating down from the implicit bit to enough extra 1905 * bits to ensure we get a correctly rounded result. 1906 * 1907 * This does mean however the calculation is slower than before, 1908 * especially for 64 bit floats. 1909 */ 1910 1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 1912 { 1913 uint64_t a_frac, r_frac, s_frac; 1914 int bit, last_bit; 1915 1916 if (is_nan(a.cls)) { 1917 return return_nan(a, s); 1918 } 1919 if (a.cls == float_class_zero) { 1920 return a; /* sqrt(+-0) = +-0 */ 1921 } 1922 if (a.sign) { 1923 s->float_exception_flags |= float_flag_invalid; 1924 a.cls = float_class_dnan; 1925 return a; 1926 } 1927 if (a.cls == float_class_inf) { 1928 return a; /* sqrt(+inf) = +inf */ 1929 } 1930 1931 assert(a.cls == float_class_normal); 1932 1933 /* We need two overflow bits at the top. Adding room for that is a 1934 * right shift. If the exponent is odd, we can discard the low bit 1935 * by multiplying the fraction by 2; that's a left shift. Combine 1936 * those and we shift right if the exponent is even. 1937 */ 1938 a_frac = a.frac; 1939 if (!(a.exp & 1)) { 1940 a_frac >>= 1; 1941 } 1942 a.exp >>= 1; 1943 1944 /* Bit-by-bit computation of sqrt. */ 1945 r_frac = 0; 1946 s_frac = 0; 1947 1948 /* Iterate from implicit bit down to the 3 extra bits to compute a 1949 * properly rounded result. Remember we've inserted one more bit 1950 * at the top, so these positions are one less. 1951 */ 1952 bit = DECOMPOSED_BINARY_POINT - 1; 1953 last_bit = MAX(p->frac_shift - 4, 0); 1954 do { 1955 uint64_t q = 1ULL << bit; 1956 uint64_t t_frac = s_frac + q; 1957 if (t_frac <= a_frac) { 1958 s_frac = t_frac + q; 1959 a_frac -= t_frac; 1960 r_frac += q; 1961 } 1962 a_frac <<= 1; 1963 } while (--bit >= last_bit); 1964 1965 /* Undo the right shift done above. If there is any remaining 1966 * fraction, the result is inexact. Set the sticky bit. 1967 */ 1968 a.frac = (r_frac << 1) + (a_frac != 0); 1969 1970 return a; 1971 } 1972 1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 1974 { 1975 FloatParts pa = float16_unpack_canonical(a, status); 1976 FloatParts pr = sqrt_float(pa, status, &float16_params); 1977 return float16_round_pack_canonical(pr, status); 1978 } 1979 1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 1981 { 1982 FloatParts pa = float32_unpack_canonical(a, status); 1983 FloatParts pr = sqrt_float(pa, status, &float32_params); 1984 return float32_round_pack_canonical(pr, status); 1985 } 1986 1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 1988 { 1989 FloatParts pa = float64_unpack_canonical(a, status); 1990 FloatParts pr = sqrt_float(pa, status, &float64_params); 1991 return float64_round_pack_canonical(pr, status); 1992 } 1993 1994 1995 /*---------------------------------------------------------------------------- 1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1998 | input. If `zSign' is 1, the input is negated before being converted to an 1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2000 | is simply rounded to an integer, with the inexact exception raised if the 2001 | input cannot be represented exactly as an integer. However, if the fixed- 2002 | point input is too large, the invalid exception is raised and the largest 2003 | positive or negative integer is returned. 2004 *----------------------------------------------------------------------------*/ 2005 2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2007 { 2008 int8_t roundingMode; 2009 flag roundNearestEven; 2010 int8_t roundIncrement, roundBits; 2011 int32_t z; 2012 2013 roundingMode = status->float_rounding_mode; 2014 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2015 switch (roundingMode) { 2016 case float_round_nearest_even: 2017 case float_round_ties_away: 2018 roundIncrement = 0x40; 2019 break; 2020 case float_round_to_zero: 2021 roundIncrement = 0; 2022 break; 2023 case float_round_up: 2024 roundIncrement = zSign ? 0 : 0x7f; 2025 break; 2026 case float_round_down: 2027 roundIncrement = zSign ? 0x7f : 0; 2028 break; 2029 default: 2030 abort(); 2031 } 2032 roundBits = absZ & 0x7F; 2033 absZ = ( absZ + roundIncrement )>>7; 2034 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2035 z = absZ; 2036 if ( zSign ) z = - z; 2037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2038 float_raise(float_flag_invalid, status); 2039 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2040 } 2041 if (roundBits) { 2042 status->float_exception_flags |= float_flag_inexact; 2043 } 2044 return z; 2045 2046 } 2047 2048 /*---------------------------------------------------------------------------- 2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2050 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2051 | and returns the properly rounded 64-bit integer corresponding to the input. 2052 | If `zSign' is 1, the input is negated before being converted to an integer. 2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2054 | the inexact exception raised if the input cannot be represented exactly as 2055 | an integer. However, if the fixed-point input is too large, the invalid 2056 | exception is raised and the largest positive or negative integer is 2057 | returned. 2058 *----------------------------------------------------------------------------*/ 2059 2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2061 float_status *status) 2062 { 2063 int8_t roundingMode; 2064 flag roundNearestEven, increment; 2065 int64_t z; 2066 2067 roundingMode = status->float_rounding_mode; 2068 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2069 switch (roundingMode) { 2070 case float_round_nearest_even: 2071 case float_round_ties_away: 2072 increment = ((int64_t) absZ1 < 0); 2073 break; 2074 case float_round_to_zero: 2075 increment = 0; 2076 break; 2077 case float_round_up: 2078 increment = !zSign && absZ1; 2079 break; 2080 case float_round_down: 2081 increment = zSign && absZ1; 2082 break; 2083 default: 2084 abort(); 2085 } 2086 if ( increment ) { 2087 ++absZ0; 2088 if ( absZ0 == 0 ) goto overflow; 2089 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2090 } 2091 z = absZ0; 2092 if ( zSign ) z = - z; 2093 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2094 overflow: 2095 float_raise(float_flag_invalid, status); 2096 return 2097 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2098 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2099 } 2100 if (absZ1) { 2101 status->float_exception_flags |= float_flag_inexact; 2102 } 2103 return z; 2104 2105 } 2106 2107 /*---------------------------------------------------------------------------- 2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2109 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2112 | with the inexact exception raised if the input cannot be represented exactly 2113 | as an integer. However, if the fixed-point input is too large, the invalid 2114 | exception is raised and the largest unsigned integer is returned. 2115 *----------------------------------------------------------------------------*/ 2116 2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2118 uint64_t absZ1, float_status *status) 2119 { 2120 int8_t roundingMode; 2121 flag roundNearestEven, increment; 2122 2123 roundingMode = status->float_rounding_mode; 2124 roundNearestEven = (roundingMode == float_round_nearest_even); 2125 switch (roundingMode) { 2126 case float_round_nearest_even: 2127 case float_round_ties_away: 2128 increment = ((int64_t)absZ1 < 0); 2129 break; 2130 case float_round_to_zero: 2131 increment = 0; 2132 break; 2133 case float_round_up: 2134 increment = !zSign && absZ1; 2135 break; 2136 case float_round_down: 2137 increment = zSign && absZ1; 2138 break; 2139 default: 2140 abort(); 2141 } 2142 if (increment) { 2143 ++absZ0; 2144 if (absZ0 == 0) { 2145 float_raise(float_flag_invalid, status); 2146 return LIT64(0xFFFFFFFFFFFFFFFF); 2147 } 2148 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2149 } 2150 2151 if (zSign && absZ0) { 2152 float_raise(float_flag_invalid, status); 2153 return 0; 2154 } 2155 2156 if (absZ1) { 2157 status->float_exception_flags |= float_flag_inexact; 2158 } 2159 return absZ0; 2160 } 2161 2162 /*---------------------------------------------------------------------------- 2163 | If `a' is denormal and we are in flush-to-zero mode then set the 2164 | input-denormal exception and return zero. Otherwise just return the value. 2165 *----------------------------------------------------------------------------*/ 2166 float32 float32_squash_input_denormal(float32 a, float_status *status) 2167 { 2168 if (status->flush_inputs_to_zero) { 2169 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2170 float_raise(float_flag_input_denormal, status); 2171 return make_float32(float32_val(a) & 0x80000000); 2172 } 2173 } 2174 return a; 2175 } 2176 2177 /*---------------------------------------------------------------------------- 2178 | Normalizes the subnormal single-precision floating-point value represented 2179 | by the denormalized significand `aSig'. The normalized exponent and 2180 | significand are stored at the locations pointed to by `zExpPtr' and 2181 | `zSigPtr', respectively. 2182 *----------------------------------------------------------------------------*/ 2183 2184 static void 2185 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2186 { 2187 int8_t shiftCount; 2188 2189 shiftCount = countLeadingZeros32( aSig ) - 8; 2190 *zSigPtr = aSig<<shiftCount; 2191 *zExpPtr = 1 - shiftCount; 2192 2193 } 2194 2195 /*---------------------------------------------------------------------------- 2196 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2197 | and significand `zSig', and returns the proper single-precision floating- 2198 | point value corresponding to the abstract input. Ordinarily, the abstract 2199 | value is simply rounded and packed into the single-precision format, with 2200 | the inexact exception raised if the abstract input cannot be represented 2201 | exactly. However, if the abstract value is too large, the overflow and 2202 | inexact exceptions are raised and an infinity or maximal finite value is 2203 | returned. If the abstract value is too small, the input value is rounded to 2204 | a subnormal number, and the underflow and inexact exceptions are raised if 2205 | the abstract input cannot be represented exactly as a subnormal single- 2206 | precision floating-point number. 2207 | The input significand `zSig' has its binary point between bits 30 2208 | and 29, which is 7 bits to the left of the usual location. This shifted 2209 | significand must be normalized or smaller. If `zSig' is not normalized, 2210 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2211 | and it must not require rounding. In the usual case that `zSig' is 2212 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2213 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2214 | Binary Floating-Point Arithmetic. 2215 *----------------------------------------------------------------------------*/ 2216 2217 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2218 float_status *status) 2219 { 2220 int8_t roundingMode; 2221 flag roundNearestEven; 2222 int8_t roundIncrement, roundBits; 2223 flag isTiny; 2224 2225 roundingMode = status->float_rounding_mode; 2226 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2227 switch (roundingMode) { 2228 case float_round_nearest_even: 2229 case float_round_ties_away: 2230 roundIncrement = 0x40; 2231 break; 2232 case float_round_to_zero: 2233 roundIncrement = 0; 2234 break; 2235 case float_round_up: 2236 roundIncrement = zSign ? 0 : 0x7f; 2237 break; 2238 case float_round_down: 2239 roundIncrement = zSign ? 0x7f : 0; 2240 break; 2241 default: 2242 abort(); 2243 break; 2244 } 2245 roundBits = zSig & 0x7F; 2246 if ( 0xFD <= (uint16_t) zExp ) { 2247 if ( ( 0xFD < zExp ) 2248 || ( ( zExp == 0xFD ) 2249 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2250 ) { 2251 float_raise(float_flag_overflow | float_flag_inexact, status); 2252 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2253 } 2254 if ( zExp < 0 ) { 2255 if (status->flush_to_zero) { 2256 float_raise(float_flag_output_denormal, status); 2257 return packFloat32(zSign, 0, 0); 2258 } 2259 isTiny = 2260 (status->float_detect_tininess 2261 == float_tininess_before_rounding) 2262 || ( zExp < -1 ) 2263 || ( zSig + roundIncrement < 0x80000000 ); 2264 shift32RightJamming( zSig, - zExp, &zSig ); 2265 zExp = 0; 2266 roundBits = zSig & 0x7F; 2267 if (isTiny && roundBits) { 2268 float_raise(float_flag_underflow, status); 2269 } 2270 } 2271 } 2272 if (roundBits) { 2273 status->float_exception_flags |= float_flag_inexact; 2274 } 2275 zSig = ( zSig + roundIncrement )>>7; 2276 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2277 if ( zSig == 0 ) zExp = 0; 2278 return packFloat32( zSign, zExp, zSig ); 2279 2280 } 2281 2282 /*---------------------------------------------------------------------------- 2283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2284 | and significand `zSig', and returns the proper single-precision floating- 2285 | point value corresponding to the abstract input. This routine is just like 2286 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2287 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2288 | floating-point exponent. 2289 *----------------------------------------------------------------------------*/ 2290 2291 static float32 2292 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2293 float_status *status) 2294 { 2295 int8_t shiftCount; 2296 2297 shiftCount = countLeadingZeros32( zSig ) - 1; 2298 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2299 status); 2300 2301 } 2302 2303 /*---------------------------------------------------------------------------- 2304 | If `a' is denormal and we are in flush-to-zero mode then set the 2305 | input-denormal exception and return zero. Otherwise just return the value. 2306 *----------------------------------------------------------------------------*/ 2307 float64 float64_squash_input_denormal(float64 a, float_status *status) 2308 { 2309 if (status->flush_inputs_to_zero) { 2310 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2311 float_raise(float_flag_input_denormal, status); 2312 return make_float64(float64_val(a) & (1ULL << 63)); 2313 } 2314 } 2315 return a; 2316 } 2317 2318 /*---------------------------------------------------------------------------- 2319 | Normalizes the subnormal double-precision floating-point value represented 2320 | by the denormalized significand `aSig'. The normalized exponent and 2321 | significand are stored at the locations pointed to by `zExpPtr' and 2322 | `zSigPtr', respectively. 2323 *----------------------------------------------------------------------------*/ 2324 2325 static void 2326 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2327 { 2328 int8_t shiftCount; 2329 2330 shiftCount = countLeadingZeros64( aSig ) - 11; 2331 *zSigPtr = aSig<<shiftCount; 2332 *zExpPtr = 1 - shiftCount; 2333 2334 } 2335 2336 /*---------------------------------------------------------------------------- 2337 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2338 | double-precision floating-point value, returning the result. After being 2339 | shifted into the proper positions, the three fields are simply added 2340 | together to form the result. This means that any integer portion of `zSig' 2341 | will be added into the exponent. Since a properly normalized significand 2342 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2343 | than the desired result exponent whenever `zSig' is a complete, normalized 2344 | significand. 2345 *----------------------------------------------------------------------------*/ 2346 2347 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2348 { 2349 2350 return make_float64( 2351 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2352 2353 } 2354 2355 /*---------------------------------------------------------------------------- 2356 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2357 | and significand `zSig', and returns the proper double-precision floating- 2358 | point value corresponding to the abstract input. Ordinarily, the abstract 2359 | value is simply rounded and packed into the double-precision format, with 2360 | the inexact exception raised if the abstract input cannot be represented 2361 | exactly. However, if the abstract value is too large, the overflow and 2362 | inexact exceptions are raised and an infinity or maximal finite value is 2363 | returned. If the abstract value is too small, the input value is rounded to 2364 | a subnormal number, and the underflow and inexact exceptions are raised if 2365 | the abstract input cannot be represented exactly as a subnormal double- 2366 | precision floating-point number. 2367 | The input significand `zSig' has its binary point between bits 62 2368 | and 61, which is 10 bits to the left of the usual location. This shifted 2369 | significand must be normalized or smaller. If `zSig' is not normalized, 2370 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2371 | and it must not require rounding. In the usual case that `zSig' is 2372 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2373 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2374 | Binary Floating-Point Arithmetic. 2375 *----------------------------------------------------------------------------*/ 2376 2377 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2378 float_status *status) 2379 { 2380 int8_t roundingMode; 2381 flag roundNearestEven; 2382 int roundIncrement, roundBits; 2383 flag isTiny; 2384 2385 roundingMode = status->float_rounding_mode; 2386 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2387 switch (roundingMode) { 2388 case float_round_nearest_even: 2389 case float_round_ties_away: 2390 roundIncrement = 0x200; 2391 break; 2392 case float_round_to_zero: 2393 roundIncrement = 0; 2394 break; 2395 case float_round_up: 2396 roundIncrement = zSign ? 0 : 0x3ff; 2397 break; 2398 case float_round_down: 2399 roundIncrement = zSign ? 0x3ff : 0; 2400 break; 2401 case float_round_to_odd: 2402 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2403 break; 2404 default: 2405 abort(); 2406 } 2407 roundBits = zSig & 0x3FF; 2408 if ( 0x7FD <= (uint16_t) zExp ) { 2409 if ( ( 0x7FD < zExp ) 2410 || ( ( zExp == 0x7FD ) 2411 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2412 ) { 2413 bool overflow_to_inf = roundingMode != float_round_to_odd && 2414 roundIncrement != 0; 2415 float_raise(float_flag_overflow | float_flag_inexact, status); 2416 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2417 } 2418 if ( zExp < 0 ) { 2419 if (status->flush_to_zero) { 2420 float_raise(float_flag_output_denormal, status); 2421 return packFloat64(zSign, 0, 0); 2422 } 2423 isTiny = 2424 (status->float_detect_tininess 2425 == float_tininess_before_rounding) 2426 || ( zExp < -1 ) 2427 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2428 shift64RightJamming( zSig, - zExp, &zSig ); 2429 zExp = 0; 2430 roundBits = zSig & 0x3FF; 2431 if (isTiny && roundBits) { 2432 float_raise(float_flag_underflow, status); 2433 } 2434 if (roundingMode == float_round_to_odd) { 2435 /* 2436 * For round-to-odd case, the roundIncrement depends on 2437 * zSig which just changed. 2438 */ 2439 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2440 } 2441 } 2442 } 2443 if (roundBits) { 2444 status->float_exception_flags |= float_flag_inexact; 2445 } 2446 zSig = ( zSig + roundIncrement )>>10; 2447 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2448 if ( zSig == 0 ) zExp = 0; 2449 return packFloat64( zSign, zExp, zSig ); 2450 2451 } 2452 2453 /*---------------------------------------------------------------------------- 2454 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2455 | and significand `zSig', and returns the proper double-precision floating- 2456 | point value corresponding to the abstract input. This routine is just like 2457 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2458 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2459 | floating-point exponent. 2460 *----------------------------------------------------------------------------*/ 2461 2462 static float64 2463 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2464 float_status *status) 2465 { 2466 int8_t shiftCount; 2467 2468 shiftCount = countLeadingZeros64( zSig ) - 1; 2469 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2470 status); 2471 2472 } 2473 2474 /*---------------------------------------------------------------------------- 2475 | Normalizes the subnormal extended double-precision floating-point value 2476 | represented by the denormalized significand `aSig'. The normalized exponent 2477 | and significand are stored at the locations pointed to by `zExpPtr' and 2478 | `zSigPtr', respectively. 2479 *----------------------------------------------------------------------------*/ 2480 2481 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2482 uint64_t *zSigPtr) 2483 { 2484 int8_t shiftCount; 2485 2486 shiftCount = countLeadingZeros64( aSig ); 2487 *zSigPtr = aSig<<shiftCount; 2488 *zExpPtr = 1 - shiftCount; 2489 } 2490 2491 /*---------------------------------------------------------------------------- 2492 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2493 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2494 | and returns the proper extended double-precision floating-point value 2495 | corresponding to the abstract input. Ordinarily, the abstract value is 2496 | rounded and packed into the extended double-precision format, with the 2497 | inexact exception raised if the abstract input cannot be represented 2498 | exactly. However, if the abstract value is too large, the overflow and 2499 | inexact exceptions are raised and an infinity or maximal finite value is 2500 | returned. If the abstract value is too small, the input value is rounded to 2501 | a subnormal number, and the underflow and inexact exceptions are raised if 2502 | the abstract input cannot be represented exactly as a subnormal extended 2503 | double-precision floating-point number. 2504 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2505 | number of bits as single or double precision, respectively. Otherwise, the 2506 | result is rounded to the full precision of the extended double-precision 2507 | format. 2508 | The input significand must be normalized or smaller. If the input 2509 | significand is not normalized, `zExp' must be 0; in that case, the result 2510 | returned is a subnormal number, and it must not require rounding. The 2511 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2512 | Floating-Point Arithmetic. 2513 *----------------------------------------------------------------------------*/ 2514 2515 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2516 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2517 float_status *status) 2518 { 2519 int8_t roundingMode; 2520 flag roundNearestEven, increment, isTiny; 2521 int64_t roundIncrement, roundMask, roundBits; 2522 2523 roundingMode = status->float_rounding_mode; 2524 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2525 if ( roundingPrecision == 80 ) goto precision80; 2526 if ( roundingPrecision == 64 ) { 2527 roundIncrement = LIT64( 0x0000000000000400 ); 2528 roundMask = LIT64( 0x00000000000007FF ); 2529 } 2530 else if ( roundingPrecision == 32 ) { 2531 roundIncrement = LIT64( 0x0000008000000000 ); 2532 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2533 } 2534 else { 2535 goto precision80; 2536 } 2537 zSig0 |= ( zSig1 != 0 ); 2538 switch (roundingMode) { 2539 case float_round_nearest_even: 2540 case float_round_ties_away: 2541 break; 2542 case float_round_to_zero: 2543 roundIncrement = 0; 2544 break; 2545 case float_round_up: 2546 roundIncrement = zSign ? 0 : roundMask; 2547 break; 2548 case float_round_down: 2549 roundIncrement = zSign ? roundMask : 0; 2550 break; 2551 default: 2552 abort(); 2553 } 2554 roundBits = zSig0 & roundMask; 2555 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2556 if ( ( 0x7FFE < zExp ) 2557 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2558 ) { 2559 goto overflow; 2560 } 2561 if ( zExp <= 0 ) { 2562 if (status->flush_to_zero) { 2563 float_raise(float_flag_output_denormal, status); 2564 return packFloatx80(zSign, 0, 0); 2565 } 2566 isTiny = 2567 (status->float_detect_tininess 2568 == float_tininess_before_rounding) 2569 || ( zExp < 0 ) 2570 || ( zSig0 <= zSig0 + roundIncrement ); 2571 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2572 zExp = 0; 2573 roundBits = zSig0 & roundMask; 2574 if (isTiny && roundBits) { 2575 float_raise(float_flag_underflow, status); 2576 } 2577 if (roundBits) { 2578 status->float_exception_flags |= float_flag_inexact; 2579 } 2580 zSig0 += roundIncrement; 2581 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2582 roundIncrement = roundMask + 1; 2583 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2584 roundMask |= roundIncrement; 2585 } 2586 zSig0 &= ~ roundMask; 2587 return packFloatx80( zSign, zExp, zSig0 ); 2588 } 2589 } 2590 if (roundBits) { 2591 status->float_exception_flags |= float_flag_inexact; 2592 } 2593 zSig0 += roundIncrement; 2594 if ( zSig0 < roundIncrement ) { 2595 ++zExp; 2596 zSig0 = LIT64( 0x8000000000000000 ); 2597 } 2598 roundIncrement = roundMask + 1; 2599 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2600 roundMask |= roundIncrement; 2601 } 2602 zSig0 &= ~ roundMask; 2603 if ( zSig0 == 0 ) zExp = 0; 2604 return packFloatx80( zSign, zExp, zSig0 ); 2605 precision80: 2606 switch (roundingMode) { 2607 case float_round_nearest_even: 2608 case float_round_ties_away: 2609 increment = ((int64_t)zSig1 < 0); 2610 break; 2611 case float_round_to_zero: 2612 increment = 0; 2613 break; 2614 case float_round_up: 2615 increment = !zSign && zSig1; 2616 break; 2617 case float_round_down: 2618 increment = zSign && zSig1; 2619 break; 2620 default: 2621 abort(); 2622 } 2623 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2624 if ( ( 0x7FFE < zExp ) 2625 || ( ( zExp == 0x7FFE ) 2626 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2627 && increment 2628 ) 2629 ) { 2630 roundMask = 0; 2631 overflow: 2632 float_raise(float_flag_overflow | float_flag_inexact, status); 2633 if ( ( roundingMode == float_round_to_zero ) 2634 || ( zSign && ( roundingMode == float_round_up ) ) 2635 || ( ! zSign && ( roundingMode == float_round_down ) ) 2636 ) { 2637 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2638 } 2639 return packFloatx80(zSign, 2640 floatx80_infinity_high, 2641 floatx80_infinity_low); 2642 } 2643 if ( zExp <= 0 ) { 2644 isTiny = 2645 (status->float_detect_tininess 2646 == float_tininess_before_rounding) 2647 || ( zExp < 0 ) 2648 || ! increment 2649 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2650 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2651 zExp = 0; 2652 if (isTiny && zSig1) { 2653 float_raise(float_flag_underflow, status); 2654 } 2655 if (zSig1) { 2656 status->float_exception_flags |= float_flag_inexact; 2657 } 2658 switch (roundingMode) { 2659 case float_round_nearest_even: 2660 case float_round_ties_away: 2661 increment = ((int64_t)zSig1 < 0); 2662 break; 2663 case float_round_to_zero: 2664 increment = 0; 2665 break; 2666 case float_round_up: 2667 increment = !zSign && zSig1; 2668 break; 2669 case float_round_down: 2670 increment = zSign && zSig1; 2671 break; 2672 default: 2673 abort(); 2674 } 2675 if ( increment ) { 2676 ++zSig0; 2677 zSig0 &= 2678 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2679 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2680 } 2681 return packFloatx80( zSign, zExp, zSig0 ); 2682 } 2683 } 2684 if (zSig1) { 2685 status->float_exception_flags |= float_flag_inexact; 2686 } 2687 if ( increment ) { 2688 ++zSig0; 2689 if ( zSig0 == 0 ) { 2690 ++zExp; 2691 zSig0 = LIT64( 0x8000000000000000 ); 2692 } 2693 else { 2694 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2695 } 2696 } 2697 else { 2698 if ( zSig0 == 0 ) zExp = 0; 2699 } 2700 return packFloatx80( zSign, zExp, zSig0 ); 2701 2702 } 2703 2704 /*---------------------------------------------------------------------------- 2705 | Takes an abstract floating-point value having sign `zSign', exponent 2706 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2707 | and returns the proper extended double-precision floating-point value 2708 | corresponding to the abstract input. This routine is just like 2709 | `roundAndPackFloatx80' except that the input significand does not have to be 2710 | normalized. 2711 *----------------------------------------------------------------------------*/ 2712 2713 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2714 flag zSign, int32_t zExp, 2715 uint64_t zSig0, uint64_t zSig1, 2716 float_status *status) 2717 { 2718 int8_t shiftCount; 2719 2720 if ( zSig0 == 0 ) { 2721 zSig0 = zSig1; 2722 zSig1 = 0; 2723 zExp -= 64; 2724 } 2725 shiftCount = countLeadingZeros64( zSig0 ); 2726 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2727 zExp -= shiftCount; 2728 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2729 zSig0, zSig1, status); 2730 2731 } 2732 2733 /*---------------------------------------------------------------------------- 2734 | Returns the least-significant 64 fraction bits of the quadruple-precision 2735 | floating-point value `a'. 2736 *----------------------------------------------------------------------------*/ 2737 2738 static inline uint64_t extractFloat128Frac1( float128 a ) 2739 { 2740 2741 return a.low; 2742 2743 } 2744 2745 /*---------------------------------------------------------------------------- 2746 | Returns the most-significant 48 fraction bits of the quadruple-precision 2747 | floating-point value `a'. 2748 *----------------------------------------------------------------------------*/ 2749 2750 static inline uint64_t extractFloat128Frac0( float128 a ) 2751 { 2752 2753 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2754 2755 } 2756 2757 /*---------------------------------------------------------------------------- 2758 | Returns the exponent bits of the quadruple-precision floating-point value 2759 | `a'. 2760 *----------------------------------------------------------------------------*/ 2761 2762 static inline int32_t extractFloat128Exp( float128 a ) 2763 { 2764 2765 return ( a.high>>48 ) & 0x7FFF; 2766 2767 } 2768 2769 /*---------------------------------------------------------------------------- 2770 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2771 *----------------------------------------------------------------------------*/ 2772 2773 static inline flag extractFloat128Sign( float128 a ) 2774 { 2775 2776 return a.high>>63; 2777 2778 } 2779 2780 /*---------------------------------------------------------------------------- 2781 | Normalizes the subnormal quadruple-precision floating-point value 2782 | represented by the denormalized significand formed by the concatenation of 2783 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2784 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2785 | significand are stored at the location pointed to by `zSig0Ptr', and the 2786 | least significant 64 bits of the normalized significand are stored at the 2787 | location pointed to by `zSig1Ptr'. 2788 *----------------------------------------------------------------------------*/ 2789 2790 static void 2791 normalizeFloat128Subnormal( 2792 uint64_t aSig0, 2793 uint64_t aSig1, 2794 int32_t *zExpPtr, 2795 uint64_t *zSig0Ptr, 2796 uint64_t *zSig1Ptr 2797 ) 2798 { 2799 int8_t shiftCount; 2800 2801 if ( aSig0 == 0 ) { 2802 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2803 if ( shiftCount < 0 ) { 2804 *zSig0Ptr = aSig1>>( - shiftCount ); 2805 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2806 } 2807 else { 2808 *zSig0Ptr = aSig1<<shiftCount; 2809 *zSig1Ptr = 0; 2810 } 2811 *zExpPtr = - shiftCount - 63; 2812 } 2813 else { 2814 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2815 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2816 *zExpPtr = 1 - shiftCount; 2817 } 2818 2819 } 2820 2821 /*---------------------------------------------------------------------------- 2822 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2823 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2824 | floating-point value, returning the result. After being shifted into the 2825 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2826 | added together to form the most significant 32 bits of the result. This 2827 | means that any integer portion of `zSig0' will be added into the exponent. 2828 | Since a properly normalized significand will have an integer portion equal 2829 | to 1, the `zExp' input should be 1 less than the desired result exponent 2830 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2831 | significand. 2832 *----------------------------------------------------------------------------*/ 2833 2834 static inline float128 2835 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2836 { 2837 float128 z; 2838 2839 z.low = zSig1; 2840 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2841 return z; 2842 2843 } 2844 2845 /*---------------------------------------------------------------------------- 2846 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2847 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2848 | and `zSig2', and returns the proper quadruple-precision floating-point value 2849 | corresponding to the abstract input. Ordinarily, the abstract value is 2850 | simply rounded and packed into the quadruple-precision format, with the 2851 | inexact exception raised if the abstract input cannot be represented 2852 | exactly. However, if the abstract value is too large, the overflow and 2853 | inexact exceptions are raised and an infinity or maximal finite value is 2854 | returned. If the abstract value is too small, the input value is rounded to 2855 | a subnormal number, and the underflow and inexact exceptions are raised if 2856 | the abstract input cannot be represented exactly as a subnormal quadruple- 2857 | precision floating-point number. 2858 | The input significand must be normalized or smaller. If the input 2859 | significand is not normalized, `zExp' must be 0; in that case, the result 2860 | returned is a subnormal number, and it must not require rounding. In the 2861 | usual case that the input significand is normalized, `zExp' must be 1 less 2862 | than the ``true'' floating-point exponent. The handling of underflow and 2863 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2864 *----------------------------------------------------------------------------*/ 2865 2866 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2867 uint64_t zSig0, uint64_t zSig1, 2868 uint64_t zSig2, float_status *status) 2869 { 2870 int8_t roundingMode; 2871 flag roundNearestEven, increment, isTiny; 2872 2873 roundingMode = status->float_rounding_mode; 2874 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2875 switch (roundingMode) { 2876 case float_round_nearest_even: 2877 case float_round_ties_away: 2878 increment = ((int64_t)zSig2 < 0); 2879 break; 2880 case float_round_to_zero: 2881 increment = 0; 2882 break; 2883 case float_round_up: 2884 increment = !zSign && zSig2; 2885 break; 2886 case float_round_down: 2887 increment = zSign && zSig2; 2888 break; 2889 case float_round_to_odd: 2890 increment = !(zSig1 & 0x1) && zSig2; 2891 break; 2892 default: 2893 abort(); 2894 } 2895 if ( 0x7FFD <= (uint32_t) zExp ) { 2896 if ( ( 0x7FFD < zExp ) 2897 || ( ( zExp == 0x7FFD ) 2898 && eq128( 2899 LIT64( 0x0001FFFFFFFFFFFF ), 2900 LIT64( 0xFFFFFFFFFFFFFFFF ), 2901 zSig0, 2902 zSig1 2903 ) 2904 && increment 2905 ) 2906 ) { 2907 float_raise(float_flag_overflow | float_flag_inexact, status); 2908 if ( ( roundingMode == float_round_to_zero ) 2909 || ( zSign && ( roundingMode == float_round_up ) ) 2910 || ( ! zSign && ( roundingMode == float_round_down ) ) 2911 || (roundingMode == float_round_to_odd) 2912 ) { 2913 return 2914 packFloat128( 2915 zSign, 2916 0x7FFE, 2917 LIT64( 0x0000FFFFFFFFFFFF ), 2918 LIT64( 0xFFFFFFFFFFFFFFFF ) 2919 ); 2920 } 2921 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2922 } 2923 if ( zExp < 0 ) { 2924 if (status->flush_to_zero) { 2925 float_raise(float_flag_output_denormal, status); 2926 return packFloat128(zSign, 0, 0, 0); 2927 } 2928 isTiny = 2929 (status->float_detect_tininess 2930 == float_tininess_before_rounding) 2931 || ( zExp < -1 ) 2932 || ! increment 2933 || lt128( 2934 zSig0, 2935 zSig1, 2936 LIT64( 0x0001FFFFFFFFFFFF ), 2937 LIT64( 0xFFFFFFFFFFFFFFFF ) 2938 ); 2939 shift128ExtraRightJamming( 2940 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2941 zExp = 0; 2942 if (isTiny && zSig2) { 2943 float_raise(float_flag_underflow, status); 2944 } 2945 switch (roundingMode) { 2946 case float_round_nearest_even: 2947 case float_round_ties_away: 2948 increment = ((int64_t)zSig2 < 0); 2949 break; 2950 case float_round_to_zero: 2951 increment = 0; 2952 break; 2953 case float_round_up: 2954 increment = !zSign && zSig2; 2955 break; 2956 case float_round_down: 2957 increment = zSign && zSig2; 2958 break; 2959 case float_round_to_odd: 2960 increment = !(zSig1 & 0x1) && zSig2; 2961 break; 2962 default: 2963 abort(); 2964 } 2965 } 2966 } 2967 if (zSig2) { 2968 status->float_exception_flags |= float_flag_inexact; 2969 } 2970 if ( increment ) { 2971 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2972 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2973 } 2974 else { 2975 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2976 } 2977 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2978 2979 } 2980 2981 /*---------------------------------------------------------------------------- 2982 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2983 | and significand formed by the concatenation of `zSig0' and `zSig1', and 2984 | returns the proper quadruple-precision floating-point value corresponding 2985 | to the abstract input. This routine is just like `roundAndPackFloat128' 2986 | except that the input significand has fewer bits and does not have to be 2987 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 2988 | point exponent. 2989 *----------------------------------------------------------------------------*/ 2990 2991 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2992 uint64_t zSig0, uint64_t zSig1, 2993 float_status *status) 2994 { 2995 int8_t shiftCount; 2996 uint64_t zSig2; 2997 2998 if ( zSig0 == 0 ) { 2999 zSig0 = zSig1; 3000 zSig1 = 0; 3001 zExp -= 64; 3002 } 3003 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3004 if ( 0 <= shiftCount ) { 3005 zSig2 = 0; 3006 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3007 } 3008 else { 3009 shift128ExtraRightJamming( 3010 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3011 } 3012 zExp -= shiftCount; 3013 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3014 3015 } 3016 3017 3018 /*---------------------------------------------------------------------------- 3019 | Returns the result of converting the 32-bit two's complement integer `a' 3020 | to the extended double-precision floating-point format. The conversion 3021 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3022 | Arithmetic. 3023 *----------------------------------------------------------------------------*/ 3024 3025 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3026 { 3027 flag zSign; 3028 uint32_t absA; 3029 int8_t shiftCount; 3030 uint64_t zSig; 3031 3032 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3033 zSign = ( a < 0 ); 3034 absA = zSign ? - a : a; 3035 shiftCount = countLeadingZeros32( absA ) + 32; 3036 zSig = absA; 3037 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3038 3039 } 3040 3041 /*---------------------------------------------------------------------------- 3042 | Returns the result of converting the 32-bit two's complement integer `a' to 3043 | the quadruple-precision floating-point format. The conversion is performed 3044 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3045 *----------------------------------------------------------------------------*/ 3046 3047 float128 int32_to_float128(int32_t a, float_status *status) 3048 { 3049 flag zSign; 3050 uint32_t absA; 3051 int8_t shiftCount; 3052 uint64_t zSig0; 3053 3054 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3055 zSign = ( a < 0 ); 3056 absA = zSign ? - a : a; 3057 shiftCount = countLeadingZeros32( absA ) + 17; 3058 zSig0 = absA; 3059 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3060 3061 } 3062 3063 /*---------------------------------------------------------------------------- 3064 | Returns the result of converting the 64-bit two's complement integer `a' 3065 | to the extended double-precision floating-point format. The conversion 3066 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3067 | Arithmetic. 3068 *----------------------------------------------------------------------------*/ 3069 3070 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3071 { 3072 flag zSign; 3073 uint64_t absA; 3074 int8_t shiftCount; 3075 3076 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3077 zSign = ( a < 0 ); 3078 absA = zSign ? - a : a; 3079 shiftCount = countLeadingZeros64( absA ); 3080 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3081 3082 } 3083 3084 /*---------------------------------------------------------------------------- 3085 | Returns the result of converting the 64-bit two's complement integer `a' to 3086 | the quadruple-precision floating-point format. The conversion is performed 3087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3088 *----------------------------------------------------------------------------*/ 3089 3090 float128 int64_to_float128(int64_t a, float_status *status) 3091 { 3092 flag zSign; 3093 uint64_t absA; 3094 int8_t shiftCount; 3095 int32_t zExp; 3096 uint64_t zSig0, zSig1; 3097 3098 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3099 zSign = ( a < 0 ); 3100 absA = zSign ? - a : a; 3101 shiftCount = countLeadingZeros64( absA ) + 49; 3102 zExp = 0x406E - shiftCount; 3103 if ( 64 <= shiftCount ) { 3104 zSig1 = 0; 3105 zSig0 = absA; 3106 shiftCount -= 64; 3107 } 3108 else { 3109 zSig1 = absA; 3110 zSig0 = 0; 3111 } 3112 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3113 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3114 3115 } 3116 3117 /*---------------------------------------------------------------------------- 3118 | Returns the result of converting the 64-bit unsigned integer `a' 3119 | to the quadruple-precision floating-point format. The conversion is performed 3120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3121 *----------------------------------------------------------------------------*/ 3122 3123 float128 uint64_to_float128(uint64_t a, float_status *status) 3124 { 3125 if (a == 0) { 3126 return float128_zero; 3127 } 3128 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3129 } 3130 3131 3132 3133 3134 /*---------------------------------------------------------------------------- 3135 | Returns the result of converting the single-precision floating-point value 3136 | `a' to the double-precision floating-point format. The conversion is 3137 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3138 | Arithmetic. 3139 *----------------------------------------------------------------------------*/ 3140 3141 float64 float32_to_float64(float32 a, float_status *status) 3142 { 3143 flag aSign; 3144 int aExp; 3145 uint32_t aSig; 3146 a = float32_squash_input_denormal(a, status); 3147 3148 aSig = extractFloat32Frac( a ); 3149 aExp = extractFloat32Exp( a ); 3150 aSign = extractFloat32Sign( a ); 3151 if ( aExp == 0xFF ) { 3152 if (aSig) { 3153 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3154 } 3155 return packFloat64( aSign, 0x7FF, 0 ); 3156 } 3157 if ( aExp == 0 ) { 3158 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3159 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3160 --aExp; 3161 } 3162 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3163 3164 } 3165 3166 /*---------------------------------------------------------------------------- 3167 | Returns the result of converting the single-precision floating-point value 3168 | `a' to the extended double-precision floating-point format. The conversion 3169 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3170 | Arithmetic. 3171 *----------------------------------------------------------------------------*/ 3172 3173 floatx80 float32_to_floatx80(float32 a, float_status *status) 3174 { 3175 flag aSign; 3176 int aExp; 3177 uint32_t aSig; 3178 3179 a = float32_squash_input_denormal(a, status); 3180 aSig = extractFloat32Frac( a ); 3181 aExp = extractFloat32Exp( a ); 3182 aSign = extractFloat32Sign( a ); 3183 if ( aExp == 0xFF ) { 3184 if (aSig) { 3185 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3186 } 3187 return packFloatx80(aSign, 3188 floatx80_infinity_high, 3189 floatx80_infinity_low); 3190 } 3191 if ( aExp == 0 ) { 3192 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3193 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3194 } 3195 aSig |= 0x00800000; 3196 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3197 3198 } 3199 3200 /*---------------------------------------------------------------------------- 3201 | Returns the result of converting the single-precision floating-point value 3202 | `a' to the double-precision floating-point format. The conversion is 3203 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3204 | Arithmetic. 3205 *----------------------------------------------------------------------------*/ 3206 3207 float128 float32_to_float128(float32 a, float_status *status) 3208 { 3209 flag aSign; 3210 int aExp; 3211 uint32_t aSig; 3212 3213 a = float32_squash_input_denormal(a, status); 3214 aSig = extractFloat32Frac( a ); 3215 aExp = extractFloat32Exp( a ); 3216 aSign = extractFloat32Sign( a ); 3217 if ( aExp == 0xFF ) { 3218 if (aSig) { 3219 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3220 } 3221 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3222 } 3223 if ( aExp == 0 ) { 3224 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3225 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3226 --aExp; 3227 } 3228 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3229 3230 } 3231 3232 /*---------------------------------------------------------------------------- 3233 | Returns the remainder of the single-precision floating-point value `a' 3234 | with respect to the corresponding value `b'. The operation is performed 3235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3236 *----------------------------------------------------------------------------*/ 3237 3238 float32 float32_rem(float32 a, float32 b, float_status *status) 3239 { 3240 flag aSign, zSign; 3241 int aExp, bExp, expDiff; 3242 uint32_t aSig, bSig; 3243 uint32_t q; 3244 uint64_t aSig64, bSig64, q64; 3245 uint32_t alternateASig; 3246 int32_t sigMean; 3247 a = float32_squash_input_denormal(a, status); 3248 b = float32_squash_input_denormal(b, status); 3249 3250 aSig = extractFloat32Frac( a ); 3251 aExp = extractFloat32Exp( a ); 3252 aSign = extractFloat32Sign( a ); 3253 bSig = extractFloat32Frac( b ); 3254 bExp = extractFloat32Exp( b ); 3255 if ( aExp == 0xFF ) { 3256 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3257 return propagateFloat32NaN(a, b, status); 3258 } 3259 float_raise(float_flag_invalid, status); 3260 return float32_default_nan(status); 3261 } 3262 if ( bExp == 0xFF ) { 3263 if (bSig) { 3264 return propagateFloat32NaN(a, b, status); 3265 } 3266 return a; 3267 } 3268 if ( bExp == 0 ) { 3269 if ( bSig == 0 ) { 3270 float_raise(float_flag_invalid, status); 3271 return float32_default_nan(status); 3272 } 3273 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3274 } 3275 if ( aExp == 0 ) { 3276 if ( aSig == 0 ) return a; 3277 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3278 } 3279 expDiff = aExp - bExp; 3280 aSig |= 0x00800000; 3281 bSig |= 0x00800000; 3282 if ( expDiff < 32 ) { 3283 aSig <<= 8; 3284 bSig <<= 8; 3285 if ( expDiff < 0 ) { 3286 if ( expDiff < -1 ) return a; 3287 aSig >>= 1; 3288 } 3289 q = ( bSig <= aSig ); 3290 if ( q ) aSig -= bSig; 3291 if ( 0 < expDiff ) { 3292 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3293 q >>= 32 - expDiff; 3294 bSig >>= 2; 3295 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3296 } 3297 else { 3298 aSig >>= 2; 3299 bSig >>= 2; 3300 } 3301 } 3302 else { 3303 if ( bSig <= aSig ) aSig -= bSig; 3304 aSig64 = ( (uint64_t) aSig )<<40; 3305 bSig64 = ( (uint64_t) bSig )<<40; 3306 expDiff -= 64; 3307 while ( 0 < expDiff ) { 3308 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3309 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3310 aSig64 = - ( ( bSig * q64 )<<38 ); 3311 expDiff -= 62; 3312 } 3313 expDiff += 64; 3314 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3315 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3316 q = q64>>( 64 - expDiff ); 3317 bSig <<= 6; 3318 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3319 } 3320 do { 3321 alternateASig = aSig; 3322 ++q; 3323 aSig -= bSig; 3324 } while ( 0 <= (int32_t) aSig ); 3325 sigMean = aSig + alternateASig; 3326 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3327 aSig = alternateASig; 3328 } 3329 zSign = ( (int32_t) aSig < 0 ); 3330 if ( zSign ) aSig = - aSig; 3331 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3332 } 3333 3334 3335 3336 /*---------------------------------------------------------------------------- 3337 | Returns the binary exponential of the single-precision floating-point value 3338 | `a'. The operation is performed according to the IEC/IEEE Standard for 3339 | Binary Floating-Point Arithmetic. 3340 | 3341 | Uses the following identities: 3342 | 3343 | 1. ------------------------------------------------------------------------- 3344 | x x*ln(2) 3345 | 2 = e 3346 | 3347 | 2. ------------------------------------------------------------------------- 3348 | 2 3 4 5 n 3349 | x x x x x x x 3350 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3351 | 1! 2! 3! 4! 5! n! 3352 *----------------------------------------------------------------------------*/ 3353 3354 static const float64 float32_exp2_coefficients[15] = 3355 { 3356 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3357 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3358 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3359 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3360 const_float64( 0x3f81111111111111ll ), /* 5 */ 3361 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3362 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3363 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3364 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3365 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3366 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3367 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3368 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3369 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3370 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3371 }; 3372 3373 float32 float32_exp2(float32 a, float_status *status) 3374 { 3375 flag aSign; 3376 int aExp; 3377 uint32_t aSig; 3378 float64 r, x, xn; 3379 int i; 3380 a = float32_squash_input_denormal(a, status); 3381 3382 aSig = extractFloat32Frac( a ); 3383 aExp = extractFloat32Exp( a ); 3384 aSign = extractFloat32Sign( a ); 3385 3386 if ( aExp == 0xFF) { 3387 if (aSig) { 3388 return propagateFloat32NaN(a, float32_zero, status); 3389 } 3390 return (aSign) ? float32_zero : a; 3391 } 3392 if (aExp == 0) { 3393 if (aSig == 0) return float32_one; 3394 } 3395 3396 float_raise(float_flag_inexact, status); 3397 3398 /* ******************************* */ 3399 /* using float64 for approximation */ 3400 /* ******************************* */ 3401 x = float32_to_float64(a, status); 3402 x = float64_mul(x, float64_ln2, status); 3403 3404 xn = x; 3405 r = float64_one; 3406 for (i = 0 ; i < 15 ; i++) { 3407 float64 f; 3408 3409 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3410 r = float64_add(r, f, status); 3411 3412 xn = float64_mul(xn, x, status); 3413 } 3414 3415 return float64_to_float32(r, status); 3416 } 3417 3418 /*---------------------------------------------------------------------------- 3419 | Returns the binary log of the single-precision floating-point value `a'. 3420 | The operation is performed according to the IEC/IEEE Standard for Binary 3421 | Floating-Point Arithmetic. 3422 *----------------------------------------------------------------------------*/ 3423 float32 float32_log2(float32 a, float_status *status) 3424 { 3425 flag aSign, zSign; 3426 int aExp; 3427 uint32_t aSig, zSig, i; 3428 3429 a = float32_squash_input_denormal(a, status); 3430 aSig = extractFloat32Frac( a ); 3431 aExp = extractFloat32Exp( a ); 3432 aSign = extractFloat32Sign( a ); 3433 3434 if ( aExp == 0 ) { 3435 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3436 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3437 } 3438 if ( aSign ) { 3439 float_raise(float_flag_invalid, status); 3440 return float32_default_nan(status); 3441 } 3442 if ( aExp == 0xFF ) { 3443 if (aSig) { 3444 return propagateFloat32NaN(a, float32_zero, status); 3445 } 3446 return a; 3447 } 3448 3449 aExp -= 0x7F; 3450 aSig |= 0x00800000; 3451 zSign = aExp < 0; 3452 zSig = aExp << 23; 3453 3454 for (i = 1 << 22; i > 0; i >>= 1) { 3455 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3456 if ( aSig & 0x01000000 ) { 3457 aSig >>= 1; 3458 zSig |= i; 3459 } 3460 } 3461 3462 if ( zSign ) 3463 zSig = -zSig; 3464 3465 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3466 } 3467 3468 /*---------------------------------------------------------------------------- 3469 | Returns 1 if the single-precision floating-point value `a' is equal to 3470 | the corresponding value `b', and 0 otherwise. The invalid exception is 3471 | raised if either operand is a NaN. Otherwise, the comparison is performed 3472 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3473 *----------------------------------------------------------------------------*/ 3474 3475 int float32_eq(float32 a, float32 b, float_status *status) 3476 { 3477 uint32_t av, bv; 3478 a = float32_squash_input_denormal(a, status); 3479 b = float32_squash_input_denormal(b, status); 3480 3481 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3482 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3483 ) { 3484 float_raise(float_flag_invalid, status); 3485 return 0; 3486 } 3487 av = float32_val(a); 3488 bv = float32_val(b); 3489 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3490 } 3491 3492 /*---------------------------------------------------------------------------- 3493 | Returns 1 if the single-precision floating-point value `a' is less than 3494 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3495 | exception is raised if either operand is a NaN. The comparison is performed 3496 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3497 *----------------------------------------------------------------------------*/ 3498 3499 int float32_le(float32 a, float32 b, float_status *status) 3500 { 3501 flag aSign, bSign; 3502 uint32_t av, bv; 3503 a = float32_squash_input_denormal(a, status); 3504 b = float32_squash_input_denormal(b, status); 3505 3506 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3507 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3508 ) { 3509 float_raise(float_flag_invalid, status); 3510 return 0; 3511 } 3512 aSign = extractFloat32Sign( a ); 3513 bSign = extractFloat32Sign( b ); 3514 av = float32_val(a); 3515 bv = float32_val(b); 3516 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3517 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3518 3519 } 3520 3521 /*---------------------------------------------------------------------------- 3522 | Returns 1 if the single-precision floating-point value `a' is less than 3523 | the corresponding value `b', and 0 otherwise. The invalid exception is 3524 | raised if either operand is a NaN. The comparison is performed according 3525 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3526 *----------------------------------------------------------------------------*/ 3527 3528 int float32_lt(float32 a, float32 b, float_status *status) 3529 { 3530 flag aSign, bSign; 3531 uint32_t av, bv; 3532 a = float32_squash_input_denormal(a, status); 3533 b = float32_squash_input_denormal(b, status); 3534 3535 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3536 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3537 ) { 3538 float_raise(float_flag_invalid, status); 3539 return 0; 3540 } 3541 aSign = extractFloat32Sign( a ); 3542 bSign = extractFloat32Sign( b ); 3543 av = float32_val(a); 3544 bv = float32_val(b); 3545 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3546 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3547 3548 } 3549 3550 /*---------------------------------------------------------------------------- 3551 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3552 | be compared, and 0 otherwise. The invalid exception is raised if either 3553 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3554 | Standard for Binary Floating-Point Arithmetic. 3555 *----------------------------------------------------------------------------*/ 3556 3557 int float32_unordered(float32 a, float32 b, float_status *status) 3558 { 3559 a = float32_squash_input_denormal(a, status); 3560 b = float32_squash_input_denormal(b, status); 3561 3562 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3563 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3564 ) { 3565 float_raise(float_flag_invalid, status); 3566 return 1; 3567 } 3568 return 0; 3569 } 3570 3571 /*---------------------------------------------------------------------------- 3572 | Returns 1 if the single-precision floating-point value `a' is equal to 3573 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3574 | exception. The comparison is performed according to the IEC/IEEE Standard 3575 | for Binary Floating-Point Arithmetic. 3576 *----------------------------------------------------------------------------*/ 3577 3578 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3579 { 3580 a = float32_squash_input_denormal(a, status); 3581 b = float32_squash_input_denormal(b, status); 3582 3583 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3584 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3585 ) { 3586 if (float32_is_signaling_nan(a, status) 3587 || float32_is_signaling_nan(b, status)) { 3588 float_raise(float_flag_invalid, status); 3589 } 3590 return 0; 3591 } 3592 return ( float32_val(a) == float32_val(b) ) || 3593 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3594 } 3595 3596 /*---------------------------------------------------------------------------- 3597 | Returns 1 if the single-precision floating-point value `a' is less than or 3598 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3599 | cause an exception. Otherwise, the comparison is performed according to the 3600 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3601 *----------------------------------------------------------------------------*/ 3602 3603 int float32_le_quiet(float32 a, float32 b, float_status *status) 3604 { 3605 flag aSign, bSign; 3606 uint32_t av, bv; 3607 a = float32_squash_input_denormal(a, status); 3608 b = float32_squash_input_denormal(b, status); 3609 3610 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3611 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3612 ) { 3613 if (float32_is_signaling_nan(a, status) 3614 || float32_is_signaling_nan(b, status)) { 3615 float_raise(float_flag_invalid, status); 3616 } 3617 return 0; 3618 } 3619 aSign = extractFloat32Sign( a ); 3620 bSign = extractFloat32Sign( b ); 3621 av = float32_val(a); 3622 bv = float32_val(b); 3623 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3624 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3625 3626 } 3627 3628 /*---------------------------------------------------------------------------- 3629 | Returns 1 if the single-precision floating-point value `a' is less than 3630 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3631 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3632 | Standard for Binary Floating-Point Arithmetic. 3633 *----------------------------------------------------------------------------*/ 3634 3635 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3636 { 3637 flag aSign, bSign; 3638 uint32_t av, bv; 3639 a = float32_squash_input_denormal(a, status); 3640 b = float32_squash_input_denormal(b, status); 3641 3642 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3643 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3644 ) { 3645 if (float32_is_signaling_nan(a, status) 3646 || float32_is_signaling_nan(b, status)) { 3647 float_raise(float_flag_invalid, status); 3648 } 3649 return 0; 3650 } 3651 aSign = extractFloat32Sign( a ); 3652 bSign = extractFloat32Sign( b ); 3653 av = float32_val(a); 3654 bv = float32_val(b); 3655 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3656 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3657 3658 } 3659 3660 /*---------------------------------------------------------------------------- 3661 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3662 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3663 | comparison is performed according to the IEC/IEEE Standard for Binary 3664 | Floating-Point Arithmetic. 3665 *----------------------------------------------------------------------------*/ 3666 3667 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3668 { 3669 a = float32_squash_input_denormal(a, status); 3670 b = float32_squash_input_denormal(b, status); 3671 3672 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3673 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3674 ) { 3675 if (float32_is_signaling_nan(a, status) 3676 || float32_is_signaling_nan(b, status)) { 3677 float_raise(float_flag_invalid, status); 3678 } 3679 return 1; 3680 } 3681 return 0; 3682 } 3683 3684 3685 /*---------------------------------------------------------------------------- 3686 | Returns the result of converting the double-precision floating-point value 3687 | `a' to the single-precision floating-point format. The conversion is 3688 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3689 | Arithmetic. 3690 *----------------------------------------------------------------------------*/ 3691 3692 float32 float64_to_float32(float64 a, float_status *status) 3693 { 3694 flag aSign; 3695 int aExp; 3696 uint64_t aSig; 3697 uint32_t zSig; 3698 a = float64_squash_input_denormal(a, status); 3699 3700 aSig = extractFloat64Frac( a ); 3701 aExp = extractFloat64Exp( a ); 3702 aSign = extractFloat64Sign( a ); 3703 if ( aExp == 0x7FF ) { 3704 if (aSig) { 3705 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3706 } 3707 return packFloat32( aSign, 0xFF, 0 ); 3708 } 3709 shift64RightJamming( aSig, 22, &aSig ); 3710 zSig = aSig; 3711 if ( aExp || zSig ) { 3712 zSig |= 0x40000000; 3713 aExp -= 0x381; 3714 } 3715 return roundAndPackFloat32(aSign, aExp, zSig, status); 3716 3717 } 3718 3719 3720 /*---------------------------------------------------------------------------- 3721 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3722 | half-precision floating-point value, returning the result. After being 3723 | shifted into the proper positions, the three fields are simply added 3724 | together to form the result. This means that any integer portion of `zSig' 3725 | will be added into the exponent. Since a properly normalized significand 3726 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3727 | than the desired result exponent whenever `zSig' is a complete, normalized 3728 | significand. 3729 *----------------------------------------------------------------------------*/ 3730 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3731 { 3732 return make_float16( 3733 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3734 } 3735 3736 /*---------------------------------------------------------------------------- 3737 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3738 | and significand `zSig', and returns the proper half-precision floating- 3739 | point value corresponding to the abstract input. Ordinarily, the abstract 3740 | value is simply rounded and packed into the half-precision format, with 3741 | the inexact exception raised if the abstract input cannot be represented 3742 | exactly. However, if the abstract value is too large, the overflow and 3743 | inexact exceptions are raised and an infinity or maximal finite value is 3744 | returned. If the abstract value is too small, the input value is rounded to 3745 | a subnormal number, and the underflow and inexact exceptions are raised if 3746 | the abstract input cannot be represented exactly as a subnormal half- 3747 | precision floating-point number. 3748 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3749 | ARM-style "alternative representation", which omits the NaN and Inf 3750 | encodings in order to raise the maximum representable exponent by one. 3751 | The input significand `zSig' has its binary point between bits 22 3752 | and 23, which is 13 bits to the left of the usual location. This shifted 3753 | significand must be normalized or smaller. If `zSig' is not normalized, 3754 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3755 | and it must not require rounding. In the usual case that `zSig' is 3756 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3757 | Note the slightly odd position of the binary point in zSig compared with the 3758 | other roundAndPackFloat functions. This should probably be fixed if we 3759 | need to implement more float16 routines than just conversion. 3760 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3761 | Binary Floating-Point Arithmetic. 3762 *----------------------------------------------------------------------------*/ 3763 3764 static float16 roundAndPackFloat16(flag zSign, int zExp, 3765 uint32_t zSig, flag ieee, 3766 float_status *status) 3767 { 3768 int maxexp = ieee ? 29 : 30; 3769 uint32_t mask; 3770 uint32_t increment; 3771 bool rounding_bumps_exp; 3772 bool is_tiny = false; 3773 3774 /* Calculate the mask of bits of the mantissa which are not 3775 * representable in half-precision and will be lost. 3776 */ 3777 if (zExp < 1) { 3778 /* Will be denormal in halfprec */ 3779 mask = 0x00ffffff; 3780 if (zExp >= -11) { 3781 mask >>= 11 + zExp; 3782 } 3783 } else { 3784 /* Normal number in halfprec */ 3785 mask = 0x00001fff; 3786 } 3787 3788 switch (status->float_rounding_mode) { 3789 case float_round_nearest_even: 3790 increment = (mask + 1) >> 1; 3791 if ((zSig & mask) == increment) { 3792 increment = zSig & (increment << 1); 3793 } 3794 break; 3795 case float_round_ties_away: 3796 increment = (mask + 1) >> 1; 3797 break; 3798 case float_round_up: 3799 increment = zSign ? 0 : mask; 3800 break; 3801 case float_round_down: 3802 increment = zSign ? mask : 0; 3803 break; 3804 default: /* round_to_zero */ 3805 increment = 0; 3806 break; 3807 } 3808 3809 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3810 3811 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3812 if (ieee) { 3813 float_raise(float_flag_overflow | float_flag_inexact, status); 3814 return packFloat16(zSign, 0x1f, 0); 3815 } else { 3816 float_raise(float_flag_invalid, status); 3817 return packFloat16(zSign, 0x1f, 0x3ff); 3818 } 3819 } 3820 3821 if (zExp < 0) { 3822 /* Note that flush-to-zero does not affect half-precision results */ 3823 is_tiny = 3824 (status->float_detect_tininess == float_tininess_before_rounding) 3825 || (zExp < -1) 3826 || (!rounding_bumps_exp); 3827 } 3828 if (zSig & mask) { 3829 float_raise(float_flag_inexact, status); 3830 if (is_tiny) { 3831 float_raise(float_flag_underflow, status); 3832 } 3833 } 3834 3835 zSig += increment; 3836 if (rounding_bumps_exp) { 3837 zSig >>= 1; 3838 zExp++; 3839 } 3840 3841 if (zExp < -10) { 3842 return packFloat16(zSign, 0, 0); 3843 } 3844 if (zExp < 0) { 3845 zSig >>= -zExp; 3846 zExp = 0; 3847 } 3848 return packFloat16(zSign, zExp, zSig >> 13); 3849 } 3850 3851 /*---------------------------------------------------------------------------- 3852 | If `a' is denormal and we are in flush-to-zero mode then set the 3853 | input-denormal exception and return zero. Otherwise just return the value. 3854 *----------------------------------------------------------------------------*/ 3855 float16 float16_squash_input_denormal(float16 a, float_status *status) 3856 { 3857 if (status->flush_inputs_to_zero) { 3858 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3859 float_raise(float_flag_input_denormal, status); 3860 return make_float16(float16_val(a) & 0x8000); 3861 } 3862 } 3863 return a; 3864 } 3865 3866 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3867 uint32_t *zSigPtr) 3868 { 3869 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3870 *zSigPtr = aSig << shiftCount; 3871 *zExpPtr = 1 - shiftCount; 3872 } 3873 3874 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3875 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3876 3877 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3878 { 3879 flag aSign; 3880 int aExp; 3881 uint32_t aSig; 3882 3883 aSign = extractFloat16Sign(a); 3884 aExp = extractFloat16Exp(a); 3885 aSig = extractFloat16Frac(a); 3886 3887 if (aExp == 0x1f && ieee) { 3888 if (aSig) { 3889 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3890 } 3891 return packFloat32(aSign, 0xff, 0); 3892 } 3893 if (aExp == 0) { 3894 if (aSig == 0) { 3895 return packFloat32(aSign, 0, 0); 3896 } 3897 3898 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3899 aExp--; 3900 } 3901 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3902 } 3903 3904 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3905 { 3906 flag aSign; 3907 int aExp; 3908 uint32_t aSig; 3909 3910 a = float32_squash_input_denormal(a, status); 3911 3912 aSig = extractFloat32Frac( a ); 3913 aExp = extractFloat32Exp( a ); 3914 aSign = extractFloat32Sign( a ); 3915 if ( aExp == 0xFF ) { 3916 if (aSig) { 3917 /* Input is a NaN */ 3918 if (!ieee) { 3919 float_raise(float_flag_invalid, status); 3920 return packFloat16(aSign, 0, 0); 3921 } 3922 return commonNaNToFloat16( 3923 float32ToCommonNaN(a, status), status); 3924 } 3925 /* Infinity */ 3926 if (!ieee) { 3927 float_raise(float_flag_invalid, status); 3928 return packFloat16(aSign, 0x1f, 0x3ff); 3929 } 3930 return packFloat16(aSign, 0x1f, 0); 3931 } 3932 if (aExp == 0 && aSig == 0) { 3933 return packFloat16(aSign, 0, 0); 3934 } 3935 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3936 * even if the input is denormal; however this is harmless because 3937 * the largest possible single-precision denormal is still smaller 3938 * than the smallest representable half-precision denormal, and so we 3939 * will end up ignoring aSig and returning via the "always return zero" 3940 * codepath. 3941 */ 3942 aSig |= 0x00800000; 3943 aExp -= 0x71; 3944 3945 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3946 } 3947 3948 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3949 { 3950 flag aSign; 3951 int aExp; 3952 uint32_t aSig; 3953 3954 aSign = extractFloat16Sign(a); 3955 aExp = extractFloat16Exp(a); 3956 aSig = extractFloat16Frac(a); 3957 3958 if (aExp == 0x1f && ieee) { 3959 if (aSig) { 3960 return commonNaNToFloat64( 3961 float16ToCommonNaN(a, status), status); 3962 } 3963 return packFloat64(aSign, 0x7ff, 0); 3964 } 3965 if (aExp == 0) { 3966 if (aSig == 0) { 3967 return packFloat64(aSign, 0, 0); 3968 } 3969 3970 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3971 aExp--; 3972 } 3973 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3974 } 3975 3976 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3977 { 3978 flag aSign; 3979 int aExp; 3980 uint64_t aSig; 3981 uint32_t zSig; 3982 3983 a = float64_squash_input_denormal(a, status); 3984 3985 aSig = extractFloat64Frac(a); 3986 aExp = extractFloat64Exp(a); 3987 aSign = extractFloat64Sign(a); 3988 if (aExp == 0x7FF) { 3989 if (aSig) { 3990 /* Input is a NaN */ 3991 if (!ieee) { 3992 float_raise(float_flag_invalid, status); 3993 return packFloat16(aSign, 0, 0); 3994 } 3995 return commonNaNToFloat16( 3996 float64ToCommonNaN(a, status), status); 3997 } 3998 /* Infinity */ 3999 if (!ieee) { 4000 float_raise(float_flag_invalid, status); 4001 return packFloat16(aSign, 0x1f, 0x3ff); 4002 } 4003 return packFloat16(aSign, 0x1f, 0); 4004 } 4005 shift64RightJamming(aSig, 29, &aSig); 4006 zSig = aSig; 4007 if (aExp == 0 && zSig == 0) { 4008 return packFloat16(aSign, 0, 0); 4009 } 4010 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4011 * even if the input is denormal; however this is harmless because 4012 * the largest possible single-precision denormal is still smaller 4013 * than the smallest representable half-precision denormal, and so we 4014 * will end up ignoring aSig and returning via the "always return zero" 4015 * codepath. 4016 */ 4017 zSig |= 0x00800000; 4018 aExp -= 0x3F1; 4019 4020 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4021 } 4022 4023 /*---------------------------------------------------------------------------- 4024 | Returns the result of converting the double-precision floating-point value 4025 | `a' to the extended double-precision floating-point format. The conversion 4026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4027 | Arithmetic. 4028 *----------------------------------------------------------------------------*/ 4029 4030 floatx80 float64_to_floatx80(float64 a, float_status *status) 4031 { 4032 flag aSign; 4033 int aExp; 4034 uint64_t aSig; 4035 4036 a = float64_squash_input_denormal(a, status); 4037 aSig = extractFloat64Frac( a ); 4038 aExp = extractFloat64Exp( a ); 4039 aSign = extractFloat64Sign( a ); 4040 if ( aExp == 0x7FF ) { 4041 if (aSig) { 4042 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4043 } 4044 return packFloatx80(aSign, 4045 floatx80_infinity_high, 4046 floatx80_infinity_low); 4047 } 4048 if ( aExp == 0 ) { 4049 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4050 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4051 } 4052 return 4053 packFloatx80( 4054 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4055 4056 } 4057 4058 /*---------------------------------------------------------------------------- 4059 | Returns the result of converting the double-precision floating-point value 4060 | `a' to the quadruple-precision floating-point format. The conversion is 4061 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4062 | Arithmetic. 4063 *----------------------------------------------------------------------------*/ 4064 4065 float128 float64_to_float128(float64 a, float_status *status) 4066 { 4067 flag aSign; 4068 int aExp; 4069 uint64_t aSig, zSig0, zSig1; 4070 4071 a = float64_squash_input_denormal(a, status); 4072 aSig = extractFloat64Frac( a ); 4073 aExp = extractFloat64Exp( a ); 4074 aSign = extractFloat64Sign( a ); 4075 if ( aExp == 0x7FF ) { 4076 if (aSig) { 4077 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4078 } 4079 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4080 } 4081 if ( aExp == 0 ) { 4082 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4083 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4084 --aExp; 4085 } 4086 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4087 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4088 4089 } 4090 4091 4092 /*---------------------------------------------------------------------------- 4093 | Returns the remainder of the double-precision floating-point value `a' 4094 | with respect to the corresponding value `b'. The operation is performed 4095 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4096 *----------------------------------------------------------------------------*/ 4097 4098 float64 float64_rem(float64 a, float64 b, float_status *status) 4099 { 4100 flag aSign, zSign; 4101 int aExp, bExp, expDiff; 4102 uint64_t aSig, bSig; 4103 uint64_t q, alternateASig; 4104 int64_t sigMean; 4105 4106 a = float64_squash_input_denormal(a, status); 4107 b = float64_squash_input_denormal(b, status); 4108 aSig = extractFloat64Frac( a ); 4109 aExp = extractFloat64Exp( a ); 4110 aSign = extractFloat64Sign( a ); 4111 bSig = extractFloat64Frac( b ); 4112 bExp = extractFloat64Exp( b ); 4113 if ( aExp == 0x7FF ) { 4114 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4115 return propagateFloat64NaN(a, b, status); 4116 } 4117 float_raise(float_flag_invalid, status); 4118 return float64_default_nan(status); 4119 } 4120 if ( bExp == 0x7FF ) { 4121 if (bSig) { 4122 return propagateFloat64NaN(a, b, status); 4123 } 4124 return a; 4125 } 4126 if ( bExp == 0 ) { 4127 if ( bSig == 0 ) { 4128 float_raise(float_flag_invalid, status); 4129 return float64_default_nan(status); 4130 } 4131 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4132 } 4133 if ( aExp == 0 ) { 4134 if ( aSig == 0 ) return a; 4135 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4136 } 4137 expDiff = aExp - bExp; 4138 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4139 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4140 if ( expDiff < 0 ) { 4141 if ( expDiff < -1 ) return a; 4142 aSig >>= 1; 4143 } 4144 q = ( bSig <= aSig ); 4145 if ( q ) aSig -= bSig; 4146 expDiff -= 64; 4147 while ( 0 < expDiff ) { 4148 q = estimateDiv128To64( aSig, 0, bSig ); 4149 q = ( 2 < q ) ? q - 2 : 0; 4150 aSig = - ( ( bSig>>2 ) * q ); 4151 expDiff -= 62; 4152 } 4153 expDiff += 64; 4154 if ( 0 < expDiff ) { 4155 q = estimateDiv128To64( aSig, 0, bSig ); 4156 q = ( 2 < q ) ? q - 2 : 0; 4157 q >>= 64 - expDiff; 4158 bSig >>= 2; 4159 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4160 } 4161 else { 4162 aSig >>= 2; 4163 bSig >>= 2; 4164 } 4165 do { 4166 alternateASig = aSig; 4167 ++q; 4168 aSig -= bSig; 4169 } while ( 0 <= (int64_t) aSig ); 4170 sigMean = aSig + alternateASig; 4171 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4172 aSig = alternateASig; 4173 } 4174 zSign = ( (int64_t) aSig < 0 ); 4175 if ( zSign ) aSig = - aSig; 4176 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4177 4178 } 4179 4180 /*---------------------------------------------------------------------------- 4181 | Returns the binary log of the double-precision floating-point value `a'. 4182 | The operation is performed according to the IEC/IEEE Standard for Binary 4183 | Floating-Point Arithmetic. 4184 *----------------------------------------------------------------------------*/ 4185 float64 float64_log2(float64 a, float_status *status) 4186 { 4187 flag aSign, zSign; 4188 int aExp; 4189 uint64_t aSig, aSig0, aSig1, zSig, i; 4190 a = float64_squash_input_denormal(a, status); 4191 4192 aSig = extractFloat64Frac( a ); 4193 aExp = extractFloat64Exp( a ); 4194 aSign = extractFloat64Sign( a ); 4195 4196 if ( aExp == 0 ) { 4197 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4198 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4199 } 4200 if ( aSign ) { 4201 float_raise(float_flag_invalid, status); 4202 return float64_default_nan(status); 4203 } 4204 if ( aExp == 0x7FF ) { 4205 if (aSig) { 4206 return propagateFloat64NaN(a, float64_zero, status); 4207 } 4208 return a; 4209 } 4210 4211 aExp -= 0x3FF; 4212 aSig |= LIT64( 0x0010000000000000 ); 4213 zSign = aExp < 0; 4214 zSig = (uint64_t)aExp << 52; 4215 for (i = 1LL << 51; i > 0; i >>= 1) { 4216 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4217 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4218 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4219 aSig >>= 1; 4220 zSig |= i; 4221 } 4222 } 4223 4224 if ( zSign ) 4225 zSig = -zSig; 4226 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4227 } 4228 4229 /*---------------------------------------------------------------------------- 4230 | Returns 1 if the double-precision floating-point value `a' is equal to the 4231 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4232 | if either operand is a NaN. Otherwise, the comparison is performed 4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4234 *----------------------------------------------------------------------------*/ 4235 4236 int float64_eq(float64 a, float64 b, float_status *status) 4237 { 4238 uint64_t av, bv; 4239 a = float64_squash_input_denormal(a, status); 4240 b = float64_squash_input_denormal(b, status); 4241 4242 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4243 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4244 ) { 4245 float_raise(float_flag_invalid, status); 4246 return 0; 4247 } 4248 av = float64_val(a); 4249 bv = float64_val(b); 4250 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4251 4252 } 4253 4254 /*---------------------------------------------------------------------------- 4255 | Returns 1 if the double-precision floating-point value `a' is less than or 4256 | equal to the corresponding value `b', and 0 otherwise. The invalid 4257 | exception is raised if either operand is a NaN. The comparison is performed 4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4259 *----------------------------------------------------------------------------*/ 4260 4261 int float64_le(float64 a, float64 b, float_status *status) 4262 { 4263 flag aSign, bSign; 4264 uint64_t av, bv; 4265 a = float64_squash_input_denormal(a, status); 4266 b = float64_squash_input_denormal(b, status); 4267 4268 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4269 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4270 ) { 4271 float_raise(float_flag_invalid, status); 4272 return 0; 4273 } 4274 aSign = extractFloat64Sign( a ); 4275 bSign = extractFloat64Sign( b ); 4276 av = float64_val(a); 4277 bv = float64_val(b); 4278 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4279 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4280 4281 } 4282 4283 /*---------------------------------------------------------------------------- 4284 | Returns 1 if the double-precision floating-point value `a' is less than 4285 | the corresponding value `b', and 0 otherwise. The invalid exception is 4286 | raised if either operand is a NaN. The comparison is performed according 4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4288 *----------------------------------------------------------------------------*/ 4289 4290 int float64_lt(float64 a, float64 b, float_status *status) 4291 { 4292 flag aSign, bSign; 4293 uint64_t av, bv; 4294 4295 a = float64_squash_input_denormal(a, status); 4296 b = float64_squash_input_denormal(b, status); 4297 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4298 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4299 ) { 4300 float_raise(float_flag_invalid, status); 4301 return 0; 4302 } 4303 aSign = extractFloat64Sign( a ); 4304 bSign = extractFloat64Sign( b ); 4305 av = float64_val(a); 4306 bv = float64_val(b); 4307 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4308 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4309 4310 } 4311 4312 /*---------------------------------------------------------------------------- 4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4314 | be compared, and 0 otherwise. The invalid exception is raised if either 4315 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4316 | Standard for Binary Floating-Point Arithmetic. 4317 *----------------------------------------------------------------------------*/ 4318 4319 int float64_unordered(float64 a, float64 b, float_status *status) 4320 { 4321 a = float64_squash_input_denormal(a, status); 4322 b = float64_squash_input_denormal(b, status); 4323 4324 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4325 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4326 ) { 4327 float_raise(float_flag_invalid, status); 4328 return 1; 4329 } 4330 return 0; 4331 } 4332 4333 /*---------------------------------------------------------------------------- 4334 | Returns 1 if the double-precision floating-point value `a' is equal to the 4335 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4336 | exception.The comparison is performed according to the IEC/IEEE Standard 4337 | for Binary Floating-Point Arithmetic. 4338 *----------------------------------------------------------------------------*/ 4339 4340 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4341 { 4342 uint64_t av, bv; 4343 a = float64_squash_input_denormal(a, status); 4344 b = float64_squash_input_denormal(b, status); 4345 4346 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4347 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4348 ) { 4349 if (float64_is_signaling_nan(a, status) 4350 || float64_is_signaling_nan(b, status)) { 4351 float_raise(float_flag_invalid, status); 4352 } 4353 return 0; 4354 } 4355 av = float64_val(a); 4356 bv = float64_val(b); 4357 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4358 4359 } 4360 4361 /*---------------------------------------------------------------------------- 4362 | Returns 1 if the double-precision floating-point value `a' is less than or 4363 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4364 | cause an exception. Otherwise, the comparison is performed according to the 4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4366 *----------------------------------------------------------------------------*/ 4367 4368 int float64_le_quiet(float64 a, float64 b, float_status *status) 4369 { 4370 flag aSign, bSign; 4371 uint64_t av, bv; 4372 a = float64_squash_input_denormal(a, status); 4373 b = float64_squash_input_denormal(b, status); 4374 4375 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4376 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4377 ) { 4378 if (float64_is_signaling_nan(a, status) 4379 || float64_is_signaling_nan(b, status)) { 4380 float_raise(float_flag_invalid, status); 4381 } 4382 return 0; 4383 } 4384 aSign = extractFloat64Sign( a ); 4385 bSign = extractFloat64Sign( b ); 4386 av = float64_val(a); 4387 bv = float64_val(b); 4388 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4389 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4390 4391 } 4392 4393 /*---------------------------------------------------------------------------- 4394 | Returns 1 if the double-precision floating-point value `a' is less than 4395 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4396 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4397 | Standard for Binary Floating-Point Arithmetic. 4398 *----------------------------------------------------------------------------*/ 4399 4400 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4401 { 4402 flag aSign, bSign; 4403 uint64_t av, bv; 4404 a = float64_squash_input_denormal(a, status); 4405 b = float64_squash_input_denormal(b, status); 4406 4407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4409 ) { 4410 if (float64_is_signaling_nan(a, status) 4411 || float64_is_signaling_nan(b, status)) { 4412 float_raise(float_flag_invalid, status); 4413 } 4414 return 0; 4415 } 4416 aSign = extractFloat64Sign( a ); 4417 bSign = extractFloat64Sign( b ); 4418 av = float64_val(a); 4419 bv = float64_val(b); 4420 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4421 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4422 4423 } 4424 4425 /*---------------------------------------------------------------------------- 4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4427 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4428 | comparison is performed according to the IEC/IEEE Standard for Binary 4429 | Floating-Point Arithmetic. 4430 *----------------------------------------------------------------------------*/ 4431 4432 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4433 { 4434 a = float64_squash_input_denormal(a, status); 4435 b = float64_squash_input_denormal(b, status); 4436 4437 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4438 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4439 ) { 4440 if (float64_is_signaling_nan(a, status) 4441 || float64_is_signaling_nan(b, status)) { 4442 float_raise(float_flag_invalid, status); 4443 } 4444 return 1; 4445 } 4446 return 0; 4447 } 4448 4449 /*---------------------------------------------------------------------------- 4450 | Returns the result of converting the extended double-precision floating- 4451 | point value `a' to the 32-bit two's complement integer format. The 4452 | conversion is performed according to the IEC/IEEE Standard for Binary 4453 | Floating-Point Arithmetic---which means in particular that the conversion 4454 | is rounded according to the current rounding mode. If `a' is a NaN, the 4455 | largest positive integer is returned. Otherwise, if the conversion 4456 | overflows, the largest integer with the same sign as `a' is returned. 4457 *----------------------------------------------------------------------------*/ 4458 4459 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4460 { 4461 flag aSign; 4462 int32_t aExp, shiftCount; 4463 uint64_t aSig; 4464 4465 if (floatx80_invalid_encoding(a)) { 4466 float_raise(float_flag_invalid, status); 4467 return 1 << 31; 4468 } 4469 aSig = extractFloatx80Frac( a ); 4470 aExp = extractFloatx80Exp( a ); 4471 aSign = extractFloatx80Sign( a ); 4472 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4473 shiftCount = 0x4037 - aExp; 4474 if ( shiftCount <= 0 ) shiftCount = 1; 4475 shift64RightJamming( aSig, shiftCount, &aSig ); 4476 return roundAndPackInt32(aSign, aSig, status); 4477 4478 } 4479 4480 /*---------------------------------------------------------------------------- 4481 | Returns the result of converting the extended double-precision floating- 4482 | point value `a' to the 32-bit two's complement integer format. The 4483 | conversion is performed according to the IEC/IEEE Standard for Binary 4484 | Floating-Point Arithmetic, except that the conversion is always rounded 4485 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4486 | Otherwise, if the conversion overflows, the largest integer with the same 4487 | sign as `a' is returned. 4488 *----------------------------------------------------------------------------*/ 4489 4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4491 { 4492 flag aSign; 4493 int32_t aExp, shiftCount; 4494 uint64_t aSig, savedASig; 4495 int32_t z; 4496 4497 if (floatx80_invalid_encoding(a)) { 4498 float_raise(float_flag_invalid, status); 4499 return 1 << 31; 4500 } 4501 aSig = extractFloatx80Frac( a ); 4502 aExp = extractFloatx80Exp( a ); 4503 aSign = extractFloatx80Sign( a ); 4504 if ( 0x401E < aExp ) { 4505 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4506 goto invalid; 4507 } 4508 else if ( aExp < 0x3FFF ) { 4509 if (aExp || aSig) { 4510 status->float_exception_flags |= float_flag_inexact; 4511 } 4512 return 0; 4513 } 4514 shiftCount = 0x403E - aExp; 4515 savedASig = aSig; 4516 aSig >>= shiftCount; 4517 z = aSig; 4518 if ( aSign ) z = - z; 4519 if ( ( z < 0 ) ^ aSign ) { 4520 invalid: 4521 float_raise(float_flag_invalid, status); 4522 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4523 } 4524 if ( ( aSig<<shiftCount ) != savedASig ) { 4525 status->float_exception_flags |= float_flag_inexact; 4526 } 4527 return z; 4528 4529 } 4530 4531 /*---------------------------------------------------------------------------- 4532 | Returns the result of converting the extended double-precision floating- 4533 | point value `a' to the 64-bit two's complement integer format. The 4534 | conversion is performed according to the IEC/IEEE Standard for Binary 4535 | Floating-Point Arithmetic---which means in particular that the conversion 4536 | is rounded according to the current rounding mode. If `a' is a NaN, 4537 | the largest positive integer is returned. Otherwise, if the conversion 4538 | overflows, the largest integer with the same sign as `a' is returned. 4539 *----------------------------------------------------------------------------*/ 4540 4541 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4542 { 4543 flag aSign; 4544 int32_t aExp, shiftCount; 4545 uint64_t aSig, aSigExtra; 4546 4547 if (floatx80_invalid_encoding(a)) { 4548 float_raise(float_flag_invalid, status); 4549 return 1ULL << 63; 4550 } 4551 aSig = extractFloatx80Frac( a ); 4552 aExp = extractFloatx80Exp( a ); 4553 aSign = extractFloatx80Sign( a ); 4554 shiftCount = 0x403E - aExp; 4555 if ( shiftCount <= 0 ) { 4556 if ( shiftCount ) { 4557 float_raise(float_flag_invalid, status); 4558 if (!aSign || floatx80_is_any_nan(a)) { 4559 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4560 } 4561 return (int64_t) LIT64( 0x8000000000000000 ); 4562 } 4563 aSigExtra = 0; 4564 } 4565 else { 4566 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4567 } 4568 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4569 4570 } 4571 4572 /*---------------------------------------------------------------------------- 4573 | Returns the result of converting the extended double-precision floating- 4574 | point value `a' to the 64-bit two's complement integer format. The 4575 | conversion is performed according to the IEC/IEEE Standard for Binary 4576 | Floating-Point Arithmetic, except that the conversion is always rounded 4577 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4578 | Otherwise, if the conversion overflows, the largest integer with the same 4579 | sign as `a' is returned. 4580 *----------------------------------------------------------------------------*/ 4581 4582 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4583 { 4584 flag aSign; 4585 int32_t aExp, shiftCount; 4586 uint64_t aSig; 4587 int64_t z; 4588 4589 if (floatx80_invalid_encoding(a)) { 4590 float_raise(float_flag_invalid, status); 4591 return 1ULL << 63; 4592 } 4593 aSig = extractFloatx80Frac( a ); 4594 aExp = extractFloatx80Exp( a ); 4595 aSign = extractFloatx80Sign( a ); 4596 shiftCount = aExp - 0x403E; 4597 if ( 0 <= shiftCount ) { 4598 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4599 if ( ( a.high != 0xC03E ) || aSig ) { 4600 float_raise(float_flag_invalid, status); 4601 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4602 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4603 } 4604 } 4605 return (int64_t) LIT64( 0x8000000000000000 ); 4606 } 4607 else if ( aExp < 0x3FFF ) { 4608 if (aExp | aSig) { 4609 status->float_exception_flags |= float_flag_inexact; 4610 } 4611 return 0; 4612 } 4613 z = aSig>>( - shiftCount ); 4614 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4615 status->float_exception_flags |= float_flag_inexact; 4616 } 4617 if ( aSign ) z = - z; 4618 return z; 4619 4620 } 4621 4622 /*---------------------------------------------------------------------------- 4623 | Returns the result of converting the extended double-precision floating- 4624 | point value `a' to the single-precision floating-point format. The 4625 | conversion is performed according to the IEC/IEEE Standard for Binary 4626 | Floating-Point Arithmetic. 4627 *----------------------------------------------------------------------------*/ 4628 4629 float32 floatx80_to_float32(floatx80 a, float_status *status) 4630 { 4631 flag aSign; 4632 int32_t aExp; 4633 uint64_t aSig; 4634 4635 if (floatx80_invalid_encoding(a)) { 4636 float_raise(float_flag_invalid, status); 4637 return float32_default_nan(status); 4638 } 4639 aSig = extractFloatx80Frac( a ); 4640 aExp = extractFloatx80Exp( a ); 4641 aSign = extractFloatx80Sign( a ); 4642 if ( aExp == 0x7FFF ) { 4643 if ( (uint64_t) ( aSig<<1 ) ) { 4644 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4645 } 4646 return packFloat32( aSign, 0xFF, 0 ); 4647 } 4648 shift64RightJamming( aSig, 33, &aSig ); 4649 if ( aExp || aSig ) aExp -= 0x3F81; 4650 return roundAndPackFloat32(aSign, aExp, aSig, status); 4651 4652 } 4653 4654 /*---------------------------------------------------------------------------- 4655 | Returns the result of converting the extended double-precision floating- 4656 | point value `a' to the double-precision floating-point format. The 4657 | conversion is performed according to the IEC/IEEE Standard for Binary 4658 | Floating-Point Arithmetic. 4659 *----------------------------------------------------------------------------*/ 4660 4661 float64 floatx80_to_float64(floatx80 a, float_status *status) 4662 { 4663 flag aSign; 4664 int32_t aExp; 4665 uint64_t aSig, zSig; 4666 4667 if (floatx80_invalid_encoding(a)) { 4668 float_raise(float_flag_invalid, status); 4669 return float64_default_nan(status); 4670 } 4671 aSig = extractFloatx80Frac( a ); 4672 aExp = extractFloatx80Exp( a ); 4673 aSign = extractFloatx80Sign( a ); 4674 if ( aExp == 0x7FFF ) { 4675 if ( (uint64_t) ( aSig<<1 ) ) { 4676 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4677 } 4678 return packFloat64( aSign, 0x7FF, 0 ); 4679 } 4680 shift64RightJamming( aSig, 1, &zSig ); 4681 if ( aExp || aSig ) aExp -= 0x3C01; 4682 return roundAndPackFloat64(aSign, aExp, zSig, status); 4683 4684 } 4685 4686 /*---------------------------------------------------------------------------- 4687 | Returns the result of converting the extended double-precision floating- 4688 | point value `a' to the quadruple-precision floating-point format. The 4689 | conversion is performed according to the IEC/IEEE Standard for Binary 4690 | Floating-Point Arithmetic. 4691 *----------------------------------------------------------------------------*/ 4692 4693 float128 floatx80_to_float128(floatx80 a, float_status *status) 4694 { 4695 flag aSign; 4696 int aExp; 4697 uint64_t aSig, zSig0, zSig1; 4698 4699 if (floatx80_invalid_encoding(a)) { 4700 float_raise(float_flag_invalid, status); 4701 return float128_default_nan(status); 4702 } 4703 aSig = extractFloatx80Frac( a ); 4704 aExp = extractFloatx80Exp( a ); 4705 aSign = extractFloatx80Sign( a ); 4706 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4707 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4708 } 4709 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4710 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4711 4712 } 4713 4714 /*---------------------------------------------------------------------------- 4715 | Rounds the extended double-precision floating-point value `a' 4716 | to the precision provided by floatx80_rounding_precision and returns the 4717 | result as an extended double-precision floating-point value. 4718 | The operation is performed according to the IEC/IEEE Standard for Binary 4719 | Floating-Point Arithmetic. 4720 *----------------------------------------------------------------------------*/ 4721 4722 floatx80 floatx80_round(floatx80 a, float_status *status) 4723 { 4724 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4725 extractFloatx80Sign(a), 4726 extractFloatx80Exp(a), 4727 extractFloatx80Frac(a), 0, status); 4728 } 4729 4730 /*---------------------------------------------------------------------------- 4731 | Rounds the extended double-precision floating-point value `a' to an integer, 4732 | and returns the result as an extended quadruple-precision floating-point 4733 | value. The operation is performed according to the IEC/IEEE Standard for 4734 | Binary Floating-Point Arithmetic. 4735 *----------------------------------------------------------------------------*/ 4736 4737 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4738 { 4739 flag aSign; 4740 int32_t aExp; 4741 uint64_t lastBitMask, roundBitsMask; 4742 floatx80 z; 4743 4744 if (floatx80_invalid_encoding(a)) { 4745 float_raise(float_flag_invalid, status); 4746 return floatx80_default_nan(status); 4747 } 4748 aExp = extractFloatx80Exp( a ); 4749 if ( 0x403E <= aExp ) { 4750 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4751 return propagateFloatx80NaN(a, a, status); 4752 } 4753 return a; 4754 } 4755 if ( aExp < 0x3FFF ) { 4756 if ( ( aExp == 0 ) 4757 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4758 return a; 4759 } 4760 status->float_exception_flags |= float_flag_inexact; 4761 aSign = extractFloatx80Sign( a ); 4762 switch (status->float_rounding_mode) { 4763 case float_round_nearest_even: 4764 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4765 ) { 4766 return 4767 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4768 } 4769 break; 4770 case float_round_ties_away: 4771 if (aExp == 0x3FFE) { 4772 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4773 } 4774 break; 4775 case float_round_down: 4776 return 4777 aSign ? 4778 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4779 : packFloatx80( 0, 0, 0 ); 4780 case float_round_up: 4781 return 4782 aSign ? packFloatx80( 1, 0, 0 ) 4783 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4784 } 4785 return packFloatx80( aSign, 0, 0 ); 4786 } 4787 lastBitMask = 1; 4788 lastBitMask <<= 0x403E - aExp; 4789 roundBitsMask = lastBitMask - 1; 4790 z = a; 4791 switch (status->float_rounding_mode) { 4792 case float_round_nearest_even: 4793 z.low += lastBitMask>>1; 4794 if ((z.low & roundBitsMask) == 0) { 4795 z.low &= ~lastBitMask; 4796 } 4797 break; 4798 case float_round_ties_away: 4799 z.low += lastBitMask >> 1; 4800 break; 4801 case float_round_to_zero: 4802 break; 4803 case float_round_up: 4804 if (!extractFloatx80Sign(z)) { 4805 z.low += roundBitsMask; 4806 } 4807 break; 4808 case float_round_down: 4809 if (extractFloatx80Sign(z)) { 4810 z.low += roundBitsMask; 4811 } 4812 break; 4813 default: 4814 abort(); 4815 } 4816 z.low &= ~ roundBitsMask; 4817 if ( z.low == 0 ) { 4818 ++z.high; 4819 z.low = LIT64( 0x8000000000000000 ); 4820 } 4821 if (z.low != a.low) { 4822 status->float_exception_flags |= float_flag_inexact; 4823 } 4824 return z; 4825 4826 } 4827 4828 /*---------------------------------------------------------------------------- 4829 | Returns the result of adding the absolute values of the extended double- 4830 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4831 | negated before being returned. `zSign' is ignored if the result is a NaN. 4832 | The addition is performed according to the IEC/IEEE Standard for Binary 4833 | Floating-Point Arithmetic. 4834 *----------------------------------------------------------------------------*/ 4835 4836 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4837 float_status *status) 4838 { 4839 int32_t aExp, bExp, zExp; 4840 uint64_t aSig, bSig, zSig0, zSig1; 4841 int32_t expDiff; 4842 4843 aSig = extractFloatx80Frac( a ); 4844 aExp = extractFloatx80Exp( a ); 4845 bSig = extractFloatx80Frac( b ); 4846 bExp = extractFloatx80Exp( b ); 4847 expDiff = aExp - bExp; 4848 if ( 0 < expDiff ) { 4849 if ( aExp == 0x7FFF ) { 4850 if ((uint64_t)(aSig << 1)) { 4851 return propagateFloatx80NaN(a, b, status); 4852 } 4853 return a; 4854 } 4855 if ( bExp == 0 ) --expDiff; 4856 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4857 zExp = aExp; 4858 } 4859 else if ( expDiff < 0 ) { 4860 if ( bExp == 0x7FFF ) { 4861 if ((uint64_t)(bSig << 1)) { 4862 return propagateFloatx80NaN(a, b, status); 4863 } 4864 return packFloatx80(zSign, 4865 floatx80_infinity_high, 4866 floatx80_infinity_low); 4867 } 4868 if ( aExp == 0 ) ++expDiff; 4869 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4870 zExp = bExp; 4871 } 4872 else { 4873 if ( aExp == 0x7FFF ) { 4874 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4875 return propagateFloatx80NaN(a, b, status); 4876 } 4877 return a; 4878 } 4879 zSig1 = 0; 4880 zSig0 = aSig + bSig; 4881 if ( aExp == 0 ) { 4882 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4883 goto roundAndPack; 4884 } 4885 zExp = aExp; 4886 goto shiftRight1; 4887 } 4888 zSig0 = aSig + bSig; 4889 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4890 shiftRight1: 4891 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4892 zSig0 |= LIT64( 0x8000000000000000 ); 4893 ++zExp; 4894 roundAndPack: 4895 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4896 zSign, zExp, zSig0, zSig1, status); 4897 } 4898 4899 /*---------------------------------------------------------------------------- 4900 | Returns the result of subtracting the absolute values of the extended 4901 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4902 | difference is negated before being returned. `zSign' is ignored if the 4903 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4904 | Standard for Binary Floating-Point Arithmetic. 4905 *----------------------------------------------------------------------------*/ 4906 4907 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4908 float_status *status) 4909 { 4910 int32_t aExp, bExp, zExp; 4911 uint64_t aSig, bSig, zSig0, zSig1; 4912 int32_t expDiff; 4913 4914 aSig = extractFloatx80Frac( a ); 4915 aExp = extractFloatx80Exp( a ); 4916 bSig = extractFloatx80Frac( b ); 4917 bExp = extractFloatx80Exp( b ); 4918 expDiff = aExp - bExp; 4919 if ( 0 < expDiff ) goto aExpBigger; 4920 if ( expDiff < 0 ) goto bExpBigger; 4921 if ( aExp == 0x7FFF ) { 4922 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4923 return propagateFloatx80NaN(a, b, status); 4924 } 4925 float_raise(float_flag_invalid, status); 4926 return floatx80_default_nan(status); 4927 } 4928 if ( aExp == 0 ) { 4929 aExp = 1; 4930 bExp = 1; 4931 } 4932 zSig1 = 0; 4933 if ( bSig < aSig ) goto aBigger; 4934 if ( aSig < bSig ) goto bBigger; 4935 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4936 bExpBigger: 4937 if ( bExp == 0x7FFF ) { 4938 if ((uint64_t)(bSig << 1)) { 4939 return propagateFloatx80NaN(a, b, status); 4940 } 4941 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 4942 floatx80_infinity_low); 4943 } 4944 if ( aExp == 0 ) ++expDiff; 4945 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4946 bBigger: 4947 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4948 zExp = bExp; 4949 zSign ^= 1; 4950 goto normalizeRoundAndPack; 4951 aExpBigger: 4952 if ( aExp == 0x7FFF ) { 4953 if ((uint64_t)(aSig << 1)) { 4954 return propagateFloatx80NaN(a, b, status); 4955 } 4956 return a; 4957 } 4958 if ( bExp == 0 ) --expDiff; 4959 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4960 aBigger: 4961 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4962 zExp = aExp; 4963 normalizeRoundAndPack: 4964 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4965 zSign, zExp, zSig0, zSig1, status); 4966 } 4967 4968 /*---------------------------------------------------------------------------- 4969 | Returns the result of adding the extended double-precision floating-point 4970 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4971 | Standard for Binary Floating-Point Arithmetic. 4972 *----------------------------------------------------------------------------*/ 4973 4974 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4975 { 4976 flag aSign, bSign; 4977 4978 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4979 float_raise(float_flag_invalid, status); 4980 return floatx80_default_nan(status); 4981 } 4982 aSign = extractFloatx80Sign( a ); 4983 bSign = extractFloatx80Sign( b ); 4984 if ( aSign == bSign ) { 4985 return addFloatx80Sigs(a, b, aSign, status); 4986 } 4987 else { 4988 return subFloatx80Sigs(a, b, aSign, status); 4989 } 4990 4991 } 4992 4993 /*---------------------------------------------------------------------------- 4994 | Returns the result of subtracting the extended double-precision floating- 4995 | point values `a' and `b'. The operation is performed according to the 4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4997 *----------------------------------------------------------------------------*/ 4998 4999 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5000 { 5001 flag aSign, bSign; 5002 5003 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5004 float_raise(float_flag_invalid, status); 5005 return floatx80_default_nan(status); 5006 } 5007 aSign = extractFloatx80Sign( a ); 5008 bSign = extractFloatx80Sign( b ); 5009 if ( aSign == bSign ) { 5010 return subFloatx80Sigs(a, b, aSign, status); 5011 } 5012 else { 5013 return addFloatx80Sigs(a, b, aSign, status); 5014 } 5015 5016 } 5017 5018 /*---------------------------------------------------------------------------- 5019 | Returns the result of multiplying the extended double-precision floating- 5020 | point values `a' and `b'. The operation is performed according to the 5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5022 *----------------------------------------------------------------------------*/ 5023 5024 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5025 { 5026 flag aSign, bSign, zSign; 5027 int32_t aExp, bExp, zExp; 5028 uint64_t aSig, bSig, zSig0, zSig1; 5029 5030 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5031 float_raise(float_flag_invalid, status); 5032 return floatx80_default_nan(status); 5033 } 5034 aSig = extractFloatx80Frac( a ); 5035 aExp = extractFloatx80Exp( a ); 5036 aSign = extractFloatx80Sign( a ); 5037 bSig = extractFloatx80Frac( b ); 5038 bExp = extractFloatx80Exp( b ); 5039 bSign = extractFloatx80Sign( b ); 5040 zSign = aSign ^ bSign; 5041 if ( aExp == 0x7FFF ) { 5042 if ( (uint64_t) ( aSig<<1 ) 5043 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5044 return propagateFloatx80NaN(a, b, status); 5045 } 5046 if ( ( bExp | bSig ) == 0 ) goto invalid; 5047 return packFloatx80(zSign, floatx80_infinity_high, 5048 floatx80_infinity_low); 5049 } 5050 if ( bExp == 0x7FFF ) { 5051 if ((uint64_t)(bSig << 1)) { 5052 return propagateFloatx80NaN(a, b, status); 5053 } 5054 if ( ( aExp | aSig ) == 0 ) { 5055 invalid: 5056 float_raise(float_flag_invalid, status); 5057 return floatx80_default_nan(status); 5058 } 5059 return packFloatx80(zSign, floatx80_infinity_high, 5060 floatx80_infinity_low); 5061 } 5062 if ( aExp == 0 ) { 5063 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5064 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5065 } 5066 if ( bExp == 0 ) { 5067 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5068 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5069 } 5070 zExp = aExp + bExp - 0x3FFE; 5071 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5072 if ( 0 < (int64_t) zSig0 ) { 5073 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5074 --zExp; 5075 } 5076 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5077 zSign, zExp, zSig0, zSig1, status); 5078 } 5079 5080 /*---------------------------------------------------------------------------- 5081 | Returns the result of dividing the extended double-precision floating-point 5082 | value `a' by the corresponding value `b'. The operation is performed 5083 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5084 *----------------------------------------------------------------------------*/ 5085 5086 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5087 { 5088 flag aSign, bSign, zSign; 5089 int32_t aExp, bExp, zExp; 5090 uint64_t aSig, bSig, zSig0, zSig1; 5091 uint64_t rem0, rem1, rem2, term0, term1, term2; 5092 5093 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5094 float_raise(float_flag_invalid, status); 5095 return floatx80_default_nan(status); 5096 } 5097 aSig = extractFloatx80Frac( a ); 5098 aExp = extractFloatx80Exp( a ); 5099 aSign = extractFloatx80Sign( a ); 5100 bSig = extractFloatx80Frac( b ); 5101 bExp = extractFloatx80Exp( b ); 5102 bSign = extractFloatx80Sign( b ); 5103 zSign = aSign ^ bSign; 5104 if ( aExp == 0x7FFF ) { 5105 if ((uint64_t)(aSig << 1)) { 5106 return propagateFloatx80NaN(a, b, status); 5107 } 5108 if ( bExp == 0x7FFF ) { 5109 if ((uint64_t)(bSig << 1)) { 5110 return propagateFloatx80NaN(a, b, status); 5111 } 5112 goto invalid; 5113 } 5114 return packFloatx80(zSign, floatx80_infinity_high, 5115 floatx80_infinity_low); 5116 } 5117 if ( bExp == 0x7FFF ) { 5118 if ((uint64_t)(bSig << 1)) { 5119 return propagateFloatx80NaN(a, b, status); 5120 } 5121 return packFloatx80( zSign, 0, 0 ); 5122 } 5123 if ( bExp == 0 ) { 5124 if ( bSig == 0 ) { 5125 if ( ( aExp | aSig ) == 0 ) { 5126 invalid: 5127 float_raise(float_flag_invalid, status); 5128 return floatx80_default_nan(status); 5129 } 5130 float_raise(float_flag_divbyzero, status); 5131 return packFloatx80(zSign, floatx80_infinity_high, 5132 floatx80_infinity_low); 5133 } 5134 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5135 } 5136 if ( aExp == 0 ) { 5137 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5138 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5139 } 5140 zExp = aExp - bExp + 0x3FFE; 5141 rem1 = 0; 5142 if ( bSig <= aSig ) { 5143 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5144 ++zExp; 5145 } 5146 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5147 mul64To128( bSig, zSig0, &term0, &term1 ); 5148 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5149 while ( (int64_t) rem0 < 0 ) { 5150 --zSig0; 5151 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5152 } 5153 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5154 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5155 mul64To128( bSig, zSig1, &term1, &term2 ); 5156 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5157 while ( (int64_t) rem1 < 0 ) { 5158 --zSig1; 5159 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5160 } 5161 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5162 } 5163 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5164 zSign, zExp, zSig0, zSig1, status); 5165 } 5166 5167 /*---------------------------------------------------------------------------- 5168 | Returns the remainder of the extended double-precision floating-point value 5169 | `a' with respect to the corresponding value `b'. The operation is performed 5170 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5171 *----------------------------------------------------------------------------*/ 5172 5173 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5174 { 5175 flag aSign, zSign; 5176 int32_t aExp, bExp, expDiff; 5177 uint64_t aSig0, aSig1, bSig; 5178 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5179 5180 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5181 float_raise(float_flag_invalid, status); 5182 return floatx80_default_nan(status); 5183 } 5184 aSig0 = extractFloatx80Frac( a ); 5185 aExp = extractFloatx80Exp( a ); 5186 aSign = extractFloatx80Sign( a ); 5187 bSig = extractFloatx80Frac( b ); 5188 bExp = extractFloatx80Exp( b ); 5189 if ( aExp == 0x7FFF ) { 5190 if ( (uint64_t) ( aSig0<<1 ) 5191 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5192 return propagateFloatx80NaN(a, b, status); 5193 } 5194 goto invalid; 5195 } 5196 if ( bExp == 0x7FFF ) { 5197 if ((uint64_t)(bSig << 1)) { 5198 return propagateFloatx80NaN(a, b, status); 5199 } 5200 return a; 5201 } 5202 if ( bExp == 0 ) { 5203 if ( bSig == 0 ) { 5204 invalid: 5205 float_raise(float_flag_invalid, status); 5206 return floatx80_default_nan(status); 5207 } 5208 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5209 } 5210 if ( aExp == 0 ) { 5211 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5212 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5213 } 5214 bSig |= LIT64( 0x8000000000000000 ); 5215 zSign = aSign; 5216 expDiff = aExp - bExp; 5217 aSig1 = 0; 5218 if ( expDiff < 0 ) { 5219 if ( expDiff < -1 ) return a; 5220 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5221 expDiff = 0; 5222 } 5223 q = ( bSig <= aSig0 ); 5224 if ( q ) aSig0 -= bSig; 5225 expDiff -= 64; 5226 while ( 0 < expDiff ) { 5227 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5228 q = ( 2 < q ) ? q - 2 : 0; 5229 mul64To128( bSig, q, &term0, &term1 ); 5230 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5231 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5232 expDiff -= 62; 5233 } 5234 expDiff += 64; 5235 if ( 0 < expDiff ) { 5236 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5237 q = ( 2 < q ) ? q - 2 : 0; 5238 q >>= 64 - expDiff; 5239 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5240 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5241 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5242 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5243 ++q; 5244 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5245 } 5246 } 5247 else { 5248 term1 = 0; 5249 term0 = bSig; 5250 } 5251 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5252 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5253 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5254 && ( q & 1 ) ) 5255 ) { 5256 aSig0 = alternateASig0; 5257 aSig1 = alternateASig1; 5258 zSign = ! zSign; 5259 } 5260 return 5261 normalizeRoundAndPackFloatx80( 5262 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5263 5264 } 5265 5266 /*---------------------------------------------------------------------------- 5267 | Returns the square root of the extended double-precision floating-point 5268 | value `a'. The operation is performed according to the IEC/IEEE Standard 5269 | for Binary Floating-Point Arithmetic. 5270 *----------------------------------------------------------------------------*/ 5271 5272 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5273 { 5274 flag aSign; 5275 int32_t aExp, zExp; 5276 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5277 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5278 5279 if (floatx80_invalid_encoding(a)) { 5280 float_raise(float_flag_invalid, status); 5281 return floatx80_default_nan(status); 5282 } 5283 aSig0 = extractFloatx80Frac( a ); 5284 aExp = extractFloatx80Exp( a ); 5285 aSign = extractFloatx80Sign( a ); 5286 if ( aExp == 0x7FFF ) { 5287 if ((uint64_t)(aSig0 << 1)) { 5288 return propagateFloatx80NaN(a, a, status); 5289 } 5290 if ( ! aSign ) return a; 5291 goto invalid; 5292 } 5293 if ( aSign ) { 5294 if ( ( aExp | aSig0 ) == 0 ) return a; 5295 invalid: 5296 float_raise(float_flag_invalid, status); 5297 return floatx80_default_nan(status); 5298 } 5299 if ( aExp == 0 ) { 5300 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5301 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5302 } 5303 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5304 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5305 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5306 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5307 doubleZSig0 = zSig0<<1; 5308 mul64To128( zSig0, zSig0, &term0, &term1 ); 5309 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5310 while ( (int64_t) rem0 < 0 ) { 5311 --zSig0; 5312 doubleZSig0 -= 2; 5313 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5314 } 5315 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5316 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5317 if ( zSig1 == 0 ) zSig1 = 1; 5318 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5319 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5320 mul64To128( zSig1, zSig1, &term2, &term3 ); 5321 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5322 while ( (int64_t) rem1 < 0 ) { 5323 --zSig1; 5324 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5325 term3 |= 1; 5326 term2 |= doubleZSig0; 5327 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5328 } 5329 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5330 } 5331 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5332 zSig0 |= doubleZSig0; 5333 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5334 0, zExp, zSig0, zSig1, status); 5335 } 5336 5337 /*---------------------------------------------------------------------------- 5338 | Returns 1 if the extended double-precision floating-point value `a' is equal 5339 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5340 | raised if either operand is a NaN. Otherwise, the comparison is performed 5341 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5342 *----------------------------------------------------------------------------*/ 5343 5344 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5345 { 5346 5347 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5348 || (extractFloatx80Exp(a) == 0x7FFF 5349 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5350 || (extractFloatx80Exp(b) == 0x7FFF 5351 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5352 ) { 5353 float_raise(float_flag_invalid, status); 5354 return 0; 5355 } 5356 return 5357 ( a.low == b.low ) 5358 && ( ( a.high == b.high ) 5359 || ( ( a.low == 0 ) 5360 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5361 ); 5362 5363 } 5364 5365 /*---------------------------------------------------------------------------- 5366 | Returns 1 if the extended double-precision floating-point value `a' is 5367 | less than or equal to the corresponding value `b', and 0 otherwise. The 5368 | invalid exception is raised if either operand is a NaN. The comparison is 5369 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5370 | Arithmetic. 5371 *----------------------------------------------------------------------------*/ 5372 5373 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5374 { 5375 flag aSign, bSign; 5376 5377 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5378 || (extractFloatx80Exp(a) == 0x7FFF 5379 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5380 || (extractFloatx80Exp(b) == 0x7FFF 5381 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5382 ) { 5383 float_raise(float_flag_invalid, status); 5384 return 0; 5385 } 5386 aSign = extractFloatx80Sign( a ); 5387 bSign = extractFloatx80Sign( b ); 5388 if ( aSign != bSign ) { 5389 return 5390 aSign 5391 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5392 == 0 ); 5393 } 5394 return 5395 aSign ? le128( b.high, b.low, a.high, a.low ) 5396 : le128( a.high, a.low, b.high, b.low ); 5397 5398 } 5399 5400 /*---------------------------------------------------------------------------- 5401 | Returns 1 if the extended double-precision floating-point value `a' is 5402 | less than the corresponding value `b', and 0 otherwise. The invalid 5403 | exception is raised if either operand is a NaN. The comparison is performed 5404 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5405 *----------------------------------------------------------------------------*/ 5406 5407 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5408 { 5409 flag aSign, bSign; 5410 5411 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5412 || (extractFloatx80Exp(a) == 0x7FFF 5413 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5414 || (extractFloatx80Exp(b) == 0x7FFF 5415 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5416 ) { 5417 float_raise(float_flag_invalid, status); 5418 return 0; 5419 } 5420 aSign = extractFloatx80Sign( a ); 5421 bSign = extractFloatx80Sign( b ); 5422 if ( aSign != bSign ) { 5423 return 5424 aSign 5425 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5426 != 0 ); 5427 } 5428 return 5429 aSign ? lt128( b.high, b.low, a.high, a.low ) 5430 : lt128( a.high, a.low, b.high, b.low ); 5431 5432 } 5433 5434 /*---------------------------------------------------------------------------- 5435 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5436 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5437 | either operand is a NaN. The comparison is performed according to the 5438 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5439 *----------------------------------------------------------------------------*/ 5440 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5441 { 5442 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5443 || (extractFloatx80Exp(a) == 0x7FFF 5444 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5445 || (extractFloatx80Exp(b) == 0x7FFF 5446 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5447 ) { 5448 float_raise(float_flag_invalid, status); 5449 return 1; 5450 } 5451 return 0; 5452 } 5453 5454 /*---------------------------------------------------------------------------- 5455 | Returns 1 if the extended double-precision floating-point value `a' is 5456 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5457 | cause an exception. The comparison is performed according to the IEC/IEEE 5458 | Standard for Binary Floating-Point Arithmetic. 5459 *----------------------------------------------------------------------------*/ 5460 5461 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5462 { 5463 5464 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5465 float_raise(float_flag_invalid, status); 5466 return 0; 5467 } 5468 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5469 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5470 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5471 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5472 ) { 5473 if (floatx80_is_signaling_nan(a, status) 5474 || floatx80_is_signaling_nan(b, status)) { 5475 float_raise(float_flag_invalid, status); 5476 } 5477 return 0; 5478 } 5479 return 5480 ( a.low == b.low ) 5481 && ( ( a.high == b.high ) 5482 || ( ( a.low == 0 ) 5483 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5484 ); 5485 5486 } 5487 5488 /*---------------------------------------------------------------------------- 5489 | Returns 1 if the extended double-precision floating-point value `a' is less 5490 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5491 | do not cause an exception. Otherwise, the comparison is performed according 5492 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5493 *----------------------------------------------------------------------------*/ 5494 5495 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5496 { 5497 flag aSign, bSign; 5498 5499 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5500 float_raise(float_flag_invalid, status); 5501 return 0; 5502 } 5503 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5504 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5505 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5506 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5507 ) { 5508 if (floatx80_is_signaling_nan(a, status) 5509 || floatx80_is_signaling_nan(b, status)) { 5510 float_raise(float_flag_invalid, status); 5511 } 5512 return 0; 5513 } 5514 aSign = extractFloatx80Sign( a ); 5515 bSign = extractFloatx80Sign( b ); 5516 if ( aSign != bSign ) { 5517 return 5518 aSign 5519 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5520 == 0 ); 5521 } 5522 return 5523 aSign ? le128( b.high, b.low, a.high, a.low ) 5524 : le128( a.high, a.low, b.high, b.low ); 5525 5526 } 5527 5528 /*---------------------------------------------------------------------------- 5529 | Returns 1 if the extended double-precision floating-point value `a' is less 5530 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5531 | an exception. Otherwise, the comparison is performed according to the 5532 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5533 *----------------------------------------------------------------------------*/ 5534 5535 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5536 { 5537 flag aSign, bSign; 5538 5539 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5540 float_raise(float_flag_invalid, status); 5541 return 0; 5542 } 5543 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5544 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5545 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5546 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5547 ) { 5548 if (floatx80_is_signaling_nan(a, status) 5549 || floatx80_is_signaling_nan(b, status)) { 5550 float_raise(float_flag_invalid, status); 5551 } 5552 return 0; 5553 } 5554 aSign = extractFloatx80Sign( a ); 5555 bSign = extractFloatx80Sign( b ); 5556 if ( aSign != bSign ) { 5557 return 5558 aSign 5559 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5560 != 0 ); 5561 } 5562 return 5563 aSign ? lt128( b.high, b.low, a.high, a.low ) 5564 : lt128( a.high, a.low, b.high, b.low ); 5565 5566 } 5567 5568 /*---------------------------------------------------------------------------- 5569 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5570 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5571 | The comparison is performed according to the IEC/IEEE Standard for Binary 5572 | Floating-Point Arithmetic. 5573 *----------------------------------------------------------------------------*/ 5574 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5575 { 5576 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5577 float_raise(float_flag_invalid, status); 5578 return 1; 5579 } 5580 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5581 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5582 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5583 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5584 ) { 5585 if (floatx80_is_signaling_nan(a, status) 5586 || floatx80_is_signaling_nan(b, status)) { 5587 float_raise(float_flag_invalid, status); 5588 } 5589 return 1; 5590 } 5591 return 0; 5592 } 5593 5594 /*---------------------------------------------------------------------------- 5595 | Returns the result of converting the quadruple-precision floating-point 5596 | value `a' to the 32-bit two's complement integer format. The conversion 5597 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5598 | Arithmetic---which means in particular that the conversion is rounded 5599 | according to the current rounding mode. If `a' is a NaN, the largest 5600 | positive integer is returned. Otherwise, if the conversion overflows, the 5601 | largest integer with the same sign as `a' is returned. 5602 *----------------------------------------------------------------------------*/ 5603 5604 int32_t float128_to_int32(float128 a, float_status *status) 5605 { 5606 flag aSign; 5607 int32_t aExp, shiftCount; 5608 uint64_t aSig0, aSig1; 5609 5610 aSig1 = extractFloat128Frac1( a ); 5611 aSig0 = extractFloat128Frac0( a ); 5612 aExp = extractFloat128Exp( a ); 5613 aSign = extractFloat128Sign( a ); 5614 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5615 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5616 aSig0 |= ( aSig1 != 0 ); 5617 shiftCount = 0x4028 - aExp; 5618 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5619 return roundAndPackInt32(aSign, aSig0, status); 5620 5621 } 5622 5623 /*---------------------------------------------------------------------------- 5624 | Returns the result of converting the quadruple-precision floating-point 5625 | value `a' to the 32-bit two's complement integer format. The conversion 5626 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5627 | Arithmetic, except that the conversion is always rounded toward zero. If 5628 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5629 | conversion overflows, the largest integer with the same sign as `a' is 5630 | returned. 5631 *----------------------------------------------------------------------------*/ 5632 5633 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5634 { 5635 flag aSign; 5636 int32_t aExp, shiftCount; 5637 uint64_t aSig0, aSig1, savedASig; 5638 int32_t z; 5639 5640 aSig1 = extractFloat128Frac1( a ); 5641 aSig0 = extractFloat128Frac0( a ); 5642 aExp = extractFloat128Exp( a ); 5643 aSign = extractFloat128Sign( a ); 5644 aSig0 |= ( aSig1 != 0 ); 5645 if ( 0x401E < aExp ) { 5646 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5647 goto invalid; 5648 } 5649 else if ( aExp < 0x3FFF ) { 5650 if (aExp || aSig0) { 5651 status->float_exception_flags |= float_flag_inexact; 5652 } 5653 return 0; 5654 } 5655 aSig0 |= LIT64( 0x0001000000000000 ); 5656 shiftCount = 0x402F - aExp; 5657 savedASig = aSig0; 5658 aSig0 >>= shiftCount; 5659 z = aSig0; 5660 if ( aSign ) z = - z; 5661 if ( ( z < 0 ) ^ aSign ) { 5662 invalid: 5663 float_raise(float_flag_invalid, status); 5664 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5665 } 5666 if ( ( aSig0<<shiftCount ) != savedASig ) { 5667 status->float_exception_flags |= float_flag_inexact; 5668 } 5669 return z; 5670 5671 } 5672 5673 /*---------------------------------------------------------------------------- 5674 | Returns the result of converting the quadruple-precision floating-point 5675 | value `a' to the 64-bit two's complement integer format. The conversion 5676 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5677 | Arithmetic---which means in particular that the conversion is rounded 5678 | according to the current rounding mode. If `a' is a NaN, the largest 5679 | positive integer is returned. Otherwise, if the conversion overflows, the 5680 | largest integer with the same sign as `a' is returned. 5681 *----------------------------------------------------------------------------*/ 5682 5683 int64_t float128_to_int64(float128 a, float_status *status) 5684 { 5685 flag aSign; 5686 int32_t aExp, shiftCount; 5687 uint64_t aSig0, aSig1; 5688 5689 aSig1 = extractFloat128Frac1( a ); 5690 aSig0 = extractFloat128Frac0( a ); 5691 aExp = extractFloat128Exp( a ); 5692 aSign = extractFloat128Sign( a ); 5693 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5694 shiftCount = 0x402F - aExp; 5695 if ( shiftCount <= 0 ) { 5696 if ( 0x403E < aExp ) { 5697 float_raise(float_flag_invalid, status); 5698 if ( ! aSign 5699 || ( ( aExp == 0x7FFF ) 5700 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5701 ) 5702 ) { 5703 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5704 } 5705 return (int64_t) LIT64( 0x8000000000000000 ); 5706 } 5707 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5708 } 5709 else { 5710 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5711 } 5712 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5713 5714 } 5715 5716 /*---------------------------------------------------------------------------- 5717 | Returns the result of converting the quadruple-precision floating-point 5718 | value `a' to the 64-bit two's complement integer format. The conversion 5719 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5720 | Arithmetic, except that the conversion is always rounded toward zero. 5721 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5722 | the conversion overflows, the largest integer with the same sign as `a' is 5723 | returned. 5724 *----------------------------------------------------------------------------*/ 5725 5726 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5727 { 5728 flag aSign; 5729 int32_t aExp, shiftCount; 5730 uint64_t aSig0, aSig1; 5731 int64_t z; 5732 5733 aSig1 = extractFloat128Frac1( a ); 5734 aSig0 = extractFloat128Frac0( a ); 5735 aExp = extractFloat128Exp( a ); 5736 aSign = extractFloat128Sign( a ); 5737 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5738 shiftCount = aExp - 0x402F; 5739 if ( 0 < shiftCount ) { 5740 if ( 0x403E <= aExp ) { 5741 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5742 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5743 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5744 if (aSig1) { 5745 status->float_exception_flags |= float_flag_inexact; 5746 } 5747 } 5748 else { 5749 float_raise(float_flag_invalid, status); 5750 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5751 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5752 } 5753 } 5754 return (int64_t) LIT64( 0x8000000000000000 ); 5755 } 5756 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5757 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5758 status->float_exception_flags |= float_flag_inexact; 5759 } 5760 } 5761 else { 5762 if ( aExp < 0x3FFF ) { 5763 if ( aExp | aSig0 | aSig1 ) { 5764 status->float_exception_flags |= float_flag_inexact; 5765 } 5766 return 0; 5767 } 5768 z = aSig0>>( - shiftCount ); 5769 if ( aSig1 5770 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5771 status->float_exception_flags |= float_flag_inexact; 5772 } 5773 } 5774 if ( aSign ) z = - z; 5775 return z; 5776 5777 } 5778 5779 /*---------------------------------------------------------------------------- 5780 | Returns the result of converting the quadruple-precision floating-point value 5781 | `a' to the 64-bit unsigned integer format. The conversion is 5782 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5783 | Arithmetic---which means in particular that the conversion is rounded 5784 | according to the current rounding mode. If `a' is a NaN, the largest 5785 | positive integer is returned. If the conversion overflows, the 5786 | largest unsigned integer is returned. If 'a' is negative, the value is 5787 | rounded and zero is returned; negative values that do not round to zero 5788 | will raise the inexact exception. 5789 *----------------------------------------------------------------------------*/ 5790 5791 uint64_t float128_to_uint64(float128 a, float_status *status) 5792 { 5793 flag aSign; 5794 int aExp; 5795 int shiftCount; 5796 uint64_t aSig0, aSig1; 5797 5798 aSig0 = extractFloat128Frac0(a); 5799 aSig1 = extractFloat128Frac1(a); 5800 aExp = extractFloat128Exp(a); 5801 aSign = extractFloat128Sign(a); 5802 if (aSign && (aExp > 0x3FFE)) { 5803 float_raise(float_flag_invalid, status); 5804 if (float128_is_any_nan(a)) { 5805 return LIT64(0xFFFFFFFFFFFFFFFF); 5806 } else { 5807 return 0; 5808 } 5809 } 5810 if (aExp) { 5811 aSig0 |= LIT64(0x0001000000000000); 5812 } 5813 shiftCount = 0x402F - aExp; 5814 if (shiftCount <= 0) { 5815 if (0x403E < aExp) { 5816 float_raise(float_flag_invalid, status); 5817 return LIT64(0xFFFFFFFFFFFFFFFF); 5818 } 5819 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5820 } else { 5821 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5822 } 5823 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5824 } 5825 5826 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5827 { 5828 uint64_t v; 5829 signed char current_rounding_mode = status->float_rounding_mode; 5830 5831 set_float_rounding_mode(float_round_to_zero, status); 5832 v = float128_to_uint64(a, status); 5833 set_float_rounding_mode(current_rounding_mode, status); 5834 5835 return v; 5836 } 5837 5838 /*---------------------------------------------------------------------------- 5839 | Returns the result of converting the quadruple-precision floating-point 5840 | value `a' to the 32-bit unsigned integer format. The conversion 5841 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5842 | Arithmetic except that the conversion is always rounded toward zero. 5843 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5844 | if the conversion overflows, the largest unsigned integer is returned. 5845 | If 'a' is negative, the value is rounded and zero is returned; negative 5846 | values that do not round to zero will raise the inexact exception. 5847 *----------------------------------------------------------------------------*/ 5848 5849 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5850 { 5851 uint64_t v; 5852 uint32_t res; 5853 int old_exc_flags = get_float_exception_flags(status); 5854 5855 v = float128_to_uint64_round_to_zero(a, status); 5856 if (v > 0xffffffff) { 5857 res = 0xffffffff; 5858 } else { 5859 return v; 5860 } 5861 set_float_exception_flags(old_exc_flags, status); 5862 float_raise(float_flag_invalid, status); 5863 return res; 5864 } 5865 5866 /*---------------------------------------------------------------------------- 5867 | Returns the result of converting the quadruple-precision floating-point 5868 | value `a' to the single-precision floating-point format. The conversion 5869 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5870 | Arithmetic. 5871 *----------------------------------------------------------------------------*/ 5872 5873 float32 float128_to_float32(float128 a, float_status *status) 5874 { 5875 flag aSign; 5876 int32_t aExp; 5877 uint64_t aSig0, aSig1; 5878 uint32_t zSig; 5879 5880 aSig1 = extractFloat128Frac1( a ); 5881 aSig0 = extractFloat128Frac0( a ); 5882 aExp = extractFloat128Exp( a ); 5883 aSign = extractFloat128Sign( a ); 5884 if ( aExp == 0x7FFF ) { 5885 if ( aSig0 | aSig1 ) { 5886 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5887 } 5888 return packFloat32( aSign, 0xFF, 0 ); 5889 } 5890 aSig0 |= ( aSig1 != 0 ); 5891 shift64RightJamming( aSig0, 18, &aSig0 ); 5892 zSig = aSig0; 5893 if ( aExp || zSig ) { 5894 zSig |= 0x40000000; 5895 aExp -= 0x3F81; 5896 } 5897 return roundAndPackFloat32(aSign, aExp, zSig, status); 5898 5899 } 5900 5901 /*---------------------------------------------------------------------------- 5902 | Returns the result of converting the quadruple-precision floating-point 5903 | value `a' to the double-precision floating-point format. The conversion 5904 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5905 | Arithmetic. 5906 *----------------------------------------------------------------------------*/ 5907 5908 float64 float128_to_float64(float128 a, float_status *status) 5909 { 5910 flag aSign; 5911 int32_t aExp; 5912 uint64_t aSig0, aSig1; 5913 5914 aSig1 = extractFloat128Frac1( a ); 5915 aSig0 = extractFloat128Frac0( a ); 5916 aExp = extractFloat128Exp( a ); 5917 aSign = extractFloat128Sign( a ); 5918 if ( aExp == 0x7FFF ) { 5919 if ( aSig0 | aSig1 ) { 5920 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5921 } 5922 return packFloat64( aSign, 0x7FF, 0 ); 5923 } 5924 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5925 aSig0 |= ( aSig1 != 0 ); 5926 if ( aExp || aSig0 ) { 5927 aSig0 |= LIT64( 0x4000000000000000 ); 5928 aExp -= 0x3C01; 5929 } 5930 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5931 5932 } 5933 5934 /*---------------------------------------------------------------------------- 5935 | Returns the result of converting the quadruple-precision floating-point 5936 | value `a' to the extended double-precision floating-point format. The 5937 | conversion is performed according to the IEC/IEEE Standard for Binary 5938 | Floating-Point Arithmetic. 5939 *----------------------------------------------------------------------------*/ 5940 5941 floatx80 float128_to_floatx80(float128 a, float_status *status) 5942 { 5943 flag aSign; 5944 int32_t aExp; 5945 uint64_t aSig0, aSig1; 5946 5947 aSig1 = extractFloat128Frac1( a ); 5948 aSig0 = extractFloat128Frac0( a ); 5949 aExp = extractFloat128Exp( a ); 5950 aSign = extractFloat128Sign( a ); 5951 if ( aExp == 0x7FFF ) { 5952 if ( aSig0 | aSig1 ) { 5953 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5954 } 5955 return packFloatx80(aSign, floatx80_infinity_high, 5956 floatx80_infinity_low); 5957 } 5958 if ( aExp == 0 ) { 5959 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5960 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5961 } 5962 else { 5963 aSig0 |= LIT64( 0x0001000000000000 ); 5964 } 5965 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5966 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5967 5968 } 5969 5970 /*---------------------------------------------------------------------------- 5971 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5972 | returns the result as a quadruple-precision floating-point value. The 5973 | operation is performed according to the IEC/IEEE Standard for Binary 5974 | Floating-Point Arithmetic. 5975 *----------------------------------------------------------------------------*/ 5976 5977 float128 float128_round_to_int(float128 a, float_status *status) 5978 { 5979 flag aSign; 5980 int32_t aExp; 5981 uint64_t lastBitMask, roundBitsMask; 5982 float128 z; 5983 5984 aExp = extractFloat128Exp( a ); 5985 if ( 0x402F <= aExp ) { 5986 if ( 0x406F <= aExp ) { 5987 if ( ( aExp == 0x7FFF ) 5988 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5989 ) { 5990 return propagateFloat128NaN(a, a, status); 5991 } 5992 return a; 5993 } 5994 lastBitMask = 1; 5995 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5996 roundBitsMask = lastBitMask - 1; 5997 z = a; 5998 switch (status->float_rounding_mode) { 5999 case float_round_nearest_even: 6000 if ( lastBitMask ) { 6001 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6002 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6003 } 6004 else { 6005 if ( (int64_t) z.low < 0 ) { 6006 ++z.high; 6007 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6008 } 6009 } 6010 break; 6011 case float_round_ties_away: 6012 if (lastBitMask) { 6013 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6014 } else { 6015 if ((int64_t) z.low < 0) { 6016 ++z.high; 6017 } 6018 } 6019 break; 6020 case float_round_to_zero: 6021 break; 6022 case float_round_up: 6023 if (!extractFloat128Sign(z)) { 6024 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6025 } 6026 break; 6027 case float_round_down: 6028 if (extractFloat128Sign(z)) { 6029 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6030 } 6031 break; 6032 default: 6033 abort(); 6034 } 6035 z.low &= ~ roundBitsMask; 6036 } 6037 else { 6038 if ( aExp < 0x3FFF ) { 6039 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6040 status->float_exception_flags |= float_flag_inexact; 6041 aSign = extractFloat128Sign( a ); 6042 switch (status->float_rounding_mode) { 6043 case float_round_nearest_even: 6044 if ( ( aExp == 0x3FFE ) 6045 && ( extractFloat128Frac0( a ) 6046 | extractFloat128Frac1( a ) ) 6047 ) { 6048 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6049 } 6050 break; 6051 case float_round_ties_away: 6052 if (aExp == 0x3FFE) { 6053 return packFloat128(aSign, 0x3FFF, 0, 0); 6054 } 6055 break; 6056 case float_round_down: 6057 return 6058 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6059 : packFloat128( 0, 0, 0, 0 ); 6060 case float_round_up: 6061 return 6062 aSign ? packFloat128( 1, 0, 0, 0 ) 6063 : packFloat128( 0, 0x3FFF, 0, 0 ); 6064 } 6065 return packFloat128( aSign, 0, 0, 0 ); 6066 } 6067 lastBitMask = 1; 6068 lastBitMask <<= 0x402F - aExp; 6069 roundBitsMask = lastBitMask - 1; 6070 z.low = 0; 6071 z.high = a.high; 6072 switch (status->float_rounding_mode) { 6073 case float_round_nearest_even: 6074 z.high += lastBitMask>>1; 6075 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6076 z.high &= ~ lastBitMask; 6077 } 6078 break; 6079 case float_round_ties_away: 6080 z.high += lastBitMask>>1; 6081 break; 6082 case float_round_to_zero: 6083 break; 6084 case float_round_up: 6085 if (!extractFloat128Sign(z)) { 6086 z.high |= ( a.low != 0 ); 6087 z.high += roundBitsMask; 6088 } 6089 break; 6090 case float_round_down: 6091 if (extractFloat128Sign(z)) { 6092 z.high |= (a.low != 0); 6093 z.high += roundBitsMask; 6094 } 6095 break; 6096 default: 6097 abort(); 6098 } 6099 z.high &= ~ roundBitsMask; 6100 } 6101 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6102 status->float_exception_flags |= float_flag_inexact; 6103 } 6104 return z; 6105 6106 } 6107 6108 /*---------------------------------------------------------------------------- 6109 | Returns the result of adding the absolute values of the quadruple-precision 6110 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6111 | before being returned. `zSign' is ignored if the result is a NaN. 6112 | The addition is performed according to the IEC/IEEE Standard for Binary 6113 | Floating-Point Arithmetic. 6114 *----------------------------------------------------------------------------*/ 6115 6116 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6117 float_status *status) 6118 { 6119 int32_t aExp, bExp, zExp; 6120 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6121 int32_t expDiff; 6122 6123 aSig1 = extractFloat128Frac1( a ); 6124 aSig0 = extractFloat128Frac0( a ); 6125 aExp = extractFloat128Exp( a ); 6126 bSig1 = extractFloat128Frac1( b ); 6127 bSig0 = extractFloat128Frac0( b ); 6128 bExp = extractFloat128Exp( b ); 6129 expDiff = aExp - bExp; 6130 if ( 0 < expDiff ) { 6131 if ( aExp == 0x7FFF ) { 6132 if (aSig0 | aSig1) { 6133 return propagateFloat128NaN(a, b, status); 6134 } 6135 return a; 6136 } 6137 if ( bExp == 0 ) { 6138 --expDiff; 6139 } 6140 else { 6141 bSig0 |= LIT64( 0x0001000000000000 ); 6142 } 6143 shift128ExtraRightJamming( 6144 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6145 zExp = aExp; 6146 } 6147 else if ( expDiff < 0 ) { 6148 if ( bExp == 0x7FFF ) { 6149 if (bSig0 | bSig1) { 6150 return propagateFloat128NaN(a, b, status); 6151 } 6152 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6153 } 6154 if ( aExp == 0 ) { 6155 ++expDiff; 6156 } 6157 else { 6158 aSig0 |= LIT64( 0x0001000000000000 ); 6159 } 6160 shift128ExtraRightJamming( 6161 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6162 zExp = bExp; 6163 } 6164 else { 6165 if ( aExp == 0x7FFF ) { 6166 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6167 return propagateFloat128NaN(a, b, status); 6168 } 6169 return a; 6170 } 6171 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6172 if ( aExp == 0 ) { 6173 if (status->flush_to_zero) { 6174 if (zSig0 | zSig1) { 6175 float_raise(float_flag_output_denormal, status); 6176 } 6177 return packFloat128(zSign, 0, 0, 0); 6178 } 6179 return packFloat128( zSign, 0, zSig0, zSig1 ); 6180 } 6181 zSig2 = 0; 6182 zSig0 |= LIT64( 0x0002000000000000 ); 6183 zExp = aExp; 6184 goto shiftRight1; 6185 } 6186 aSig0 |= LIT64( 0x0001000000000000 ); 6187 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6188 --zExp; 6189 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6190 ++zExp; 6191 shiftRight1: 6192 shift128ExtraRightJamming( 6193 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6194 roundAndPack: 6195 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6196 6197 } 6198 6199 /*---------------------------------------------------------------------------- 6200 | Returns the result of subtracting the absolute values of the quadruple- 6201 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6202 | difference is negated before being returned. `zSign' is ignored if the 6203 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6204 | Standard for Binary Floating-Point Arithmetic. 6205 *----------------------------------------------------------------------------*/ 6206 6207 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6208 float_status *status) 6209 { 6210 int32_t aExp, bExp, zExp; 6211 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6212 int32_t expDiff; 6213 6214 aSig1 = extractFloat128Frac1( a ); 6215 aSig0 = extractFloat128Frac0( a ); 6216 aExp = extractFloat128Exp( a ); 6217 bSig1 = extractFloat128Frac1( b ); 6218 bSig0 = extractFloat128Frac0( b ); 6219 bExp = extractFloat128Exp( b ); 6220 expDiff = aExp - bExp; 6221 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6222 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6223 if ( 0 < expDiff ) goto aExpBigger; 6224 if ( expDiff < 0 ) goto bExpBigger; 6225 if ( aExp == 0x7FFF ) { 6226 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6227 return propagateFloat128NaN(a, b, status); 6228 } 6229 float_raise(float_flag_invalid, status); 6230 return float128_default_nan(status); 6231 } 6232 if ( aExp == 0 ) { 6233 aExp = 1; 6234 bExp = 1; 6235 } 6236 if ( bSig0 < aSig0 ) goto aBigger; 6237 if ( aSig0 < bSig0 ) goto bBigger; 6238 if ( bSig1 < aSig1 ) goto aBigger; 6239 if ( aSig1 < bSig1 ) goto bBigger; 6240 return packFloat128(status->float_rounding_mode == float_round_down, 6241 0, 0, 0); 6242 bExpBigger: 6243 if ( bExp == 0x7FFF ) { 6244 if (bSig0 | bSig1) { 6245 return propagateFloat128NaN(a, b, status); 6246 } 6247 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6248 } 6249 if ( aExp == 0 ) { 6250 ++expDiff; 6251 } 6252 else { 6253 aSig0 |= LIT64( 0x4000000000000000 ); 6254 } 6255 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6256 bSig0 |= LIT64( 0x4000000000000000 ); 6257 bBigger: 6258 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6259 zExp = bExp; 6260 zSign ^= 1; 6261 goto normalizeRoundAndPack; 6262 aExpBigger: 6263 if ( aExp == 0x7FFF ) { 6264 if (aSig0 | aSig1) { 6265 return propagateFloat128NaN(a, b, status); 6266 } 6267 return a; 6268 } 6269 if ( bExp == 0 ) { 6270 --expDiff; 6271 } 6272 else { 6273 bSig0 |= LIT64( 0x4000000000000000 ); 6274 } 6275 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6276 aSig0 |= LIT64( 0x4000000000000000 ); 6277 aBigger: 6278 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6279 zExp = aExp; 6280 normalizeRoundAndPack: 6281 --zExp; 6282 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6283 status); 6284 6285 } 6286 6287 /*---------------------------------------------------------------------------- 6288 | Returns the result of adding the quadruple-precision floating-point values 6289 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6290 | for Binary Floating-Point Arithmetic. 6291 *----------------------------------------------------------------------------*/ 6292 6293 float128 float128_add(float128 a, float128 b, float_status *status) 6294 { 6295 flag aSign, bSign; 6296 6297 aSign = extractFloat128Sign( a ); 6298 bSign = extractFloat128Sign( b ); 6299 if ( aSign == bSign ) { 6300 return addFloat128Sigs(a, b, aSign, status); 6301 } 6302 else { 6303 return subFloat128Sigs(a, b, aSign, status); 6304 } 6305 6306 } 6307 6308 /*---------------------------------------------------------------------------- 6309 | Returns the result of subtracting the quadruple-precision floating-point 6310 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6311 | Standard for Binary Floating-Point Arithmetic. 6312 *----------------------------------------------------------------------------*/ 6313 6314 float128 float128_sub(float128 a, float128 b, float_status *status) 6315 { 6316 flag aSign, bSign; 6317 6318 aSign = extractFloat128Sign( a ); 6319 bSign = extractFloat128Sign( b ); 6320 if ( aSign == bSign ) { 6321 return subFloat128Sigs(a, b, aSign, status); 6322 } 6323 else { 6324 return addFloat128Sigs(a, b, aSign, status); 6325 } 6326 6327 } 6328 6329 /*---------------------------------------------------------------------------- 6330 | Returns the result of multiplying the quadruple-precision floating-point 6331 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6332 | Standard for Binary Floating-Point Arithmetic. 6333 *----------------------------------------------------------------------------*/ 6334 6335 float128 float128_mul(float128 a, float128 b, float_status *status) 6336 { 6337 flag aSign, bSign, zSign; 6338 int32_t aExp, bExp, zExp; 6339 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6340 6341 aSig1 = extractFloat128Frac1( a ); 6342 aSig0 = extractFloat128Frac0( a ); 6343 aExp = extractFloat128Exp( a ); 6344 aSign = extractFloat128Sign( a ); 6345 bSig1 = extractFloat128Frac1( b ); 6346 bSig0 = extractFloat128Frac0( b ); 6347 bExp = extractFloat128Exp( b ); 6348 bSign = extractFloat128Sign( b ); 6349 zSign = aSign ^ bSign; 6350 if ( aExp == 0x7FFF ) { 6351 if ( ( aSig0 | aSig1 ) 6352 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6353 return propagateFloat128NaN(a, b, status); 6354 } 6355 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6356 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6357 } 6358 if ( bExp == 0x7FFF ) { 6359 if (bSig0 | bSig1) { 6360 return propagateFloat128NaN(a, b, status); 6361 } 6362 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6363 invalid: 6364 float_raise(float_flag_invalid, status); 6365 return float128_default_nan(status); 6366 } 6367 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6368 } 6369 if ( aExp == 0 ) { 6370 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6371 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6372 } 6373 if ( bExp == 0 ) { 6374 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6375 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6376 } 6377 zExp = aExp + bExp - 0x4000; 6378 aSig0 |= LIT64( 0x0001000000000000 ); 6379 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6380 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6381 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6382 zSig2 |= ( zSig3 != 0 ); 6383 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6384 shift128ExtraRightJamming( 6385 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6386 ++zExp; 6387 } 6388 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6389 6390 } 6391 6392 /*---------------------------------------------------------------------------- 6393 | Returns the result of dividing the quadruple-precision floating-point value 6394 | `a' by the corresponding value `b'. The operation is performed according to 6395 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6396 *----------------------------------------------------------------------------*/ 6397 6398 float128 float128_div(float128 a, float128 b, float_status *status) 6399 { 6400 flag aSign, bSign, zSign; 6401 int32_t aExp, bExp, zExp; 6402 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6403 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6404 6405 aSig1 = extractFloat128Frac1( a ); 6406 aSig0 = extractFloat128Frac0( a ); 6407 aExp = extractFloat128Exp( a ); 6408 aSign = extractFloat128Sign( a ); 6409 bSig1 = extractFloat128Frac1( b ); 6410 bSig0 = extractFloat128Frac0( b ); 6411 bExp = extractFloat128Exp( b ); 6412 bSign = extractFloat128Sign( b ); 6413 zSign = aSign ^ bSign; 6414 if ( aExp == 0x7FFF ) { 6415 if (aSig0 | aSig1) { 6416 return propagateFloat128NaN(a, b, status); 6417 } 6418 if ( bExp == 0x7FFF ) { 6419 if (bSig0 | bSig1) { 6420 return propagateFloat128NaN(a, b, status); 6421 } 6422 goto invalid; 6423 } 6424 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6425 } 6426 if ( bExp == 0x7FFF ) { 6427 if (bSig0 | bSig1) { 6428 return propagateFloat128NaN(a, b, status); 6429 } 6430 return packFloat128( zSign, 0, 0, 0 ); 6431 } 6432 if ( bExp == 0 ) { 6433 if ( ( bSig0 | bSig1 ) == 0 ) { 6434 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6435 invalid: 6436 float_raise(float_flag_invalid, status); 6437 return float128_default_nan(status); 6438 } 6439 float_raise(float_flag_divbyzero, status); 6440 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6441 } 6442 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6443 } 6444 if ( aExp == 0 ) { 6445 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6446 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6447 } 6448 zExp = aExp - bExp + 0x3FFD; 6449 shortShift128Left( 6450 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6451 shortShift128Left( 6452 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6453 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6454 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6455 ++zExp; 6456 } 6457 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6458 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6459 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6460 while ( (int64_t) rem0 < 0 ) { 6461 --zSig0; 6462 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6463 } 6464 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6465 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6466 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6467 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6468 while ( (int64_t) rem1 < 0 ) { 6469 --zSig1; 6470 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6471 } 6472 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6473 } 6474 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6475 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6476 6477 } 6478 6479 /*---------------------------------------------------------------------------- 6480 | Returns the remainder of the quadruple-precision floating-point value `a' 6481 | with respect to the corresponding value `b'. The operation is performed 6482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6483 *----------------------------------------------------------------------------*/ 6484 6485 float128 float128_rem(float128 a, float128 b, float_status *status) 6486 { 6487 flag aSign, zSign; 6488 int32_t aExp, bExp, expDiff; 6489 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6490 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6491 int64_t sigMean0; 6492 6493 aSig1 = extractFloat128Frac1( a ); 6494 aSig0 = extractFloat128Frac0( a ); 6495 aExp = extractFloat128Exp( a ); 6496 aSign = extractFloat128Sign( a ); 6497 bSig1 = extractFloat128Frac1( b ); 6498 bSig0 = extractFloat128Frac0( b ); 6499 bExp = extractFloat128Exp( b ); 6500 if ( aExp == 0x7FFF ) { 6501 if ( ( aSig0 | aSig1 ) 6502 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6503 return propagateFloat128NaN(a, b, status); 6504 } 6505 goto invalid; 6506 } 6507 if ( bExp == 0x7FFF ) { 6508 if (bSig0 | bSig1) { 6509 return propagateFloat128NaN(a, b, status); 6510 } 6511 return a; 6512 } 6513 if ( bExp == 0 ) { 6514 if ( ( bSig0 | bSig1 ) == 0 ) { 6515 invalid: 6516 float_raise(float_flag_invalid, status); 6517 return float128_default_nan(status); 6518 } 6519 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6520 } 6521 if ( aExp == 0 ) { 6522 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6523 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6524 } 6525 expDiff = aExp - bExp; 6526 if ( expDiff < -1 ) return a; 6527 shortShift128Left( 6528 aSig0 | LIT64( 0x0001000000000000 ), 6529 aSig1, 6530 15 - ( expDiff < 0 ), 6531 &aSig0, 6532 &aSig1 6533 ); 6534 shortShift128Left( 6535 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6536 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6537 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6538 expDiff -= 64; 6539 while ( 0 < expDiff ) { 6540 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6541 q = ( 4 < q ) ? q - 4 : 0; 6542 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6543 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6544 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6545 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6546 expDiff -= 61; 6547 } 6548 if ( -64 < expDiff ) { 6549 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6550 q = ( 4 < q ) ? q - 4 : 0; 6551 q >>= - expDiff; 6552 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6553 expDiff += 52; 6554 if ( expDiff < 0 ) { 6555 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6556 } 6557 else { 6558 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6559 } 6560 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6561 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6562 } 6563 else { 6564 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6565 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6566 } 6567 do { 6568 alternateASig0 = aSig0; 6569 alternateASig1 = aSig1; 6570 ++q; 6571 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6572 } while ( 0 <= (int64_t) aSig0 ); 6573 add128( 6574 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6575 if ( ( sigMean0 < 0 ) 6576 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6577 aSig0 = alternateASig0; 6578 aSig1 = alternateASig1; 6579 } 6580 zSign = ( (int64_t) aSig0 < 0 ); 6581 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6582 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6583 status); 6584 } 6585 6586 /*---------------------------------------------------------------------------- 6587 | Returns the square root of the quadruple-precision floating-point value `a'. 6588 | The operation is performed according to the IEC/IEEE Standard for Binary 6589 | Floating-Point Arithmetic. 6590 *----------------------------------------------------------------------------*/ 6591 6592 float128 float128_sqrt(float128 a, float_status *status) 6593 { 6594 flag aSign; 6595 int32_t aExp, zExp; 6596 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6597 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6598 6599 aSig1 = extractFloat128Frac1( a ); 6600 aSig0 = extractFloat128Frac0( a ); 6601 aExp = extractFloat128Exp( a ); 6602 aSign = extractFloat128Sign( a ); 6603 if ( aExp == 0x7FFF ) { 6604 if (aSig0 | aSig1) { 6605 return propagateFloat128NaN(a, a, status); 6606 } 6607 if ( ! aSign ) return a; 6608 goto invalid; 6609 } 6610 if ( aSign ) { 6611 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6612 invalid: 6613 float_raise(float_flag_invalid, status); 6614 return float128_default_nan(status); 6615 } 6616 if ( aExp == 0 ) { 6617 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6618 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6619 } 6620 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6621 aSig0 |= LIT64( 0x0001000000000000 ); 6622 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6623 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6624 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6625 doubleZSig0 = zSig0<<1; 6626 mul64To128( zSig0, zSig0, &term0, &term1 ); 6627 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6628 while ( (int64_t) rem0 < 0 ) { 6629 --zSig0; 6630 doubleZSig0 -= 2; 6631 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6632 } 6633 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6634 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6635 if ( zSig1 == 0 ) zSig1 = 1; 6636 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6637 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6638 mul64To128( zSig1, zSig1, &term2, &term3 ); 6639 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6640 while ( (int64_t) rem1 < 0 ) { 6641 --zSig1; 6642 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6643 term3 |= 1; 6644 term2 |= doubleZSig0; 6645 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6646 } 6647 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6648 } 6649 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6650 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6651 6652 } 6653 6654 /*---------------------------------------------------------------------------- 6655 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6656 | the corresponding value `b', and 0 otherwise. The invalid exception is 6657 | raised if either operand is a NaN. Otherwise, the comparison is performed 6658 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6659 *----------------------------------------------------------------------------*/ 6660 6661 int float128_eq(float128 a, float128 b, float_status *status) 6662 { 6663 6664 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6665 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6666 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6667 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6668 ) { 6669 float_raise(float_flag_invalid, status); 6670 return 0; 6671 } 6672 return 6673 ( a.low == b.low ) 6674 && ( ( a.high == b.high ) 6675 || ( ( a.low == 0 ) 6676 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6677 ); 6678 6679 } 6680 6681 /*---------------------------------------------------------------------------- 6682 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6683 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6684 | exception is raised if either operand is a NaN. The comparison is performed 6685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6686 *----------------------------------------------------------------------------*/ 6687 6688 int float128_le(float128 a, float128 b, float_status *status) 6689 { 6690 flag aSign, bSign; 6691 6692 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6693 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6694 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6695 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6696 ) { 6697 float_raise(float_flag_invalid, status); 6698 return 0; 6699 } 6700 aSign = extractFloat128Sign( a ); 6701 bSign = extractFloat128Sign( b ); 6702 if ( aSign != bSign ) { 6703 return 6704 aSign 6705 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6706 == 0 ); 6707 } 6708 return 6709 aSign ? le128( b.high, b.low, a.high, a.low ) 6710 : le128( a.high, a.low, b.high, b.low ); 6711 6712 } 6713 6714 /*---------------------------------------------------------------------------- 6715 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6716 | the corresponding value `b', and 0 otherwise. The invalid exception is 6717 | raised if either operand is a NaN. The comparison is performed according 6718 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6719 *----------------------------------------------------------------------------*/ 6720 6721 int float128_lt(float128 a, float128 b, float_status *status) 6722 { 6723 flag aSign, bSign; 6724 6725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6727 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6729 ) { 6730 float_raise(float_flag_invalid, status); 6731 return 0; 6732 } 6733 aSign = extractFloat128Sign( a ); 6734 bSign = extractFloat128Sign( b ); 6735 if ( aSign != bSign ) { 6736 return 6737 aSign 6738 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6739 != 0 ); 6740 } 6741 return 6742 aSign ? lt128( b.high, b.low, a.high, a.low ) 6743 : lt128( a.high, a.low, b.high, b.low ); 6744 6745 } 6746 6747 /*---------------------------------------------------------------------------- 6748 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6749 | be compared, and 0 otherwise. The invalid exception is raised if either 6750 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6751 | Standard for Binary Floating-Point Arithmetic. 6752 *----------------------------------------------------------------------------*/ 6753 6754 int float128_unordered(float128 a, float128 b, float_status *status) 6755 { 6756 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6757 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6758 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6759 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6760 ) { 6761 float_raise(float_flag_invalid, status); 6762 return 1; 6763 } 6764 return 0; 6765 } 6766 6767 /*---------------------------------------------------------------------------- 6768 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6769 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6770 | exception. The comparison is performed according to the IEC/IEEE Standard 6771 | for Binary Floating-Point Arithmetic. 6772 *----------------------------------------------------------------------------*/ 6773 6774 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6775 { 6776 6777 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6778 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6779 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6780 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6781 ) { 6782 if (float128_is_signaling_nan(a, status) 6783 || float128_is_signaling_nan(b, status)) { 6784 float_raise(float_flag_invalid, status); 6785 } 6786 return 0; 6787 } 6788 return 6789 ( a.low == b.low ) 6790 && ( ( a.high == b.high ) 6791 || ( ( a.low == 0 ) 6792 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6793 ); 6794 6795 } 6796 6797 /*---------------------------------------------------------------------------- 6798 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6799 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6800 | cause an exception. Otherwise, the comparison is performed according to the 6801 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6802 *----------------------------------------------------------------------------*/ 6803 6804 int float128_le_quiet(float128 a, float128 b, float_status *status) 6805 { 6806 flag aSign, bSign; 6807 6808 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6809 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6810 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6811 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6812 ) { 6813 if (float128_is_signaling_nan(a, status) 6814 || float128_is_signaling_nan(b, status)) { 6815 float_raise(float_flag_invalid, status); 6816 } 6817 return 0; 6818 } 6819 aSign = extractFloat128Sign( a ); 6820 bSign = extractFloat128Sign( b ); 6821 if ( aSign != bSign ) { 6822 return 6823 aSign 6824 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6825 == 0 ); 6826 } 6827 return 6828 aSign ? le128( b.high, b.low, a.high, a.low ) 6829 : le128( a.high, a.low, b.high, b.low ); 6830 6831 } 6832 6833 /*---------------------------------------------------------------------------- 6834 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6835 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6836 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6837 | Standard for Binary Floating-Point Arithmetic. 6838 *----------------------------------------------------------------------------*/ 6839 6840 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6841 { 6842 flag aSign, bSign; 6843 6844 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6845 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6846 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6847 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6848 ) { 6849 if (float128_is_signaling_nan(a, status) 6850 || float128_is_signaling_nan(b, status)) { 6851 float_raise(float_flag_invalid, status); 6852 } 6853 return 0; 6854 } 6855 aSign = extractFloat128Sign( a ); 6856 bSign = extractFloat128Sign( b ); 6857 if ( aSign != bSign ) { 6858 return 6859 aSign 6860 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6861 != 0 ); 6862 } 6863 return 6864 aSign ? lt128( b.high, b.low, a.high, a.low ) 6865 : lt128( a.high, a.low, b.high, b.low ); 6866 6867 } 6868 6869 /*---------------------------------------------------------------------------- 6870 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6871 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6872 | comparison is performed according to the IEC/IEEE Standard for Binary 6873 | Floating-Point Arithmetic. 6874 *----------------------------------------------------------------------------*/ 6875 6876 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6877 { 6878 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6879 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6880 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6881 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6882 ) { 6883 if (float128_is_signaling_nan(a, status) 6884 || float128_is_signaling_nan(b, status)) { 6885 float_raise(float_flag_invalid, status); 6886 } 6887 return 1; 6888 } 6889 return 0; 6890 } 6891 6892 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6893 int is_quiet, float_status *status) 6894 { 6895 flag aSign, bSign; 6896 6897 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6898 float_raise(float_flag_invalid, status); 6899 return float_relation_unordered; 6900 } 6901 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6902 ( extractFloatx80Frac( a )<<1 ) ) || 6903 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6904 ( extractFloatx80Frac( b )<<1 ) )) { 6905 if (!is_quiet || 6906 floatx80_is_signaling_nan(a, status) || 6907 floatx80_is_signaling_nan(b, status)) { 6908 float_raise(float_flag_invalid, status); 6909 } 6910 return float_relation_unordered; 6911 } 6912 aSign = extractFloatx80Sign( a ); 6913 bSign = extractFloatx80Sign( b ); 6914 if ( aSign != bSign ) { 6915 6916 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6917 ( ( a.low | b.low ) == 0 ) ) { 6918 /* zero case */ 6919 return float_relation_equal; 6920 } else { 6921 return 1 - (2 * aSign); 6922 } 6923 } else { 6924 if (a.low == b.low && a.high == b.high) { 6925 return float_relation_equal; 6926 } else { 6927 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6928 } 6929 } 6930 } 6931 6932 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6933 { 6934 return floatx80_compare_internal(a, b, 0, status); 6935 } 6936 6937 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6938 { 6939 return floatx80_compare_internal(a, b, 1, status); 6940 } 6941 6942 static inline int float128_compare_internal(float128 a, float128 b, 6943 int is_quiet, float_status *status) 6944 { 6945 flag aSign, bSign; 6946 6947 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6948 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6949 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6950 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6951 if (!is_quiet || 6952 float128_is_signaling_nan(a, status) || 6953 float128_is_signaling_nan(b, status)) { 6954 float_raise(float_flag_invalid, status); 6955 } 6956 return float_relation_unordered; 6957 } 6958 aSign = extractFloat128Sign( a ); 6959 bSign = extractFloat128Sign( b ); 6960 if ( aSign != bSign ) { 6961 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6962 /* zero case */ 6963 return float_relation_equal; 6964 } else { 6965 return 1 - (2 * aSign); 6966 } 6967 } else { 6968 if (a.low == b.low && a.high == b.high) { 6969 return float_relation_equal; 6970 } else { 6971 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6972 } 6973 } 6974 } 6975 6976 int float128_compare(float128 a, float128 b, float_status *status) 6977 { 6978 return float128_compare_internal(a, b, 0, status); 6979 } 6980 6981 int float128_compare_quiet(float128 a, float128 b, float_status *status) 6982 { 6983 return float128_compare_internal(a, b, 1, status); 6984 } 6985 6986 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 6987 { 6988 flag aSign; 6989 int32_t aExp; 6990 uint64_t aSig; 6991 6992 if (floatx80_invalid_encoding(a)) { 6993 float_raise(float_flag_invalid, status); 6994 return floatx80_default_nan(status); 6995 } 6996 aSig = extractFloatx80Frac( a ); 6997 aExp = extractFloatx80Exp( a ); 6998 aSign = extractFloatx80Sign( a ); 6999 7000 if ( aExp == 0x7FFF ) { 7001 if ( aSig<<1 ) { 7002 return propagateFloatx80NaN(a, a, status); 7003 } 7004 return a; 7005 } 7006 7007 if (aExp == 0) { 7008 if (aSig == 0) { 7009 return a; 7010 } 7011 aExp++; 7012 } 7013 7014 if (n > 0x10000) { 7015 n = 0x10000; 7016 } else if (n < -0x10000) { 7017 n = -0x10000; 7018 } 7019 7020 aExp += n; 7021 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7022 aSign, aExp, aSig, 0, status); 7023 } 7024 7025 float128 float128_scalbn(float128 a, int n, float_status *status) 7026 { 7027 flag aSign; 7028 int32_t aExp; 7029 uint64_t aSig0, aSig1; 7030 7031 aSig1 = extractFloat128Frac1( a ); 7032 aSig0 = extractFloat128Frac0( a ); 7033 aExp = extractFloat128Exp( a ); 7034 aSign = extractFloat128Sign( a ); 7035 if ( aExp == 0x7FFF ) { 7036 if ( aSig0 | aSig1 ) { 7037 return propagateFloat128NaN(a, a, status); 7038 } 7039 return a; 7040 } 7041 if (aExp != 0) { 7042 aSig0 |= LIT64( 0x0001000000000000 ); 7043 } else if (aSig0 == 0 && aSig1 == 0) { 7044 return a; 7045 } else { 7046 aExp++; 7047 } 7048 7049 if (n > 0x10000) { 7050 n = 0x10000; 7051 } else if (n < -0x10000) { 7052 n = -0x10000; 7053 } 7054 7055 aExp += n - 1; 7056 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7057 , status); 7058 7059 } 7060