1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * Returns the result of converting the unsigned integer `a' to the 1593 * floating-point format. The conversion is performed according to the 1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1595 */ 1596 1597 static FloatParts uint_to_float(uint64_t a, float_status *status) 1598 { 1599 FloatParts r = { .sign = false}; 1600 1601 if (a == 0) { 1602 r.cls = float_class_zero; 1603 } else { 1604 int spare_bits = clz64(a) - 1; 1605 r.cls = float_class_normal; 1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1607 if (spare_bits < 0) { 1608 shift64RightJamming(a, -spare_bits, &a); 1609 r.frac = a; 1610 } else { 1611 r.frac = a << spare_bits; 1612 } 1613 } 1614 1615 return r; 1616 } 1617 1618 float16 uint64_to_float16(uint64_t a, float_status *status) 1619 { 1620 FloatParts pa = uint_to_float(a, status); 1621 return float16_round_pack_canonical(pa, status); 1622 } 1623 1624 float16 uint32_to_float16(uint32_t a, float_status *status) 1625 { 1626 return uint64_to_float16(a, status); 1627 } 1628 1629 float16 uint16_to_float16(uint16_t a, float_status *status) 1630 { 1631 return uint64_to_float16(a, status); 1632 } 1633 1634 float32 uint64_to_float32(uint64_t a, float_status *status) 1635 { 1636 FloatParts pa = uint_to_float(a, status); 1637 return float32_round_pack_canonical(pa, status); 1638 } 1639 1640 float32 uint32_to_float32(uint32_t a, float_status *status) 1641 { 1642 return uint64_to_float32(a, status); 1643 } 1644 1645 float32 uint16_to_float32(uint16_t a, float_status *status) 1646 { 1647 return uint64_to_float32(a, status); 1648 } 1649 1650 float64 uint64_to_float64(uint64_t a, float_status *status) 1651 { 1652 FloatParts pa = uint_to_float(a, status); 1653 return float64_round_pack_canonical(pa, status); 1654 } 1655 1656 float64 uint32_to_float64(uint32_t a, float_status *status) 1657 { 1658 return uint64_to_float64(a, status); 1659 } 1660 1661 float64 uint16_to_float64(uint16_t a, float_status *status) 1662 { 1663 return uint64_to_float64(a, status); 1664 } 1665 1666 /* Float Min/Max */ 1667 /* min() and max() functions. These can't be implemented as 1668 * 'compare and pick one input' because that would mishandle 1669 * NaNs and +0 vs -0. 1670 * 1671 * minnum() and maxnum() functions. These are similar to the min() 1672 * and max() functions but if one of the arguments is a QNaN and 1673 * the other is numerical then the numerical argument is returned. 1674 * SNaNs will get quietened before being returned. 1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1676 * and maxNum() operations. min() and max() are the typical min/max 1677 * semantics provided by many CPUs which predate that specification. 1678 * 1679 * minnummag() and maxnummag() functions correspond to minNumMag() 1680 * and minNumMag() from the IEEE-754 2008. 1681 */ 1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1683 bool ieee, bool ismag, float_status *s) 1684 { 1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1686 if (ieee) { 1687 /* Takes two floating-point values `a' and `b', one of 1688 * which is a NaN, and returns the appropriate NaN 1689 * result. If either `a' or `b' is a signaling NaN, 1690 * the invalid exception is raised. 1691 */ 1692 if (is_snan(a.cls) || is_snan(b.cls)) { 1693 return pick_nan(a, b, s); 1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1695 return b; 1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1697 return a; 1698 } 1699 } 1700 return pick_nan(a, b, s); 1701 } else { 1702 int a_exp, b_exp; 1703 bool a_sign, b_sign; 1704 1705 switch (a.cls) { 1706 case float_class_normal: 1707 a_exp = a.exp; 1708 break; 1709 case float_class_inf: 1710 a_exp = INT_MAX; 1711 break; 1712 case float_class_zero: 1713 a_exp = INT_MIN; 1714 break; 1715 default: 1716 g_assert_not_reached(); 1717 break; 1718 } 1719 switch (b.cls) { 1720 case float_class_normal: 1721 b_exp = b.exp; 1722 break; 1723 case float_class_inf: 1724 b_exp = INT_MAX; 1725 break; 1726 case float_class_zero: 1727 b_exp = INT_MIN; 1728 break; 1729 default: 1730 g_assert_not_reached(); 1731 break; 1732 } 1733 1734 a_sign = a.sign; 1735 b_sign = b.sign; 1736 if (ismag) { 1737 a_sign = b_sign = 0; 1738 } 1739 1740 if (a_sign == b_sign) { 1741 bool a_less = a_exp < b_exp; 1742 if (a_exp == b_exp) { 1743 a_less = a.frac < b.frac; 1744 } 1745 return a_sign ^ a_less ^ ismin ? b : a; 1746 } else { 1747 return a_sign ^ ismin ? b : a; 1748 } 1749 } 1750 } 1751 1752 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1754 float_status *s) \ 1755 { \ 1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1759 \ 1760 return float ## sz ## _round_pack_canonical(pr, s); \ 1761 } 1762 1763 MINMAX(16, min, true, false, false) 1764 MINMAX(16, minnum, true, true, false) 1765 MINMAX(16, minnummag, true, true, true) 1766 MINMAX(16, max, false, false, false) 1767 MINMAX(16, maxnum, false, true, false) 1768 MINMAX(16, maxnummag, false, true, true) 1769 1770 MINMAX(32, min, true, false, false) 1771 MINMAX(32, minnum, true, true, false) 1772 MINMAX(32, minnummag, true, true, true) 1773 MINMAX(32, max, false, false, false) 1774 MINMAX(32, maxnum, false, true, false) 1775 MINMAX(32, maxnummag, false, true, true) 1776 1777 MINMAX(64, min, true, false, false) 1778 MINMAX(64, minnum, true, true, false) 1779 MINMAX(64, minnummag, true, true, true) 1780 MINMAX(64, max, false, false, false) 1781 MINMAX(64, maxnum, false, true, false) 1782 MINMAX(64, maxnummag, false, true, true) 1783 1784 #undef MINMAX 1785 1786 /* Floating point compare */ 1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1788 float_status *s) 1789 { 1790 if (is_nan(a.cls) || is_nan(b.cls)) { 1791 if (!is_quiet || 1792 a.cls == float_class_snan || 1793 b.cls == float_class_snan) { 1794 s->float_exception_flags |= float_flag_invalid; 1795 } 1796 return float_relation_unordered; 1797 } 1798 1799 if (a.cls == float_class_zero) { 1800 if (b.cls == float_class_zero) { 1801 return float_relation_equal; 1802 } 1803 return b.sign ? float_relation_greater : float_relation_less; 1804 } else if (b.cls == float_class_zero) { 1805 return a.sign ? float_relation_less : float_relation_greater; 1806 } 1807 1808 /* The only really important thing about infinity is its sign. If 1809 * both are infinities the sign marks the smallest of the two. 1810 */ 1811 if (a.cls == float_class_inf) { 1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1813 return float_relation_equal; 1814 } 1815 return a.sign ? float_relation_less : float_relation_greater; 1816 } else if (b.cls == float_class_inf) { 1817 return b.sign ? float_relation_greater : float_relation_less; 1818 } 1819 1820 if (a.sign != b.sign) { 1821 return a.sign ? float_relation_less : float_relation_greater; 1822 } 1823 1824 if (a.exp == b.exp) { 1825 if (a.frac == b.frac) { 1826 return float_relation_equal; 1827 } 1828 if (a.sign) { 1829 return a.frac > b.frac ? 1830 float_relation_less : float_relation_greater; 1831 } else { 1832 return a.frac > b.frac ? 1833 float_relation_greater : float_relation_less; 1834 } 1835 } else { 1836 if (a.sign) { 1837 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1838 } else { 1839 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1840 } 1841 } 1842 } 1843 1844 #define COMPARE(sz) \ 1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1846 float_status *s) \ 1847 { \ 1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1850 return compare_floats(pa, pb, false, s); \ 1851 } \ 1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1853 float_status *s) \ 1854 { \ 1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1857 return compare_floats(pa, pb, true, s); \ 1858 } 1859 1860 COMPARE(16) 1861 COMPARE(32) 1862 COMPARE(64) 1863 1864 #undef COMPARE 1865 1866 /* Multiply A by 2 raised to the power N. */ 1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1868 { 1869 if (unlikely(is_nan(a.cls))) { 1870 return return_nan(a, s); 1871 } 1872 if (a.cls == float_class_normal) { 1873 a.exp += n; 1874 } 1875 return a; 1876 } 1877 1878 float16 float16_scalbn(float16 a, int n, float_status *status) 1879 { 1880 FloatParts pa = float16_unpack_canonical(a, status); 1881 FloatParts pr = scalbn_decomposed(pa, n, status); 1882 return float16_round_pack_canonical(pr, status); 1883 } 1884 1885 float32 float32_scalbn(float32 a, int n, float_status *status) 1886 { 1887 FloatParts pa = float32_unpack_canonical(a, status); 1888 FloatParts pr = scalbn_decomposed(pa, n, status); 1889 return float32_round_pack_canonical(pr, status); 1890 } 1891 1892 float64 float64_scalbn(float64 a, int n, float_status *status) 1893 { 1894 FloatParts pa = float64_unpack_canonical(a, status); 1895 FloatParts pr = scalbn_decomposed(pa, n, status); 1896 return float64_round_pack_canonical(pr, status); 1897 } 1898 1899 /* 1900 * Square Root 1901 * 1902 * The old softfloat code did an approximation step before zeroing in 1903 * on the final result. However for simpleness we just compute the 1904 * square root by iterating down from the implicit bit to enough extra 1905 * bits to ensure we get a correctly rounded result. 1906 * 1907 * This does mean however the calculation is slower than before, 1908 * especially for 64 bit floats. 1909 */ 1910 1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 1912 { 1913 uint64_t a_frac, r_frac, s_frac; 1914 int bit, last_bit; 1915 1916 if (is_nan(a.cls)) { 1917 return return_nan(a, s); 1918 } 1919 if (a.cls == float_class_zero) { 1920 return a; /* sqrt(+-0) = +-0 */ 1921 } 1922 if (a.sign) { 1923 s->float_exception_flags |= float_flag_invalid; 1924 a.cls = float_class_dnan; 1925 return a; 1926 } 1927 if (a.cls == float_class_inf) { 1928 return a; /* sqrt(+inf) = +inf */ 1929 } 1930 1931 assert(a.cls == float_class_normal); 1932 1933 /* We need two overflow bits at the top. Adding room for that is a 1934 * right shift. If the exponent is odd, we can discard the low bit 1935 * by multiplying the fraction by 2; that's a left shift. Combine 1936 * those and we shift right if the exponent is even. 1937 */ 1938 a_frac = a.frac; 1939 if (!(a.exp & 1)) { 1940 a_frac >>= 1; 1941 } 1942 a.exp >>= 1; 1943 1944 /* Bit-by-bit computation of sqrt. */ 1945 r_frac = 0; 1946 s_frac = 0; 1947 1948 /* Iterate from implicit bit down to the 3 extra bits to compute a 1949 * properly rounded result. Remember we've inserted one more bit 1950 * at the top, so these positions are one less. 1951 */ 1952 bit = DECOMPOSED_BINARY_POINT - 1; 1953 last_bit = MAX(p->frac_shift - 4, 0); 1954 do { 1955 uint64_t q = 1ULL << bit; 1956 uint64_t t_frac = s_frac + q; 1957 if (t_frac <= a_frac) { 1958 s_frac = t_frac + q; 1959 a_frac -= t_frac; 1960 r_frac += q; 1961 } 1962 a_frac <<= 1; 1963 } while (--bit >= last_bit); 1964 1965 /* Undo the right shift done above. If there is any remaining 1966 * fraction, the result is inexact. Set the sticky bit. 1967 */ 1968 a.frac = (r_frac << 1) + (a_frac != 0); 1969 1970 return a; 1971 } 1972 1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 1974 { 1975 FloatParts pa = float16_unpack_canonical(a, status); 1976 FloatParts pr = sqrt_float(pa, status, &float16_params); 1977 return float16_round_pack_canonical(pr, status); 1978 } 1979 1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 1981 { 1982 FloatParts pa = float32_unpack_canonical(a, status); 1983 FloatParts pr = sqrt_float(pa, status, &float32_params); 1984 return float32_round_pack_canonical(pr, status); 1985 } 1986 1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 1988 { 1989 FloatParts pa = float64_unpack_canonical(a, status); 1990 FloatParts pr = sqrt_float(pa, status, &float64_params); 1991 return float64_round_pack_canonical(pr, status); 1992 } 1993 1994 1995 /*---------------------------------------------------------------------------- 1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1998 | input. If `zSign' is 1, the input is negated before being converted to an 1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2000 | is simply rounded to an integer, with the inexact exception raised if the 2001 | input cannot be represented exactly as an integer. However, if the fixed- 2002 | point input is too large, the invalid exception is raised and the largest 2003 | positive or negative integer is returned. 2004 *----------------------------------------------------------------------------*/ 2005 2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2007 { 2008 int8_t roundingMode; 2009 flag roundNearestEven; 2010 int8_t roundIncrement, roundBits; 2011 int32_t z; 2012 2013 roundingMode = status->float_rounding_mode; 2014 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2015 switch (roundingMode) { 2016 case float_round_nearest_even: 2017 case float_round_ties_away: 2018 roundIncrement = 0x40; 2019 break; 2020 case float_round_to_zero: 2021 roundIncrement = 0; 2022 break; 2023 case float_round_up: 2024 roundIncrement = zSign ? 0 : 0x7f; 2025 break; 2026 case float_round_down: 2027 roundIncrement = zSign ? 0x7f : 0; 2028 break; 2029 default: 2030 abort(); 2031 } 2032 roundBits = absZ & 0x7F; 2033 absZ = ( absZ + roundIncrement )>>7; 2034 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2035 z = absZ; 2036 if ( zSign ) z = - z; 2037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2038 float_raise(float_flag_invalid, status); 2039 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2040 } 2041 if (roundBits) { 2042 status->float_exception_flags |= float_flag_inexact; 2043 } 2044 return z; 2045 2046 } 2047 2048 /*---------------------------------------------------------------------------- 2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2050 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2051 | and returns the properly rounded 64-bit integer corresponding to the input. 2052 | If `zSign' is 1, the input is negated before being converted to an integer. 2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2054 | the inexact exception raised if the input cannot be represented exactly as 2055 | an integer. However, if the fixed-point input is too large, the invalid 2056 | exception is raised and the largest positive or negative integer is 2057 | returned. 2058 *----------------------------------------------------------------------------*/ 2059 2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2061 float_status *status) 2062 { 2063 int8_t roundingMode; 2064 flag roundNearestEven, increment; 2065 int64_t z; 2066 2067 roundingMode = status->float_rounding_mode; 2068 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2069 switch (roundingMode) { 2070 case float_round_nearest_even: 2071 case float_round_ties_away: 2072 increment = ((int64_t) absZ1 < 0); 2073 break; 2074 case float_round_to_zero: 2075 increment = 0; 2076 break; 2077 case float_round_up: 2078 increment = !zSign && absZ1; 2079 break; 2080 case float_round_down: 2081 increment = zSign && absZ1; 2082 break; 2083 default: 2084 abort(); 2085 } 2086 if ( increment ) { 2087 ++absZ0; 2088 if ( absZ0 == 0 ) goto overflow; 2089 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2090 } 2091 z = absZ0; 2092 if ( zSign ) z = - z; 2093 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2094 overflow: 2095 float_raise(float_flag_invalid, status); 2096 return 2097 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2098 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2099 } 2100 if (absZ1) { 2101 status->float_exception_flags |= float_flag_inexact; 2102 } 2103 return z; 2104 2105 } 2106 2107 /*---------------------------------------------------------------------------- 2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2109 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2112 | with the inexact exception raised if the input cannot be represented exactly 2113 | as an integer. However, if the fixed-point input is too large, the invalid 2114 | exception is raised and the largest unsigned integer is returned. 2115 *----------------------------------------------------------------------------*/ 2116 2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2118 uint64_t absZ1, float_status *status) 2119 { 2120 int8_t roundingMode; 2121 flag roundNearestEven, increment; 2122 2123 roundingMode = status->float_rounding_mode; 2124 roundNearestEven = (roundingMode == float_round_nearest_even); 2125 switch (roundingMode) { 2126 case float_round_nearest_even: 2127 case float_round_ties_away: 2128 increment = ((int64_t)absZ1 < 0); 2129 break; 2130 case float_round_to_zero: 2131 increment = 0; 2132 break; 2133 case float_round_up: 2134 increment = !zSign && absZ1; 2135 break; 2136 case float_round_down: 2137 increment = zSign && absZ1; 2138 break; 2139 default: 2140 abort(); 2141 } 2142 if (increment) { 2143 ++absZ0; 2144 if (absZ0 == 0) { 2145 float_raise(float_flag_invalid, status); 2146 return LIT64(0xFFFFFFFFFFFFFFFF); 2147 } 2148 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2149 } 2150 2151 if (zSign && absZ0) { 2152 float_raise(float_flag_invalid, status); 2153 return 0; 2154 } 2155 2156 if (absZ1) { 2157 status->float_exception_flags |= float_flag_inexact; 2158 } 2159 return absZ0; 2160 } 2161 2162 /*---------------------------------------------------------------------------- 2163 | If `a' is denormal and we are in flush-to-zero mode then set the 2164 | input-denormal exception and return zero. Otherwise just return the value. 2165 *----------------------------------------------------------------------------*/ 2166 float32 float32_squash_input_denormal(float32 a, float_status *status) 2167 { 2168 if (status->flush_inputs_to_zero) { 2169 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2170 float_raise(float_flag_input_denormal, status); 2171 return make_float32(float32_val(a) & 0x80000000); 2172 } 2173 } 2174 return a; 2175 } 2176 2177 /*---------------------------------------------------------------------------- 2178 | Normalizes the subnormal single-precision floating-point value represented 2179 | by the denormalized significand `aSig'. The normalized exponent and 2180 | significand are stored at the locations pointed to by `zExpPtr' and 2181 | `zSigPtr', respectively. 2182 *----------------------------------------------------------------------------*/ 2183 2184 static void 2185 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2186 { 2187 int8_t shiftCount; 2188 2189 shiftCount = countLeadingZeros32( aSig ) - 8; 2190 *zSigPtr = aSig<<shiftCount; 2191 *zExpPtr = 1 - shiftCount; 2192 2193 } 2194 2195 /*---------------------------------------------------------------------------- 2196 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2197 | single-precision floating-point value, returning the result. After being 2198 | shifted into the proper positions, the three fields are simply added 2199 | together to form the result. This means that any integer portion of `zSig' 2200 | will be added into the exponent. Since a properly normalized significand 2201 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2202 | than the desired result exponent whenever `zSig' is a complete, normalized 2203 | significand. 2204 *----------------------------------------------------------------------------*/ 2205 2206 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 2207 { 2208 2209 return make_float32( 2210 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 2211 2212 } 2213 2214 /*---------------------------------------------------------------------------- 2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2216 | and significand `zSig', and returns the proper single-precision floating- 2217 | point value corresponding to the abstract input. Ordinarily, the abstract 2218 | value is simply rounded and packed into the single-precision format, with 2219 | the inexact exception raised if the abstract input cannot be represented 2220 | exactly. However, if the abstract value is too large, the overflow and 2221 | inexact exceptions are raised and an infinity or maximal finite value is 2222 | returned. If the abstract value is too small, the input value is rounded to 2223 | a subnormal number, and the underflow and inexact exceptions are raised if 2224 | the abstract input cannot be represented exactly as a subnormal single- 2225 | precision floating-point number. 2226 | The input significand `zSig' has its binary point between bits 30 2227 | and 29, which is 7 bits to the left of the usual location. This shifted 2228 | significand must be normalized or smaller. If `zSig' is not normalized, 2229 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2230 | and it must not require rounding. In the usual case that `zSig' is 2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2233 | Binary Floating-Point Arithmetic. 2234 *----------------------------------------------------------------------------*/ 2235 2236 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2237 float_status *status) 2238 { 2239 int8_t roundingMode; 2240 flag roundNearestEven; 2241 int8_t roundIncrement, roundBits; 2242 flag isTiny; 2243 2244 roundingMode = status->float_rounding_mode; 2245 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2246 switch (roundingMode) { 2247 case float_round_nearest_even: 2248 case float_round_ties_away: 2249 roundIncrement = 0x40; 2250 break; 2251 case float_round_to_zero: 2252 roundIncrement = 0; 2253 break; 2254 case float_round_up: 2255 roundIncrement = zSign ? 0 : 0x7f; 2256 break; 2257 case float_round_down: 2258 roundIncrement = zSign ? 0x7f : 0; 2259 break; 2260 default: 2261 abort(); 2262 break; 2263 } 2264 roundBits = zSig & 0x7F; 2265 if ( 0xFD <= (uint16_t) zExp ) { 2266 if ( ( 0xFD < zExp ) 2267 || ( ( zExp == 0xFD ) 2268 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2269 ) { 2270 float_raise(float_flag_overflow | float_flag_inexact, status); 2271 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2272 } 2273 if ( zExp < 0 ) { 2274 if (status->flush_to_zero) { 2275 float_raise(float_flag_output_denormal, status); 2276 return packFloat32(zSign, 0, 0); 2277 } 2278 isTiny = 2279 (status->float_detect_tininess 2280 == float_tininess_before_rounding) 2281 || ( zExp < -1 ) 2282 || ( zSig + roundIncrement < 0x80000000 ); 2283 shift32RightJamming( zSig, - zExp, &zSig ); 2284 zExp = 0; 2285 roundBits = zSig & 0x7F; 2286 if (isTiny && roundBits) { 2287 float_raise(float_flag_underflow, status); 2288 } 2289 } 2290 } 2291 if (roundBits) { 2292 status->float_exception_flags |= float_flag_inexact; 2293 } 2294 zSig = ( zSig + roundIncrement )>>7; 2295 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2296 if ( zSig == 0 ) zExp = 0; 2297 return packFloat32( zSign, zExp, zSig ); 2298 2299 } 2300 2301 /*---------------------------------------------------------------------------- 2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2303 | and significand `zSig', and returns the proper single-precision floating- 2304 | point value corresponding to the abstract input. This routine is just like 2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2307 | floating-point exponent. 2308 *----------------------------------------------------------------------------*/ 2309 2310 static float32 2311 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2312 float_status *status) 2313 { 2314 int8_t shiftCount; 2315 2316 shiftCount = countLeadingZeros32( zSig ) - 1; 2317 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2318 status); 2319 2320 } 2321 2322 /*---------------------------------------------------------------------------- 2323 | If `a' is denormal and we are in flush-to-zero mode then set the 2324 | input-denormal exception and return zero. Otherwise just return the value. 2325 *----------------------------------------------------------------------------*/ 2326 float64 float64_squash_input_denormal(float64 a, float_status *status) 2327 { 2328 if (status->flush_inputs_to_zero) { 2329 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2330 float_raise(float_flag_input_denormal, status); 2331 return make_float64(float64_val(a) & (1ULL << 63)); 2332 } 2333 } 2334 return a; 2335 } 2336 2337 /*---------------------------------------------------------------------------- 2338 | Normalizes the subnormal double-precision floating-point value represented 2339 | by the denormalized significand `aSig'. The normalized exponent and 2340 | significand are stored at the locations pointed to by `zExpPtr' and 2341 | `zSigPtr', respectively. 2342 *----------------------------------------------------------------------------*/ 2343 2344 static void 2345 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2346 { 2347 int8_t shiftCount; 2348 2349 shiftCount = countLeadingZeros64( aSig ) - 11; 2350 *zSigPtr = aSig<<shiftCount; 2351 *zExpPtr = 1 - shiftCount; 2352 2353 } 2354 2355 /*---------------------------------------------------------------------------- 2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2357 | double-precision floating-point value, returning the result. After being 2358 | shifted into the proper positions, the three fields are simply added 2359 | together to form the result. This means that any integer portion of `zSig' 2360 | will be added into the exponent. Since a properly normalized significand 2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2362 | than the desired result exponent whenever `zSig' is a complete, normalized 2363 | significand. 2364 *----------------------------------------------------------------------------*/ 2365 2366 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2367 { 2368 2369 return make_float64( 2370 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2371 2372 } 2373 2374 /*---------------------------------------------------------------------------- 2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2376 | and significand `zSig', and returns the proper double-precision floating- 2377 | point value corresponding to the abstract input. Ordinarily, the abstract 2378 | value is simply rounded and packed into the double-precision format, with 2379 | the inexact exception raised if the abstract input cannot be represented 2380 | exactly. However, if the abstract value is too large, the overflow and 2381 | inexact exceptions are raised and an infinity or maximal finite value is 2382 | returned. If the abstract value is too small, the input value is rounded to 2383 | a subnormal number, and the underflow and inexact exceptions are raised if 2384 | the abstract input cannot be represented exactly as a subnormal double- 2385 | precision floating-point number. 2386 | The input significand `zSig' has its binary point between bits 62 2387 | and 61, which is 10 bits to the left of the usual location. This shifted 2388 | significand must be normalized or smaller. If `zSig' is not normalized, 2389 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2390 | and it must not require rounding. In the usual case that `zSig' is 2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2393 | Binary Floating-Point Arithmetic. 2394 *----------------------------------------------------------------------------*/ 2395 2396 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2397 float_status *status) 2398 { 2399 int8_t roundingMode; 2400 flag roundNearestEven; 2401 int roundIncrement, roundBits; 2402 flag isTiny; 2403 2404 roundingMode = status->float_rounding_mode; 2405 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2406 switch (roundingMode) { 2407 case float_round_nearest_even: 2408 case float_round_ties_away: 2409 roundIncrement = 0x200; 2410 break; 2411 case float_round_to_zero: 2412 roundIncrement = 0; 2413 break; 2414 case float_round_up: 2415 roundIncrement = zSign ? 0 : 0x3ff; 2416 break; 2417 case float_round_down: 2418 roundIncrement = zSign ? 0x3ff : 0; 2419 break; 2420 case float_round_to_odd: 2421 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2422 break; 2423 default: 2424 abort(); 2425 } 2426 roundBits = zSig & 0x3FF; 2427 if ( 0x7FD <= (uint16_t) zExp ) { 2428 if ( ( 0x7FD < zExp ) 2429 || ( ( zExp == 0x7FD ) 2430 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2431 ) { 2432 bool overflow_to_inf = roundingMode != float_round_to_odd && 2433 roundIncrement != 0; 2434 float_raise(float_flag_overflow | float_flag_inexact, status); 2435 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2436 } 2437 if ( zExp < 0 ) { 2438 if (status->flush_to_zero) { 2439 float_raise(float_flag_output_denormal, status); 2440 return packFloat64(zSign, 0, 0); 2441 } 2442 isTiny = 2443 (status->float_detect_tininess 2444 == float_tininess_before_rounding) 2445 || ( zExp < -1 ) 2446 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2447 shift64RightJamming( zSig, - zExp, &zSig ); 2448 zExp = 0; 2449 roundBits = zSig & 0x3FF; 2450 if (isTiny && roundBits) { 2451 float_raise(float_flag_underflow, status); 2452 } 2453 if (roundingMode == float_round_to_odd) { 2454 /* 2455 * For round-to-odd case, the roundIncrement depends on 2456 * zSig which just changed. 2457 */ 2458 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2459 } 2460 } 2461 } 2462 if (roundBits) { 2463 status->float_exception_flags |= float_flag_inexact; 2464 } 2465 zSig = ( zSig + roundIncrement )>>10; 2466 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2467 if ( zSig == 0 ) zExp = 0; 2468 return packFloat64( zSign, zExp, zSig ); 2469 2470 } 2471 2472 /*---------------------------------------------------------------------------- 2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2474 | and significand `zSig', and returns the proper double-precision floating- 2475 | point value corresponding to the abstract input. This routine is just like 2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2478 | floating-point exponent. 2479 *----------------------------------------------------------------------------*/ 2480 2481 static float64 2482 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2483 float_status *status) 2484 { 2485 int8_t shiftCount; 2486 2487 shiftCount = countLeadingZeros64( zSig ) - 1; 2488 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2489 status); 2490 2491 } 2492 2493 /*---------------------------------------------------------------------------- 2494 | Returns the fraction bits of the extended double-precision floating-point 2495 | value `a'. 2496 *----------------------------------------------------------------------------*/ 2497 2498 static inline uint64_t extractFloatx80Frac( floatx80 a ) 2499 { 2500 2501 return a.low; 2502 2503 } 2504 2505 /*---------------------------------------------------------------------------- 2506 | Returns the exponent bits of the extended double-precision floating-point 2507 | value `a'. 2508 *----------------------------------------------------------------------------*/ 2509 2510 static inline int32_t extractFloatx80Exp( floatx80 a ) 2511 { 2512 2513 return a.high & 0x7FFF; 2514 2515 } 2516 2517 /*---------------------------------------------------------------------------- 2518 | Returns the sign bit of the extended double-precision floating-point value 2519 | `a'. 2520 *----------------------------------------------------------------------------*/ 2521 2522 static inline flag extractFloatx80Sign( floatx80 a ) 2523 { 2524 2525 return a.high>>15; 2526 2527 } 2528 2529 /*---------------------------------------------------------------------------- 2530 | Normalizes the subnormal extended double-precision floating-point value 2531 | represented by the denormalized significand `aSig'. The normalized exponent 2532 | and significand are stored at the locations pointed to by `zExpPtr' and 2533 | `zSigPtr', respectively. 2534 *----------------------------------------------------------------------------*/ 2535 2536 static void 2537 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 2538 { 2539 int8_t shiftCount; 2540 2541 shiftCount = countLeadingZeros64( aSig ); 2542 *zSigPtr = aSig<<shiftCount; 2543 *zExpPtr = 1 - shiftCount; 2544 2545 } 2546 2547 /*---------------------------------------------------------------------------- 2548 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 2549 | extended double-precision floating-point value, returning the result. 2550 *----------------------------------------------------------------------------*/ 2551 2552 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 2553 { 2554 floatx80 z; 2555 2556 z.low = zSig; 2557 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 2558 return z; 2559 2560 } 2561 2562 /*---------------------------------------------------------------------------- 2563 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2564 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2565 | and returns the proper extended double-precision floating-point value 2566 | corresponding to the abstract input. Ordinarily, the abstract value is 2567 | rounded and packed into the extended double-precision format, with the 2568 | inexact exception raised if the abstract input cannot be represented 2569 | exactly. However, if the abstract value is too large, the overflow and 2570 | inexact exceptions are raised and an infinity or maximal finite value is 2571 | returned. If the abstract value is too small, the input value is rounded to 2572 | a subnormal number, and the underflow and inexact exceptions are raised if 2573 | the abstract input cannot be represented exactly as a subnormal extended 2574 | double-precision floating-point number. 2575 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2576 | number of bits as single or double precision, respectively. Otherwise, the 2577 | result is rounded to the full precision of the extended double-precision 2578 | format. 2579 | The input significand must be normalized or smaller. If the input 2580 | significand is not normalized, `zExp' must be 0; in that case, the result 2581 | returned is a subnormal number, and it must not require rounding. The 2582 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2583 | Floating-Point Arithmetic. 2584 *----------------------------------------------------------------------------*/ 2585 2586 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2587 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2588 float_status *status) 2589 { 2590 int8_t roundingMode; 2591 flag roundNearestEven, increment, isTiny; 2592 int64_t roundIncrement, roundMask, roundBits; 2593 2594 roundingMode = status->float_rounding_mode; 2595 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2596 if ( roundingPrecision == 80 ) goto precision80; 2597 if ( roundingPrecision == 64 ) { 2598 roundIncrement = LIT64( 0x0000000000000400 ); 2599 roundMask = LIT64( 0x00000000000007FF ); 2600 } 2601 else if ( roundingPrecision == 32 ) { 2602 roundIncrement = LIT64( 0x0000008000000000 ); 2603 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2604 } 2605 else { 2606 goto precision80; 2607 } 2608 zSig0 |= ( zSig1 != 0 ); 2609 switch (roundingMode) { 2610 case float_round_nearest_even: 2611 case float_round_ties_away: 2612 break; 2613 case float_round_to_zero: 2614 roundIncrement = 0; 2615 break; 2616 case float_round_up: 2617 roundIncrement = zSign ? 0 : roundMask; 2618 break; 2619 case float_round_down: 2620 roundIncrement = zSign ? roundMask : 0; 2621 break; 2622 default: 2623 abort(); 2624 } 2625 roundBits = zSig0 & roundMask; 2626 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2627 if ( ( 0x7FFE < zExp ) 2628 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2629 ) { 2630 goto overflow; 2631 } 2632 if ( zExp <= 0 ) { 2633 if (status->flush_to_zero) { 2634 float_raise(float_flag_output_denormal, status); 2635 return packFloatx80(zSign, 0, 0); 2636 } 2637 isTiny = 2638 (status->float_detect_tininess 2639 == float_tininess_before_rounding) 2640 || ( zExp < 0 ) 2641 || ( zSig0 <= zSig0 + roundIncrement ); 2642 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2643 zExp = 0; 2644 roundBits = zSig0 & roundMask; 2645 if (isTiny && roundBits) { 2646 float_raise(float_flag_underflow, status); 2647 } 2648 if (roundBits) { 2649 status->float_exception_flags |= float_flag_inexact; 2650 } 2651 zSig0 += roundIncrement; 2652 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2653 roundIncrement = roundMask + 1; 2654 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2655 roundMask |= roundIncrement; 2656 } 2657 zSig0 &= ~ roundMask; 2658 return packFloatx80( zSign, zExp, zSig0 ); 2659 } 2660 } 2661 if (roundBits) { 2662 status->float_exception_flags |= float_flag_inexact; 2663 } 2664 zSig0 += roundIncrement; 2665 if ( zSig0 < roundIncrement ) { 2666 ++zExp; 2667 zSig0 = LIT64( 0x8000000000000000 ); 2668 } 2669 roundIncrement = roundMask + 1; 2670 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2671 roundMask |= roundIncrement; 2672 } 2673 zSig0 &= ~ roundMask; 2674 if ( zSig0 == 0 ) zExp = 0; 2675 return packFloatx80( zSign, zExp, zSig0 ); 2676 precision80: 2677 switch (roundingMode) { 2678 case float_round_nearest_even: 2679 case float_round_ties_away: 2680 increment = ((int64_t)zSig1 < 0); 2681 break; 2682 case float_round_to_zero: 2683 increment = 0; 2684 break; 2685 case float_round_up: 2686 increment = !zSign && zSig1; 2687 break; 2688 case float_round_down: 2689 increment = zSign && zSig1; 2690 break; 2691 default: 2692 abort(); 2693 } 2694 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2695 if ( ( 0x7FFE < zExp ) 2696 || ( ( zExp == 0x7FFE ) 2697 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2698 && increment 2699 ) 2700 ) { 2701 roundMask = 0; 2702 overflow: 2703 float_raise(float_flag_overflow | float_flag_inexact, status); 2704 if ( ( roundingMode == float_round_to_zero ) 2705 || ( zSign && ( roundingMode == float_round_up ) ) 2706 || ( ! zSign && ( roundingMode == float_round_down ) ) 2707 ) { 2708 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2709 } 2710 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2711 } 2712 if ( zExp <= 0 ) { 2713 isTiny = 2714 (status->float_detect_tininess 2715 == float_tininess_before_rounding) 2716 || ( zExp < 0 ) 2717 || ! increment 2718 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2719 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2720 zExp = 0; 2721 if (isTiny && zSig1) { 2722 float_raise(float_flag_underflow, status); 2723 } 2724 if (zSig1) { 2725 status->float_exception_flags |= float_flag_inexact; 2726 } 2727 switch (roundingMode) { 2728 case float_round_nearest_even: 2729 case float_round_ties_away: 2730 increment = ((int64_t)zSig1 < 0); 2731 break; 2732 case float_round_to_zero: 2733 increment = 0; 2734 break; 2735 case float_round_up: 2736 increment = !zSign && zSig1; 2737 break; 2738 case float_round_down: 2739 increment = zSign && zSig1; 2740 break; 2741 default: 2742 abort(); 2743 } 2744 if ( increment ) { 2745 ++zSig0; 2746 zSig0 &= 2747 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2748 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2749 } 2750 return packFloatx80( zSign, zExp, zSig0 ); 2751 } 2752 } 2753 if (zSig1) { 2754 status->float_exception_flags |= float_flag_inexact; 2755 } 2756 if ( increment ) { 2757 ++zSig0; 2758 if ( zSig0 == 0 ) { 2759 ++zExp; 2760 zSig0 = LIT64( 0x8000000000000000 ); 2761 } 2762 else { 2763 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2764 } 2765 } 2766 else { 2767 if ( zSig0 == 0 ) zExp = 0; 2768 } 2769 return packFloatx80( zSign, zExp, zSig0 ); 2770 2771 } 2772 2773 /*---------------------------------------------------------------------------- 2774 | Takes an abstract floating-point value having sign `zSign', exponent 2775 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2776 | and returns the proper extended double-precision floating-point value 2777 | corresponding to the abstract input. This routine is just like 2778 | `roundAndPackFloatx80' except that the input significand does not have to be 2779 | normalized. 2780 *----------------------------------------------------------------------------*/ 2781 2782 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2783 flag zSign, int32_t zExp, 2784 uint64_t zSig0, uint64_t zSig1, 2785 float_status *status) 2786 { 2787 int8_t shiftCount; 2788 2789 if ( zSig0 == 0 ) { 2790 zSig0 = zSig1; 2791 zSig1 = 0; 2792 zExp -= 64; 2793 } 2794 shiftCount = countLeadingZeros64( zSig0 ); 2795 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2796 zExp -= shiftCount; 2797 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2798 zSig0, zSig1, status); 2799 2800 } 2801 2802 /*---------------------------------------------------------------------------- 2803 | Returns the least-significant 64 fraction bits of the quadruple-precision 2804 | floating-point value `a'. 2805 *----------------------------------------------------------------------------*/ 2806 2807 static inline uint64_t extractFloat128Frac1( float128 a ) 2808 { 2809 2810 return a.low; 2811 2812 } 2813 2814 /*---------------------------------------------------------------------------- 2815 | Returns the most-significant 48 fraction bits of the quadruple-precision 2816 | floating-point value `a'. 2817 *----------------------------------------------------------------------------*/ 2818 2819 static inline uint64_t extractFloat128Frac0( float128 a ) 2820 { 2821 2822 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2823 2824 } 2825 2826 /*---------------------------------------------------------------------------- 2827 | Returns the exponent bits of the quadruple-precision floating-point value 2828 | `a'. 2829 *----------------------------------------------------------------------------*/ 2830 2831 static inline int32_t extractFloat128Exp( float128 a ) 2832 { 2833 2834 return ( a.high>>48 ) & 0x7FFF; 2835 2836 } 2837 2838 /*---------------------------------------------------------------------------- 2839 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2840 *----------------------------------------------------------------------------*/ 2841 2842 static inline flag extractFloat128Sign( float128 a ) 2843 { 2844 2845 return a.high>>63; 2846 2847 } 2848 2849 /*---------------------------------------------------------------------------- 2850 | Normalizes the subnormal quadruple-precision floating-point value 2851 | represented by the denormalized significand formed by the concatenation of 2852 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2853 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2854 | significand are stored at the location pointed to by `zSig0Ptr', and the 2855 | least significant 64 bits of the normalized significand are stored at the 2856 | location pointed to by `zSig1Ptr'. 2857 *----------------------------------------------------------------------------*/ 2858 2859 static void 2860 normalizeFloat128Subnormal( 2861 uint64_t aSig0, 2862 uint64_t aSig1, 2863 int32_t *zExpPtr, 2864 uint64_t *zSig0Ptr, 2865 uint64_t *zSig1Ptr 2866 ) 2867 { 2868 int8_t shiftCount; 2869 2870 if ( aSig0 == 0 ) { 2871 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2872 if ( shiftCount < 0 ) { 2873 *zSig0Ptr = aSig1>>( - shiftCount ); 2874 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2875 } 2876 else { 2877 *zSig0Ptr = aSig1<<shiftCount; 2878 *zSig1Ptr = 0; 2879 } 2880 *zExpPtr = - shiftCount - 63; 2881 } 2882 else { 2883 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2884 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2885 *zExpPtr = 1 - shiftCount; 2886 } 2887 2888 } 2889 2890 /*---------------------------------------------------------------------------- 2891 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2892 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2893 | floating-point value, returning the result. After being shifted into the 2894 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2895 | added together to form the most significant 32 bits of the result. This 2896 | means that any integer portion of `zSig0' will be added into the exponent. 2897 | Since a properly normalized significand will have an integer portion equal 2898 | to 1, the `zExp' input should be 1 less than the desired result exponent 2899 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2900 | significand. 2901 *----------------------------------------------------------------------------*/ 2902 2903 static inline float128 2904 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2905 { 2906 float128 z; 2907 2908 z.low = zSig1; 2909 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2910 return z; 2911 2912 } 2913 2914 /*---------------------------------------------------------------------------- 2915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2916 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2917 | and `zSig2', and returns the proper quadruple-precision floating-point value 2918 | corresponding to the abstract input. Ordinarily, the abstract value is 2919 | simply rounded and packed into the quadruple-precision format, with the 2920 | inexact exception raised if the abstract input cannot be represented 2921 | exactly. However, if the abstract value is too large, the overflow and 2922 | inexact exceptions are raised and an infinity or maximal finite value is 2923 | returned. If the abstract value is too small, the input value is rounded to 2924 | a subnormal number, and the underflow and inexact exceptions are raised if 2925 | the abstract input cannot be represented exactly as a subnormal quadruple- 2926 | precision floating-point number. 2927 | The input significand must be normalized or smaller. If the input 2928 | significand is not normalized, `zExp' must be 0; in that case, the result 2929 | returned is a subnormal number, and it must not require rounding. In the 2930 | usual case that the input significand is normalized, `zExp' must be 1 less 2931 | than the ``true'' floating-point exponent. The handling of underflow and 2932 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2933 *----------------------------------------------------------------------------*/ 2934 2935 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2936 uint64_t zSig0, uint64_t zSig1, 2937 uint64_t zSig2, float_status *status) 2938 { 2939 int8_t roundingMode; 2940 flag roundNearestEven, increment, isTiny; 2941 2942 roundingMode = status->float_rounding_mode; 2943 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2944 switch (roundingMode) { 2945 case float_round_nearest_even: 2946 case float_round_ties_away: 2947 increment = ((int64_t)zSig2 < 0); 2948 break; 2949 case float_round_to_zero: 2950 increment = 0; 2951 break; 2952 case float_round_up: 2953 increment = !zSign && zSig2; 2954 break; 2955 case float_round_down: 2956 increment = zSign && zSig2; 2957 break; 2958 case float_round_to_odd: 2959 increment = !(zSig1 & 0x1) && zSig2; 2960 break; 2961 default: 2962 abort(); 2963 } 2964 if ( 0x7FFD <= (uint32_t) zExp ) { 2965 if ( ( 0x7FFD < zExp ) 2966 || ( ( zExp == 0x7FFD ) 2967 && eq128( 2968 LIT64( 0x0001FFFFFFFFFFFF ), 2969 LIT64( 0xFFFFFFFFFFFFFFFF ), 2970 zSig0, 2971 zSig1 2972 ) 2973 && increment 2974 ) 2975 ) { 2976 float_raise(float_flag_overflow | float_flag_inexact, status); 2977 if ( ( roundingMode == float_round_to_zero ) 2978 || ( zSign && ( roundingMode == float_round_up ) ) 2979 || ( ! zSign && ( roundingMode == float_round_down ) ) 2980 || (roundingMode == float_round_to_odd) 2981 ) { 2982 return 2983 packFloat128( 2984 zSign, 2985 0x7FFE, 2986 LIT64( 0x0000FFFFFFFFFFFF ), 2987 LIT64( 0xFFFFFFFFFFFFFFFF ) 2988 ); 2989 } 2990 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2991 } 2992 if ( zExp < 0 ) { 2993 if (status->flush_to_zero) { 2994 float_raise(float_flag_output_denormal, status); 2995 return packFloat128(zSign, 0, 0, 0); 2996 } 2997 isTiny = 2998 (status->float_detect_tininess 2999 == float_tininess_before_rounding) 3000 || ( zExp < -1 ) 3001 || ! increment 3002 || lt128( 3003 zSig0, 3004 zSig1, 3005 LIT64( 0x0001FFFFFFFFFFFF ), 3006 LIT64( 0xFFFFFFFFFFFFFFFF ) 3007 ); 3008 shift128ExtraRightJamming( 3009 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 3010 zExp = 0; 3011 if (isTiny && zSig2) { 3012 float_raise(float_flag_underflow, status); 3013 } 3014 switch (roundingMode) { 3015 case float_round_nearest_even: 3016 case float_round_ties_away: 3017 increment = ((int64_t)zSig2 < 0); 3018 break; 3019 case float_round_to_zero: 3020 increment = 0; 3021 break; 3022 case float_round_up: 3023 increment = !zSign && zSig2; 3024 break; 3025 case float_round_down: 3026 increment = zSign && zSig2; 3027 break; 3028 case float_round_to_odd: 3029 increment = !(zSig1 & 0x1) && zSig2; 3030 break; 3031 default: 3032 abort(); 3033 } 3034 } 3035 } 3036 if (zSig2) { 3037 status->float_exception_flags |= float_flag_inexact; 3038 } 3039 if ( increment ) { 3040 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 3041 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 3042 } 3043 else { 3044 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 3045 } 3046 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3047 3048 } 3049 3050 /*---------------------------------------------------------------------------- 3051 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3052 | and significand formed by the concatenation of `zSig0' and `zSig1', and 3053 | returns the proper quadruple-precision floating-point value corresponding 3054 | to the abstract input. This routine is just like `roundAndPackFloat128' 3055 | except that the input significand has fewer bits and does not have to be 3056 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 3057 | point exponent. 3058 *----------------------------------------------------------------------------*/ 3059 3060 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3061 uint64_t zSig0, uint64_t zSig1, 3062 float_status *status) 3063 { 3064 int8_t shiftCount; 3065 uint64_t zSig2; 3066 3067 if ( zSig0 == 0 ) { 3068 zSig0 = zSig1; 3069 zSig1 = 0; 3070 zExp -= 64; 3071 } 3072 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3073 if ( 0 <= shiftCount ) { 3074 zSig2 = 0; 3075 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3076 } 3077 else { 3078 shift128ExtraRightJamming( 3079 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3080 } 3081 zExp -= shiftCount; 3082 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3083 3084 } 3085 3086 3087 /*---------------------------------------------------------------------------- 3088 | Returns the result of converting the 32-bit two's complement integer `a' 3089 | to the extended double-precision floating-point format. The conversion 3090 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3091 | Arithmetic. 3092 *----------------------------------------------------------------------------*/ 3093 3094 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3095 { 3096 flag zSign; 3097 uint32_t absA; 3098 int8_t shiftCount; 3099 uint64_t zSig; 3100 3101 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3102 zSign = ( a < 0 ); 3103 absA = zSign ? - a : a; 3104 shiftCount = countLeadingZeros32( absA ) + 32; 3105 zSig = absA; 3106 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3107 3108 } 3109 3110 /*---------------------------------------------------------------------------- 3111 | Returns the result of converting the 32-bit two's complement integer `a' to 3112 | the quadruple-precision floating-point format. The conversion is performed 3113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3114 *----------------------------------------------------------------------------*/ 3115 3116 float128 int32_to_float128(int32_t a, float_status *status) 3117 { 3118 flag zSign; 3119 uint32_t absA; 3120 int8_t shiftCount; 3121 uint64_t zSig0; 3122 3123 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3124 zSign = ( a < 0 ); 3125 absA = zSign ? - a : a; 3126 shiftCount = countLeadingZeros32( absA ) + 17; 3127 zSig0 = absA; 3128 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3129 3130 } 3131 3132 /*---------------------------------------------------------------------------- 3133 | Returns the result of converting the 64-bit two's complement integer `a' 3134 | to the extended double-precision floating-point format. The conversion 3135 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3136 | Arithmetic. 3137 *----------------------------------------------------------------------------*/ 3138 3139 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3140 { 3141 flag zSign; 3142 uint64_t absA; 3143 int8_t shiftCount; 3144 3145 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3146 zSign = ( a < 0 ); 3147 absA = zSign ? - a : a; 3148 shiftCount = countLeadingZeros64( absA ); 3149 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3150 3151 } 3152 3153 /*---------------------------------------------------------------------------- 3154 | Returns the result of converting the 64-bit two's complement integer `a' to 3155 | the quadruple-precision floating-point format. The conversion is performed 3156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3157 *----------------------------------------------------------------------------*/ 3158 3159 float128 int64_to_float128(int64_t a, float_status *status) 3160 { 3161 flag zSign; 3162 uint64_t absA; 3163 int8_t shiftCount; 3164 int32_t zExp; 3165 uint64_t zSig0, zSig1; 3166 3167 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3168 zSign = ( a < 0 ); 3169 absA = zSign ? - a : a; 3170 shiftCount = countLeadingZeros64( absA ) + 49; 3171 zExp = 0x406E - shiftCount; 3172 if ( 64 <= shiftCount ) { 3173 zSig1 = 0; 3174 zSig0 = absA; 3175 shiftCount -= 64; 3176 } 3177 else { 3178 zSig1 = absA; 3179 zSig0 = 0; 3180 } 3181 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3182 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3183 3184 } 3185 3186 /*---------------------------------------------------------------------------- 3187 | Returns the result of converting the 64-bit unsigned integer `a' 3188 | to the quadruple-precision floating-point format. The conversion is performed 3189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3190 *----------------------------------------------------------------------------*/ 3191 3192 float128 uint64_to_float128(uint64_t a, float_status *status) 3193 { 3194 if (a == 0) { 3195 return float128_zero; 3196 } 3197 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3198 } 3199 3200 3201 3202 3203 /*---------------------------------------------------------------------------- 3204 | Returns the result of converting the single-precision floating-point value 3205 | `a' to the double-precision floating-point format. The conversion is 3206 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3207 | Arithmetic. 3208 *----------------------------------------------------------------------------*/ 3209 3210 float64 float32_to_float64(float32 a, float_status *status) 3211 { 3212 flag aSign; 3213 int aExp; 3214 uint32_t aSig; 3215 a = float32_squash_input_denormal(a, status); 3216 3217 aSig = extractFloat32Frac( a ); 3218 aExp = extractFloat32Exp( a ); 3219 aSign = extractFloat32Sign( a ); 3220 if ( aExp == 0xFF ) { 3221 if (aSig) { 3222 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3223 } 3224 return packFloat64( aSign, 0x7FF, 0 ); 3225 } 3226 if ( aExp == 0 ) { 3227 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3228 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3229 --aExp; 3230 } 3231 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3232 3233 } 3234 3235 /*---------------------------------------------------------------------------- 3236 | Returns the result of converting the single-precision floating-point value 3237 | `a' to the extended double-precision floating-point format. The conversion 3238 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3239 | Arithmetic. 3240 *----------------------------------------------------------------------------*/ 3241 3242 floatx80 float32_to_floatx80(float32 a, float_status *status) 3243 { 3244 flag aSign; 3245 int aExp; 3246 uint32_t aSig; 3247 3248 a = float32_squash_input_denormal(a, status); 3249 aSig = extractFloat32Frac( a ); 3250 aExp = extractFloat32Exp( a ); 3251 aSign = extractFloat32Sign( a ); 3252 if ( aExp == 0xFF ) { 3253 if (aSig) { 3254 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3255 } 3256 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3257 } 3258 if ( aExp == 0 ) { 3259 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3260 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3261 } 3262 aSig |= 0x00800000; 3263 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3264 3265 } 3266 3267 /*---------------------------------------------------------------------------- 3268 | Returns the result of converting the single-precision floating-point value 3269 | `a' to the double-precision floating-point format. The conversion is 3270 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3271 | Arithmetic. 3272 *----------------------------------------------------------------------------*/ 3273 3274 float128 float32_to_float128(float32 a, float_status *status) 3275 { 3276 flag aSign; 3277 int aExp; 3278 uint32_t aSig; 3279 3280 a = float32_squash_input_denormal(a, status); 3281 aSig = extractFloat32Frac( a ); 3282 aExp = extractFloat32Exp( a ); 3283 aSign = extractFloat32Sign( a ); 3284 if ( aExp == 0xFF ) { 3285 if (aSig) { 3286 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3287 } 3288 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3289 } 3290 if ( aExp == 0 ) { 3291 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3292 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3293 --aExp; 3294 } 3295 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3296 3297 } 3298 3299 /*---------------------------------------------------------------------------- 3300 | Returns the remainder of the single-precision floating-point value `a' 3301 | with respect to the corresponding value `b'. The operation is performed 3302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3303 *----------------------------------------------------------------------------*/ 3304 3305 float32 float32_rem(float32 a, float32 b, float_status *status) 3306 { 3307 flag aSign, zSign; 3308 int aExp, bExp, expDiff; 3309 uint32_t aSig, bSig; 3310 uint32_t q; 3311 uint64_t aSig64, bSig64, q64; 3312 uint32_t alternateASig; 3313 int32_t sigMean; 3314 a = float32_squash_input_denormal(a, status); 3315 b = float32_squash_input_denormal(b, status); 3316 3317 aSig = extractFloat32Frac( a ); 3318 aExp = extractFloat32Exp( a ); 3319 aSign = extractFloat32Sign( a ); 3320 bSig = extractFloat32Frac( b ); 3321 bExp = extractFloat32Exp( b ); 3322 if ( aExp == 0xFF ) { 3323 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3324 return propagateFloat32NaN(a, b, status); 3325 } 3326 float_raise(float_flag_invalid, status); 3327 return float32_default_nan(status); 3328 } 3329 if ( bExp == 0xFF ) { 3330 if (bSig) { 3331 return propagateFloat32NaN(a, b, status); 3332 } 3333 return a; 3334 } 3335 if ( bExp == 0 ) { 3336 if ( bSig == 0 ) { 3337 float_raise(float_flag_invalid, status); 3338 return float32_default_nan(status); 3339 } 3340 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3341 } 3342 if ( aExp == 0 ) { 3343 if ( aSig == 0 ) return a; 3344 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3345 } 3346 expDiff = aExp - bExp; 3347 aSig |= 0x00800000; 3348 bSig |= 0x00800000; 3349 if ( expDiff < 32 ) { 3350 aSig <<= 8; 3351 bSig <<= 8; 3352 if ( expDiff < 0 ) { 3353 if ( expDiff < -1 ) return a; 3354 aSig >>= 1; 3355 } 3356 q = ( bSig <= aSig ); 3357 if ( q ) aSig -= bSig; 3358 if ( 0 < expDiff ) { 3359 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3360 q >>= 32 - expDiff; 3361 bSig >>= 2; 3362 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3363 } 3364 else { 3365 aSig >>= 2; 3366 bSig >>= 2; 3367 } 3368 } 3369 else { 3370 if ( bSig <= aSig ) aSig -= bSig; 3371 aSig64 = ( (uint64_t) aSig )<<40; 3372 bSig64 = ( (uint64_t) bSig )<<40; 3373 expDiff -= 64; 3374 while ( 0 < expDiff ) { 3375 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3376 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3377 aSig64 = - ( ( bSig * q64 )<<38 ); 3378 expDiff -= 62; 3379 } 3380 expDiff += 64; 3381 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3382 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3383 q = q64>>( 64 - expDiff ); 3384 bSig <<= 6; 3385 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3386 } 3387 do { 3388 alternateASig = aSig; 3389 ++q; 3390 aSig -= bSig; 3391 } while ( 0 <= (int32_t) aSig ); 3392 sigMean = aSig + alternateASig; 3393 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3394 aSig = alternateASig; 3395 } 3396 zSign = ( (int32_t) aSig < 0 ); 3397 if ( zSign ) aSig = - aSig; 3398 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3399 } 3400 3401 3402 3403 /*---------------------------------------------------------------------------- 3404 | Returns the binary exponential of the single-precision floating-point value 3405 | `a'. The operation is performed according to the IEC/IEEE Standard for 3406 | Binary Floating-Point Arithmetic. 3407 | 3408 | Uses the following identities: 3409 | 3410 | 1. ------------------------------------------------------------------------- 3411 | x x*ln(2) 3412 | 2 = e 3413 | 3414 | 2. ------------------------------------------------------------------------- 3415 | 2 3 4 5 n 3416 | x x x x x x x 3417 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3418 | 1! 2! 3! 4! 5! n! 3419 *----------------------------------------------------------------------------*/ 3420 3421 static const float64 float32_exp2_coefficients[15] = 3422 { 3423 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3424 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3425 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3426 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3427 const_float64( 0x3f81111111111111ll ), /* 5 */ 3428 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3429 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3430 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3431 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3432 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3433 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3434 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3435 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3436 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3437 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3438 }; 3439 3440 float32 float32_exp2(float32 a, float_status *status) 3441 { 3442 flag aSign; 3443 int aExp; 3444 uint32_t aSig; 3445 float64 r, x, xn; 3446 int i; 3447 a = float32_squash_input_denormal(a, status); 3448 3449 aSig = extractFloat32Frac( a ); 3450 aExp = extractFloat32Exp( a ); 3451 aSign = extractFloat32Sign( a ); 3452 3453 if ( aExp == 0xFF) { 3454 if (aSig) { 3455 return propagateFloat32NaN(a, float32_zero, status); 3456 } 3457 return (aSign) ? float32_zero : a; 3458 } 3459 if (aExp == 0) { 3460 if (aSig == 0) return float32_one; 3461 } 3462 3463 float_raise(float_flag_inexact, status); 3464 3465 /* ******************************* */ 3466 /* using float64 for approximation */ 3467 /* ******************************* */ 3468 x = float32_to_float64(a, status); 3469 x = float64_mul(x, float64_ln2, status); 3470 3471 xn = x; 3472 r = float64_one; 3473 for (i = 0 ; i < 15 ; i++) { 3474 float64 f; 3475 3476 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3477 r = float64_add(r, f, status); 3478 3479 xn = float64_mul(xn, x, status); 3480 } 3481 3482 return float64_to_float32(r, status); 3483 } 3484 3485 /*---------------------------------------------------------------------------- 3486 | Returns the binary log of the single-precision floating-point value `a'. 3487 | The operation is performed according to the IEC/IEEE Standard for Binary 3488 | Floating-Point Arithmetic. 3489 *----------------------------------------------------------------------------*/ 3490 float32 float32_log2(float32 a, float_status *status) 3491 { 3492 flag aSign, zSign; 3493 int aExp; 3494 uint32_t aSig, zSig, i; 3495 3496 a = float32_squash_input_denormal(a, status); 3497 aSig = extractFloat32Frac( a ); 3498 aExp = extractFloat32Exp( a ); 3499 aSign = extractFloat32Sign( a ); 3500 3501 if ( aExp == 0 ) { 3502 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3503 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3504 } 3505 if ( aSign ) { 3506 float_raise(float_flag_invalid, status); 3507 return float32_default_nan(status); 3508 } 3509 if ( aExp == 0xFF ) { 3510 if (aSig) { 3511 return propagateFloat32NaN(a, float32_zero, status); 3512 } 3513 return a; 3514 } 3515 3516 aExp -= 0x7F; 3517 aSig |= 0x00800000; 3518 zSign = aExp < 0; 3519 zSig = aExp << 23; 3520 3521 for (i = 1 << 22; i > 0; i >>= 1) { 3522 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3523 if ( aSig & 0x01000000 ) { 3524 aSig >>= 1; 3525 zSig |= i; 3526 } 3527 } 3528 3529 if ( zSign ) 3530 zSig = -zSig; 3531 3532 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3533 } 3534 3535 /*---------------------------------------------------------------------------- 3536 | Returns 1 if the single-precision floating-point value `a' is equal to 3537 | the corresponding value `b', and 0 otherwise. The invalid exception is 3538 | raised if either operand is a NaN. Otherwise, the comparison is performed 3539 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3540 *----------------------------------------------------------------------------*/ 3541 3542 int float32_eq(float32 a, float32 b, float_status *status) 3543 { 3544 uint32_t av, bv; 3545 a = float32_squash_input_denormal(a, status); 3546 b = float32_squash_input_denormal(b, status); 3547 3548 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3549 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3550 ) { 3551 float_raise(float_flag_invalid, status); 3552 return 0; 3553 } 3554 av = float32_val(a); 3555 bv = float32_val(b); 3556 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3557 } 3558 3559 /*---------------------------------------------------------------------------- 3560 | Returns 1 if the single-precision floating-point value `a' is less than 3561 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3562 | exception is raised if either operand is a NaN. The comparison is performed 3563 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3564 *----------------------------------------------------------------------------*/ 3565 3566 int float32_le(float32 a, float32 b, float_status *status) 3567 { 3568 flag aSign, bSign; 3569 uint32_t av, bv; 3570 a = float32_squash_input_denormal(a, status); 3571 b = float32_squash_input_denormal(b, status); 3572 3573 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3574 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3575 ) { 3576 float_raise(float_flag_invalid, status); 3577 return 0; 3578 } 3579 aSign = extractFloat32Sign( a ); 3580 bSign = extractFloat32Sign( b ); 3581 av = float32_val(a); 3582 bv = float32_val(b); 3583 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3584 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3585 3586 } 3587 3588 /*---------------------------------------------------------------------------- 3589 | Returns 1 if the single-precision floating-point value `a' is less than 3590 | the corresponding value `b', and 0 otherwise. The invalid exception is 3591 | raised if either operand is a NaN. The comparison is performed according 3592 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3593 *----------------------------------------------------------------------------*/ 3594 3595 int float32_lt(float32 a, float32 b, float_status *status) 3596 { 3597 flag aSign, bSign; 3598 uint32_t av, bv; 3599 a = float32_squash_input_denormal(a, status); 3600 b = float32_squash_input_denormal(b, status); 3601 3602 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3603 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3604 ) { 3605 float_raise(float_flag_invalid, status); 3606 return 0; 3607 } 3608 aSign = extractFloat32Sign( a ); 3609 bSign = extractFloat32Sign( b ); 3610 av = float32_val(a); 3611 bv = float32_val(b); 3612 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3613 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3614 3615 } 3616 3617 /*---------------------------------------------------------------------------- 3618 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3619 | be compared, and 0 otherwise. The invalid exception is raised if either 3620 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3621 | Standard for Binary Floating-Point Arithmetic. 3622 *----------------------------------------------------------------------------*/ 3623 3624 int float32_unordered(float32 a, float32 b, float_status *status) 3625 { 3626 a = float32_squash_input_denormal(a, status); 3627 b = float32_squash_input_denormal(b, status); 3628 3629 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3630 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3631 ) { 3632 float_raise(float_flag_invalid, status); 3633 return 1; 3634 } 3635 return 0; 3636 } 3637 3638 /*---------------------------------------------------------------------------- 3639 | Returns 1 if the single-precision floating-point value `a' is equal to 3640 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3641 | exception. The comparison is performed according to the IEC/IEEE Standard 3642 | for Binary Floating-Point Arithmetic. 3643 *----------------------------------------------------------------------------*/ 3644 3645 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3646 { 3647 a = float32_squash_input_denormal(a, status); 3648 b = float32_squash_input_denormal(b, status); 3649 3650 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3651 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3652 ) { 3653 if (float32_is_signaling_nan(a, status) 3654 || float32_is_signaling_nan(b, status)) { 3655 float_raise(float_flag_invalid, status); 3656 } 3657 return 0; 3658 } 3659 return ( float32_val(a) == float32_val(b) ) || 3660 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3661 } 3662 3663 /*---------------------------------------------------------------------------- 3664 | Returns 1 if the single-precision floating-point value `a' is less than or 3665 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3666 | cause an exception. Otherwise, the comparison is performed according to the 3667 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3668 *----------------------------------------------------------------------------*/ 3669 3670 int float32_le_quiet(float32 a, float32 b, float_status *status) 3671 { 3672 flag aSign, bSign; 3673 uint32_t av, bv; 3674 a = float32_squash_input_denormal(a, status); 3675 b = float32_squash_input_denormal(b, status); 3676 3677 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3678 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3679 ) { 3680 if (float32_is_signaling_nan(a, status) 3681 || float32_is_signaling_nan(b, status)) { 3682 float_raise(float_flag_invalid, status); 3683 } 3684 return 0; 3685 } 3686 aSign = extractFloat32Sign( a ); 3687 bSign = extractFloat32Sign( b ); 3688 av = float32_val(a); 3689 bv = float32_val(b); 3690 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3691 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3692 3693 } 3694 3695 /*---------------------------------------------------------------------------- 3696 | Returns 1 if the single-precision floating-point value `a' is less than 3697 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3698 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3699 | Standard for Binary Floating-Point Arithmetic. 3700 *----------------------------------------------------------------------------*/ 3701 3702 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3703 { 3704 flag aSign, bSign; 3705 uint32_t av, bv; 3706 a = float32_squash_input_denormal(a, status); 3707 b = float32_squash_input_denormal(b, status); 3708 3709 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3710 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3711 ) { 3712 if (float32_is_signaling_nan(a, status) 3713 || float32_is_signaling_nan(b, status)) { 3714 float_raise(float_flag_invalid, status); 3715 } 3716 return 0; 3717 } 3718 aSign = extractFloat32Sign( a ); 3719 bSign = extractFloat32Sign( b ); 3720 av = float32_val(a); 3721 bv = float32_val(b); 3722 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3723 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3724 3725 } 3726 3727 /*---------------------------------------------------------------------------- 3728 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3729 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3730 | comparison is performed according to the IEC/IEEE Standard for Binary 3731 | Floating-Point Arithmetic. 3732 *----------------------------------------------------------------------------*/ 3733 3734 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3735 { 3736 a = float32_squash_input_denormal(a, status); 3737 b = float32_squash_input_denormal(b, status); 3738 3739 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3740 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3741 ) { 3742 if (float32_is_signaling_nan(a, status) 3743 || float32_is_signaling_nan(b, status)) { 3744 float_raise(float_flag_invalid, status); 3745 } 3746 return 1; 3747 } 3748 return 0; 3749 } 3750 3751 3752 /*---------------------------------------------------------------------------- 3753 | Returns the result of converting the double-precision floating-point value 3754 | `a' to the single-precision floating-point format. The conversion is 3755 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3756 | Arithmetic. 3757 *----------------------------------------------------------------------------*/ 3758 3759 float32 float64_to_float32(float64 a, float_status *status) 3760 { 3761 flag aSign; 3762 int aExp; 3763 uint64_t aSig; 3764 uint32_t zSig; 3765 a = float64_squash_input_denormal(a, status); 3766 3767 aSig = extractFloat64Frac( a ); 3768 aExp = extractFloat64Exp( a ); 3769 aSign = extractFloat64Sign( a ); 3770 if ( aExp == 0x7FF ) { 3771 if (aSig) { 3772 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3773 } 3774 return packFloat32( aSign, 0xFF, 0 ); 3775 } 3776 shift64RightJamming( aSig, 22, &aSig ); 3777 zSig = aSig; 3778 if ( aExp || zSig ) { 3779 zSig |= 0x40000000; 3780 aExp -= 0x381; 3781 } 3782 return roundAndPackFloat32(aSign, aExp, zSig, status); 3783 3784 } 3785 3786 3787 /*---------------------------------------------------------------------------- 3788 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3789 | half-precision floating-point value, returning the result. After being 3790 | shifted into the proper positions, the three fields are simply added 3791 | together to form the result. This means that any integer portion of `zSig' 3792 | will be added into the exponent. Since a properly normalized significand 3793 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3794 | than the desired result exponent whenever `zSig' is a complete, normalized 3795 | significand. 3796 *----------------------------------------------------------------------------*/ 3797 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3798 { 3799 return make_float16( 3800 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3801 } 3802 3803 /*---------------------------------------------------------------------------- 3804 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3805 | and significand `zSig', and returns the proper half-precision floating- 3806 | point value corresponding to the abstract input. Ordinarily, the abstract 3807 | value is simply rounded and packed into the half-precision format, with 3808 | the inexact exception raised if the abstract input cannot be represented 3809 | exactly. However, if the abstract value is too large, the overflow and 3810 | inexact exceptions are raised and an infinity or maximal finite value is 3811 | returned. If the abstract value is too small, the input value is rounded to 3812 | a subnormal number, and the underflow and inexact exceptions are raised if 3813 | the abstract input cannot be represented exactly as a subnormal half- 3814 | precision floating-point number. 3815 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3816 | ARM-style "alternative representation", which omits the NaN and Inf 3817 | encodings in order to raise the maximum representable exponent by one. 3818 | The input significand `zSig' has its binary point between bits 22 3819 | and 23, which is 13 bits to the left of the usual location. This shifted 3820 | significand must be normalized or smaller. If `zSig' is not normalized, 3821 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3822 | and it must not require rounding. In the usual case that `zSig' is 3823 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3824 | Note the slightly odd position of the binary point in zSig compared with the 3825 | other roundAndPackFloat functions. This should probably be fixed if we 3826 | need to implement more float16 routines than just conversion. 3827 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3828 | Binary Floating-Point Arithmetic. 3829 *----------------------------------------------------------------------------*/ 3830 3831 static float16 roundAndPackFloat16(flag zSign, int zExp, 3832 uint32_t zSig, flag ieee, 3833 float_status *status) 3834 { 3835 int maxexp = ieee ? 29 : 30; 3836 uint32_t mask; 3837 uint32_t increment; 3838 bool rounding_bumps_exp; 3839 bool is_tiny = false; 3840 3841 /* Calculate the mask of bits of the mantissa which are not 3842 * representable in half-precision and will be lost. 3843 */ 3844 if (zExp < 1) { 3845 /* Will be denormal in halfprec */ 3846 mask = 0x00ffffff; 3847 if (zExp >= -11) { 3848 mask >>= 11 + zExp; 3849 } 3850 } else { 3851 /* Normal number in halfprec */ 3852 mask = 0x00001fff; 3853 } 3854 3855 switch (status->float_rounding_mode) { 3856 case float_round_nearest_even: 3857 increment = (mask + 1) >> 1; 3858 if ((zSig & mask) == increment) { 3859 increment = zSig & (increment << 1); 3860 } 3861 break; 3862 case float_round_ties_away: 3863 increment = (mask + 1) >> 1; 3864 break; 3865 case float_round_up: 3866 increment = zSign ? 0 : mask; 3867 break; 3868 case float_round_down: 3869 increment = zSign ? mask : 0; 3870 break; 3871 default: /* round_to_zero */ 3872 increment = 0; 3873 break; 3874 } 3875 3876 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3877 3878 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3879 if (ieee) { 3880 float_raise(float_flag_overflow | float_flag_inexact, status); 3881 return packFloat16(zSign, 0x1f, 0); 3882 } else { 3883 float_raise(float_flag_invalid, status); 3884 return packFloat16(zSign, 0x1f, 0x3ff); 3885 } 3886 } 3887 3888 if (zExp < 0) { 3889 /* Note that flush-to-zero does not affect half-precision results */ 3890 is_tiny = 3891 (status->float_detect_tininess == float_tininess_before_rounding) 3892 || (zExp < -1) 3893 || (!rounding_bumps_exp); 3894 } 3895 if (zSig & mask) { 3896 float_raise(float_flag_inexact, status); 3897 if (is_tiny) { 3898 float_raise(float_flag_underflow, status); 3899 } 3900 } 3901 3902 zSig += increment; 3903 if (rounding_bumps_exp) { 3904 zSig >>= 1; 3905 zExp++; 3906 } 3907 3908 if (zExp < -10) { 3909 return packFloat16(zSign, 0, 0); 3910 } 3911 if (zExp < 0) { 3912 zSig >>= -zExp; 3913 zExp = 0; 3914 } 3915 return packFloat16(zSign, zExp, zSig >> 13); 3916 } 3917 3918 /*---------------------------------------------------------------------------- 3919 | If `a' is denormal and we are in flush-to-zero mode then set the 3920 | input-denormal exception and return zero. Otherwise just return the value. 3921 *----------------------------------------------------------------------------*/ 3922 float16 float16_squash_input_denormal(float16 a, float_status *status) 3923 { 3924 if (status->flush_inputs_to_zero) { 3925 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3926 float_raise(float_flag_input_denormal, status); 3927 return make_float16(float16_val(a) & 0x8000); 3928 } 3929 } 3930 return a; 3931 } 3932 3933 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3934 uint32_t *zSigPtr) 3935 { 3936 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3937 *zSigPtr = aSig << shiftCount; 3938 *zExpPtr = 1 - shiftCount; 3939 } 3940 3941 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3942 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3943 3944 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3945 { 3946 flag aSign; 3947 int aExp; 3948 uint32_t aSig; 3949 3950 aSign = extractFloat16Sign(a); 3951 aExp = extractFloat16Exp(a); 3952 aSig = extractFloat16Frac(a); 3953 3954 if (aExp == 0x1f && ieee) { 3955 if (aSig) { 3956 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3957 } 3958 return packFloat32(aSign, 0xff, 0); 3959 } 3960 if (aExp == 0) { 3961 if (aSig == 0) { 3962 return packFloat32(aSign, 0, 0); 3963 } 3964 3965 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3966 aExp--; 3967 } 3968 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3969 } 3970 3971 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3972 { 3973 flag aSign; 3974 int aExp; 3975 uint32_t aSig; 3976 3977 a = float32_squash_input_denormal(a, status); 3978 3979 aSig = extractFloat32Frac( a ); 3980 aExp = extractFloat32Exp( a ); 3981 aSign = extractFloat32Sign( a ); 3982 if ( aExp == 0xFF ) { 3983 if (aSig) { 3984 /* Input is a NaN */ 3985 if (!ieee) { 3986 float_raise(float_flag_invalid, status); 3987 return packFloat16(aSign, 0, 0); 3988 } 3989 return commonNaNToFloat16( 3990 float32ToCommonNaN(a, status), status); 3991 } 3992 /* Infinity */ 3993 if (!ieee) { 3994 float_raise(float_flag_invalid, status); 3995 return packFloat16(aSign, 0x1f, 0x3ff); 3996 } 3997 return packFloat16(aSign, 0x1f, 0); 3998 } 3999 if (aExp == 0 && aSig == 0) { 4000 return packFloat16(aSign, 0, 0); 4001 } 4002 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4003 * even if the input is denormal; however this is harmless because 4004 * the largest possible single-precision denormal is still smaller 4005 * than the smallest representable half-precision denormal, and so we 4006 * will end up ignoring aSig and returning via the "always return zero" 4007 * codepath. 4008 */ 4009 aSig |= 0x00800000; 4010 aExp -= 0x71; 4011 4012 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 4013 } 4014 4015 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 4016 { 4017 flag aSign; 4018 int aExp; 4019 uint32_t aSig; 4020 4021 aSign = extractFloat16Sign(a); 4022 aExp = extractFloat16Exp(a); 4023 aSig = extractFloat16Frac(a); 4024 4025 if (aExp == 0x1f && ieee) { 4026 if (aSig) { 4027 return commonNaNToFloat64( 4028 float16ToCommonNaN(a, status), status); 4029 } 4030 return packFloat64(aSign, 0x7ff, 0); 4031 } 4032 if (aExp == 0) { 4033 if (aSig == 0) { 4034 return packFloat64(aSign, 0, 0); 4035 } 4036 4037 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 4038 aExp--; 4039 } 4040 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 4041 } 4042 4043 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 4044 { 4045 flag aSign; 4046 int aExp; 4047 uint64_t aSig; 4048 uint32_t zSig; 4049 4050 a = float64_squash_input_denormal(a, status); 4051 4052 aSig = extractFloat64Frac(a); 4053 aExp = extractFloat64Exp(a); 4054 aSign = extractFloat64Sign(a); 4055 if (aExp == 0x7FF) { 4056 if (aSig) { 4057 /* Input is a NaN */ 4058 if (!ieee) { 4059 float_raise(float_flag_invalid, status); 4060 return packFloat16(aSign, 0, 0); 4061 } 4062 return commonNaNToFloat16( 4063 float64ToCommonNaN(a, status), status); 4064 } 4065 /* Infinity */ 4066 if (!ieee) { 4067 float_raise(float_flag_invalid, status); 4068 return packFloat16(aSign, 0x1f, 0x3ff); 4069 } 4070 return packFloat16(aSign, 0x1f, 0); 4071 } 4072 shift64RightJamming(aSig, 29, &aSig); 4073 zSig = aSig; 4074 if (aExp == 0 && zSig == 0) { 4075 return packFloat16(aSign, 0, 0); 4076 } 4077 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4078 * even if the input is denormal; however this is harmless because 4079 * the largest possible single-precision denormal is still smaller 4080 * than the smallest representable half-precision denormal, and so we 4081 * will end up ignoring aSig and returning via the "always return zero" 4082 * codepath. 4083 */ 4084 zSig |= 0x00800000; 4085 aExp -= 0x3F1; 4086 4087 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4088 } 4089 4090 /*---------------------------------------------------------------------------- 4091 | Returns the result of converting the double-precision floating-point value 4092 | `a' to the extended double-precision floating-point format. The conversion 4093 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4094 | Arithmetic. 4095 *----------------------------------------------------------------------------*/ 4096 4097 floatx80 float64_to_floatx80(float64 a, float_status *status) 4098 { 4099 flag aSign; 4100 int aExp; 4101 uint64_t aSig; 4102 4103 a = float64_squash_input_denormal(a, status); 4104 aSig = extractFloat64Frac( a ); 4105 aExp = extractFloat64Exp( a ); 4106 aSign = extractFloat64Sign( a ); 4107 if ( aExp == 0x7FF ) { 4108 if (aSig) { 4109 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4110 } 4111 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4112 } 4113 if ( aExp == 0 ) { 4114 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4115 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4116 } 4117 return 4118 packFloatx80( 4119 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4120 4121 } 4122 4123 /*---------------------------------------------------------------------------- 4124 | Returns the result of converting the double-precision floating-point value 4125 | `a' to the quadruple-precision floating-point format. The conversion is 4126 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4127 | Arithmetic. 4128 *----------------------------------------------------------------------------*/ 4129 4130 float128 float64_to_float128(float64 a, float_status *status) 4131 { 4132 flag aSign; 4133 int aExp; 4134 uint64_t aSig, zSig0, zSig1; 4135 4136 a = float64_squash_input_denormal(a, status); 4137 aSig = extractFloat64Frac( a ); 4138 aExp = extractFloat64Exp( a ); 4139 aSign = extractFloat64Sign( a ); 4140 if ( aExp == 0x7FF ) { 4141 if (aSig) { 4142 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4143 } 4144 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4145 } 4146 if ( aExp == 0 ) { 4147 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4148 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4149 --aExp; 4150 } 4151 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4152 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4153 4154 } 4155 4156 4157 /*---------------------------------------------------------------------------- 4158 | Returns the remainder of the double-precision floating-point value `a' 4159 | with respect to the corresponding value `b'. The operation is performed 4160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4161 *----------------------------------------------------------------------------*/ 4162 4163 float64 float64_rem(float64 a, float64 b, float_status *status) 4164 { 4165 flag aSign, zSign; 4166 int aExp, bExp, expDiff; 4167 uint64_t aSig, bSig; 4168 uint64_t q, alternateASig; 4169 int64_t sigMean; 4170 4171 a = float64_squash_input_denormal(a, status); 4172 b = float64_squash_input_denormal(b, status); 4173 aSig = extractFloat64Frac( a ); 4174 aExp = extractFloat64Exp( a ); 4175 aSign = extractFloat64Sign( a ); 4176 bSig = extractFloat64Frac( b ); 4177 bExp = extractFloat64Exp( b ); 4178 if ( aExp == 0x7FF ) { 4179 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4180 return propagateFloat64NaN(a, b, status); 4181 } 4182 float_raise(float_flag_invalid, status); 4183 return float64_default_nan(status); 4184 } 4185 if ( bExp == 0x7FF ) { 4186 if (bSig) { 4187 return propagateFloat64NaN(a, b, status); 4188 } 4189 return a; 4190 } 4191 if ( bExp == 0 ) { 4192 if ( bSig == 0 ) { 4193 float_raise(float_flag_invalid, status); 4194 return float64_default_nan(status); 4195 } 4196 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4197 } 4198 if ( aExp == 0 ) { 4199 if ( aSig == 0 ) return a; 4200 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4201 } 4202 expDiff = aExp - bExp; 4203 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4204 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4205 if ( expDiff < 0 ) { 4206 if ( expDiff < -1 ) return a; 4207 aSig >>= 1; 4208 } 4209 q = ( bSig <= aSig ); 4210 if ( q ) aSig -= bSig; 4211 expDiff -= 64; 4212 while ( 0 < expDiff ) { 4213 q = estimateDiv128To64( aSig, 0, bSig ); 4214 q = ( 2 < q ) ? q - 2 : 0; 4215 aSig = - ( ( bSig>>2 ) * q ); 4216 expDiff -= 62; 4217 } 4218 expDiff += 64; 4219 if ( 0 < expDiff ) { 4220 q = estimateDiv128To64( aSig, 0, bSig ); 4221 q = ( 2 < q ) ? q - 2 : 0; 4222 q >>= 64 - expDiff; 4223 bSig >>= 2; 4224 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4225 } 4226 else { 4227 aSig >>= 2; 4228 bSig >>= 2; 4229 } 4230 do { 4231 alternateASig = aSig; 4232 ++q; 4233 aSig -= bSig; 4234 } while ( 0 <= (int64_t) aSig ); 4235 sigMean = aSig + alternateASig; 4236 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4237 aSig = alternateASig; 4238 } 4239 zSign = ( (int64_t) aSig < 0 ); 4240 if ( zSign ) aSig = - aSig; 4241 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4242 4243 } 4244 4245 /*---------------------------------------------------------------------------- 4246 | Returns the binary log of the double-precision floating-point value `a'. 4247 | The operation is performed according to the IEC/IEEE Standard for Binary 4248 | Floating-Point Arithmetic. 4249 *----------------------------------------------------------------------------*/ 4250 float64 float64_log2(float64 a, float_status *status) 4251 { 4252 flag aSign, zSign; 4253 int aExp; 4254 uint64_t aSig, aSig0, aSig1, zSig, i; 4255 a = float64_squash_input_denormal(a, status); 4256 4257 aSig = extractFloat64Frac( a ); 4258 aExp = extractFloat64Exp( a ); 4259 aSign = extractFloat64Sign( a ); 4260 4261 if ( aExp == 0 ) { 4262 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4263 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4264 } 4265 if ( aSign ) { 4266 float_raise(float_flag_invalid, status); 4267 return float64_default_nan(status); 4268 } 4269 if ( aExp == 0x7FF ) { 4270 if (aSig) { 4271 return propagateFloat64NaN(a, float64_zero, status); 4272 } 4273 return a; 4274 } 4275 4276 aExp -= 0x3FF; 4277 aSig |= LIT64( 0x0010000000000000 ); 4278 zSign = aExp < 0; 4279 zSig = (uint64_t)aExp << 52; 4280 for (i = 1LL << 51; i > 0; i >>= 1) { 4281 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4282 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4283 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4284 aSig >>= 1; 4285 zSig |= i; 4286 } 4287 } 4288 4289 if ( zSign ) 4290 zSig = -zSig; 4291 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4292 } 4293 4294 /*---------------------------------------------------------------------------- 4295 | Returns 1 if the double-precision floating-point value `a' is equal to the 4296 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4297 | if either operand is a NaN. Otherwise, the comparison is performed 4298 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4299 *----------------------------------------------------------------------------*/ 4300 4301 int float64_eq(float64 a, float64 b, float_status *status) 4302 { 4303 uint64_t av, bv; 4304 a = float64_squash_input_denormal(a, status); 4305 b = float64_squash_input_denormal(b, status); 4306 4307 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4308 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4309 ) { 4310 float_raise(float_flag_invalid, status); 4311 return 0; 4312 } 4313 av = float64_val(a); 4314 bv = float64_val(b); 4315 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4316 4317 } 4318 4319 /*---------------------------------------------------------------------------- 4320 | Returns 1 if the double-precision floating-point value `a' is less than or 4321 | equal to the corresponding value `b', and 0 otherwise. The invalid 4322 | exception is raised if either operand is a NaN. The comparison is performed 4323 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4324 *----------------------------------------------------------------------------*/ 4325 4326 int float64_le(float64 a, float64 b, float_status *status) 4327 { 4328 flag aSign, bSign; 4329 uint64_t av, bv; 4330 a = float64_squash_input_denormal(a, status); 4331 b = float64_squash_input_denormal(b, status); 4332 4333 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4334 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4335 ) { 4336 float_raise(float_flag_invalid, status); 4337 return 0; 4338 } 4339 aSign = extractFloat64Sign( a ); 4340 bSign = extractFloat64Sign( b ); 4341 av = float64_val(a); 4342 bv = float64_val(b); 4343 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4344 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4345 4346 } 4347 4348 /*---------------------------------------------------------------------------- 4349 | Returns 1 if the double-precision floating-point value `a' is less than 4350 | the corresponding value `b', and 0 otherwise. The invalid exception is 4351 | raised if either operand is a NaN. The comparison is performed according 4352 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4353 *----------------------------------------------------------------------------*/ 4354 4355 int float64_lt(float64 a, float64 b, float_status *status) 4356 { 4357 flag aSign, bSign; 4358 uint64_t av, bv; 4359 4360 a = float64_squash_input_denormal(a, status); 4361 b = float64_squash_input_denormal(b, status); 4362 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4363 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4364 ) { 4365 float_raise(float_flag_invalid, status); 4366 return 0; 4367 } 4368 aSign = extractFloat64Sign( a ); 4369 bSign = extractFloat64Sign( b ); 4370 av = float64_val(a); 4371 bv = float64_val(b); 4372 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4373 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4374 4375 } 4376 4377 /*---------------------------------------------------------------------------- 4378 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4379 | be compared, and 0 otherwise. The invalid exception is raised if either 4380 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4381 | Standard for Binary Floating-Point Arithmetic. 4382 *----------------------------------------------------------------------------*/ 4383 4384 int float64_unordered(float64 a, float64 b, float_status *status) 4385 { 4386 a = float64_squash_input_denormal(a, status); 4387 b = float64_squash_input_denormal(b, status); 4388 4389 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4390 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4391 ) { 4392 float_raise(float_flag_invalid, status); 4393 return 1; 4394 } 4395 return 0; 4396 } 4397 4398 /*---------------------------------------------------------------------------- 4399 | Returns 1 if the double-precision floating-point value `a' is equal to the 4400 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4401 | exception.The comparison is performed according to the IEC/IEEE Standard 4402 | for Binary Floating-Point Arithmetic. 4403 *----------------------------------------------------------------------------*/ 4404 4405 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4406 { 4407 uint64_t av, bv; 4408 a = float64_squash_input_denormal(a, status); 4409 b = float64_squash_input_denormal(b, status); 4410 4411 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4412 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4413 ) { 4414 if (float64_is_signaling_nan(a, status) 4415 || float64_is_signaling_nan(b, status)) { 4416 float_raise(float_flag_invalid, status); 4417 } 4418 return 0; 4419 } 4420 av = float64_val(a); 4421 bv = float64_val(b); 4422 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4423 4424 } 4425 4426 /*---------------------------------------------------------------------------- 4427 | Returns 1 if the double-precision floating-point value `a' is less than or 4428 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4429 | cause an exception. Otherwise, the comparison is performed according to the 4430 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4431 *----------------------------------------------------------------------------*/ 4432 4433 int float64_le_quiet(float64 a, float64 b, float_status *status) 4434 { 4435 flag aSign, bSign; 4436 uint64_t av, bv; 4437 a = float64_squash_input_denormal(a, status); 4438 b = float64_squash_input_denormal(b, status); 4439 4440 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4441 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4442 ) { 4443 if (float64_is_signaling_nan(a, status) 4444 || float64_is_signaling_nan(b, status)) { 4445 float_raise(float_flag_invalid, status); 4446 } 4447 return 0; 4448 } 4449 aSign = extractFloat64Sign( a ); 4450 bSign = extractFloat64Sign( b ); 4451 av = float64_val(a); 4452 bv = float64_val(b); 4453 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4454 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4455 4456 } 4457 4458 /*---------------------------------------------------------------------------- 4459 | Returns 1 if the double-precision floating-point value `a' is less than 4460 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4461 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4462 | Standard for Binary Floating-Point Arithmetic. 4463 *----------------------------------------------------------------------------*/ 4464 4465 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4466 { 4467 flag aSign, bSign; 4468 uint64_t av, bv; 4469 a = float64_squash_input_denormal(a, status); 4470 b = float64_squash_input_denormal(b, status); 4471 4472 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4473 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4474 ) { 4475 if (float64_is_signaling_nan(a, status) 4476 || float64_is_signaling_nan(b, status)) { 4477 float_raise(float_flag_invalid, status); 4478 } 4479 return 0; 4480 } 4481 aSign = extractFloat64Sign( a ); 4482 bSign = extractFloat64Sign( b ); 4483 av = float64_val(a); 4484 bv = float64_val(b); 4485 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4486 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4487 4488 } 4489 4490 /*---------------------------------------------------------------------------- 4491 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4492 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4493 | comparison is performed according to the IEC/IEEE Standard for Binary 4494 | Floating-Point Arithmetic. 4495 *----------------------------------------------------------------------------*/ 4496 4497 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4498 { 4499 a = float64_squash_input_denormal(a, status); 4500 b = float64_squash_input_denormal(b, status); 4501 4502 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4503 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4504 ) { 4505 if (float64_is_signaling_nan(a, status) 4506 || float64_is_signaling_nan(b, status)) { 4507 float_raise(float_flag_invalid, status); 4508 } 4509 return 1; 4510 } 4511 return 0; 4512 } 4513 4514 /*---------------------------------------------------------------------------- 4515 | Returns the result of converting the extended double-precision floating- 4516 | point value `a' to the 32-bit two's complement integer format. The 4517 | conversion is performed according to the IEC/IEEE Standard for Binary 4518 | Floating-Point Arithmetic---which means in particular that the conversion 4519 | is rounded according to the current rounding mode. If `a' is a NaN, the 4520 | largest positive integer is returned. Otherwise, if the conversion 4521 | overflows, the largest integer with the same sign as `a' is returned. 4522 *----------------------------------------------------------------------------*/ 4523 4524 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4525 { 4526 flag aSign; 4527 int32_t aExp, shiftCount; 4528 uint64_t aSig; 4529 4530 if (floatx80_invalid_encoding(a)) { 4531 float_raise(float_flag_invalid, status); 4532 return 1 << 31; 4533 } 4534 aSig = extractFloatx80Frac( a ); 4535 aExp = extractFloatx80Exp( a ); 4536 aSign = extractFloatx80Sign( a ); 4537 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4538 shiftCount = 0x4037 - aExp; 4539 if ( shiftCount <= 0 ) shiftCount = 1; 4540 shift64RightJamming( aSig, shiftCount, &aSig ); 4541 return roundAndPackInt32(aSign, aSig, status); 4542 4543 } 4544 4545 /*---------------------------------------------------------------------------- 4546 | Returns the result of converting the extended double-precision floating- 4547 | point value `a' to the 32-bit two's complement integer format. The 4548 | conversion is performed according to the IEC/IEEE Standard for Binary 4549 | Floating-Point Arithmetic, except that the conversion is always rounded 4550 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4551 | Otherwise, if the conversion overflows, the largest integer with the same 4552 | sign as `a' is returned. 4553 *----------------------------------------------------------------------------*/ 4554 4555 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4556 { 4557 flag aSign; 4558 int32_t aExp, shiftCount; 4559 uint64_t aSig, savedASig; 4560 int32_t z; 4561 4562 if (floatx80_invalid_encoding(a)) { 4563 float_raise(float_flag_invalid, status); 4564 return 1 << 31; 4565 } 4566 aSig = extractFloatx80Frac( a ); 4567 aExp = extractFloatx80Exp( a ); 4568 aSign = extractFloatx80Sign( a ); 4569 if ( 0x401E < aExp ) { 4570 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4571 goto invalid; 4572 } 4573 else if ( aExp < 0x3FFF ) { 4574 if (aExp || aSig) { 4575 status->float_exception_flags |= float_flag_inexact; 4576 } 4577 return 0; 4578 } 4579 shiftCount = 0x403E - aExp; 4580 savedASig = aSig; 4581 aSig >>= shiftCount; 4582 z = aSig; 4583 if ( aSign ) z = - z; 4584 if ( ( z < 0 ) ^ aSign ) { 4585 invalid: 4586 float_raise(float_flag_invalid, status); 4587 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4588 } 4589 if ( ( aSig<<shiftCount ) != savedASig ) { 4590 status->float_exception_flags |= float_flag_inexact; 4591 } 4592 return z; 4593 4594 } 4595 4596 /*---------------------------------------------------------------------------- 4597 | Returns the result of converting the extended double-precision floating- 4598 | point value `a' to the 64-bit two's complement integer format. The 4599 | conversion is performed according to the IEC/IEEE Standard for Binary 4600 | Floating-Point Arithmetic---which means in particular that the conversion 4601 | is rounded according to the current rounding mode. If `a' is a NaN, 4602 | the largest positive integer is returned. Otherwise, if the conversion 4603 | overflows, the largest integer with the same sign as `a' is returned. 4604 *----------------------------------------------------------------------------*/ 4605 4606 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4607 { 4608 flag aSign; 4609 int32_t aExp, shiftCount; 4610 uint64_t aSig, aSigExtra; 4611 4612 if (floatx80_invalid_encoding(a)) { 4613 float_raise(float_flag_invalid, status); 4614 return 1ULL << 63; 4615 } 4616 aSig = extractFloatx80Frac( a ); 4617 aExp = extractFloatx80Exp( a ); 4618 aSign = extractFloatx80Sign( a ); 4619 shiftCount = 0x403E - aExp; 4620 if ( shiftCount <= 0 ) { 4621 if ( shiftCount ) { 4622 float_raise(float_flag_invalid, status); 4623 if ( ! aSign 4624 || ( ( aExp == 0x7FFF ) 4625 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4626 ) { 4627 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4628 } 4629 return (int64_t) LIT64( 0x8000000000000000 ); 4630 } 4631 aSigExtra = 0; 4632 } 4633 else { 4634 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4635 } 4636 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4637 4638 } 4639 4640 /*---------------------------------------------------------------------------- 4641 | Returns the result of converting the extended double-precision floating- 4642 | point value `a' to the 64-bit two's complement integer format. The 4643 | conversion is performed according to the IEC/IEEE Standard for Binary 4644 | Floating-Point Arithmetic, except that the conversion is always rounded 4645 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4646 | Otherwise, if the conversion overflows, the largest integer with the same 4647 | sign as `a' is returned. 4648 *----------------------------------------------------------------------------*/ 4649 4650 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4651 { 4652 flag aSign; 4653 int32_t aExp, shiftCount; 4654 uint64_t aSig; 4655 int64_t z; 4656 4657 if (floatx80_invalid_encoding(a)) { 4658 float_raise(float_flag_invalid, status); 4659 return 1ULL << 63; 4660 } 4661 aSig = extractFloatx80Frac( a ); 4662 aExp = extractFloatx80Exp( a ); 4663 aSign = extractFloatx80Sign( a ); 4664 shiftCount = aExp - 0x403E; 4665 if ( 0 <= shiftCount ) { 4666 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4667 if ( ( a.high != 0xC03E ) || aSig ) { 4668 float_raise(float_flag_invalid, status); 4669 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4670 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4671 } 4672 } 4673 return (int64_t) LIT64( 0x8000000000000000 ); 4674 } 4675 else if ( aExp < 0x3FFF ) { 4676 if (aExp | aSig) { 4677 status->float_exception_flags |= float_flag_inexact; 4678 } 4679 return 0; 4680 } 4681 z = aSig>>( - shiftCount ); 4682 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4683 status->float_exception_flags |= float_flag_inexact; 4684 } 4685 if ( aSign ) z = - z; 4686 return z; 4687 4688 } 4689 4690 /*---------------------------------------------------------------------------- 4691 | Returns the result of converting the extended double-precision floating- 4692 | point value `a' to the single-precision floating-point format. The 4693 | conversion is performed according to the IEC/IEEE Standard for Binary 4694 | Floating-Point Arithmetic. 4695 *----------------------------------------------------------------------------*/ 4696 4697 float32 floatx80_to_float32(floatx80 a, float_status *status) 4698 { 4699 flag aSign; 4700 int32_t aExp; 4701 uint64_t aSig; 4702 4703 if (floatx80_invalid_encoding(a)) { 4704 float_raise(float_flag_invalid, status); 4705 return float32_default_nan(status); 4706 } 4707 aSig = extractFloatx80Frac( a ); 4708 aExp = extractFloatx80Exp( a ); 4709 aSign = extractFloatx80Sign( a ); 4710 if ( aExp == 0x7FFF ) { 4711 if ( (uint64_t) ( aSig<<1 ) ) { 4712 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4713 } 4714 return packFloat32( aSign, 0xFF, 0 ); 4715 } 4716 shift64RightJamming( aSig, 33, &aSig ); 4717 if ( aExp || aSig ) aExp -= 0x3F81; 4718 return roundAndPackFloat32(aSign, aExp, aSig, status); 4719 4720 } 4721 4722 /*---------------------------------------------------------------------------- 4723 | Returns the result of converting the extended double-precision floating- 4724 | point value `a' to the double-precision floating-point format. The 4725 | conversion is performed according to the IEC/IEEE Standard for Binary 4726 | Floating-Point Arithmetic. 4727 *----------------------------------------------------------------------------*/ 4728 4729 float64 floatx80_to_float64(floatx80 a, float_status *status) 4730 { 4731 flag aSign; 4732 int32_t aExp; 4733 uint64_t aSig, zSig; 4734 4735 if (floatx80_invalid_encoding(a)) { 4736 float_raise(float_flag_invalid, status); 4737 return float64_default_nan(status); 4738 } 4739 aSig = extractFloatx80Frac( a ); 4740 aExp = extractFloatx80Exp( a ); 4741 aSign = extractFloatx80Sign( a ); 4742 if ( aExp == 0x7FFF ) { 4743 if ( (uint64_t) ( aSig<<1 ) ) { 4744 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4745 } 4746 return packFloat64( aSign, 0x7FF, 0 ); 4747 } 4748 shift64RightJamming( aSig, 1, &zSig ); 4749 if ( aExp || aSig ) aExp -= 0x3C01; 4750 return roundAndPackFloat64(aSign, aExp, zSig, status); 4751 4752 } 4753 4754 /*---------------------------------------------------------------------------- 4755 | Returns the result of converting the extended double-precision floating- 4756 | point value `a' to the quadruple-precision floating-point format. The 4757 | conversion is performed according to the IEC/IEEE Standard for Binary 4758 | Floating-Point Arithmetic. 4759 *----------------------------------------------------------------------------*/ 4760 4761 float128 floatx80_to_float128(floatx80 a, float_status *status) 4762 { 4763 flag aSign; 4764 int aExp; 4765 uint64_t aSig, zSig0, zSig1; 4766 4767 if (floatx80_invalid_encoding(a)) { 4768 float_raise(float_flag_invalid, status); 4769 return float128_default_nan(status); 4770 } 4771 aSig = extractFloatx80Frac( a ); 4772 aExp = extractFloatx80Exp( a ); 4773 aSign = extractFloatx80Sign( a ); 4774 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4775 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4776 } 4777 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4778 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4779 4780 } 4781 4782 /*---------------------------------------------------------------------------- 4783 | Rounds the extended double-precision floating-point value `a' 4784 | to the precision provided by floatx80_rounding_precision and returns the 4785 | result as an extended double-precision floating-point value. 4786 | The operation is performed according to the IEC/IEEE Standard for Binary 4787 | Floating-Point Arithmetic. 4788 *----------------------------------------------------------------------------*/ 4789 4790 floatx80 floatx80_round(floatx80 a, float_status *status) 4791 { 4792 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4793 extractFloatx80Sign(a), 4794 extractFloatx80Exp(a), 4795 extractFloatx80Frac(a), 0, status); 4796 } 4797 4798 /*---------------------------------------------------------------------------- 4799 | Rounds the extended double-precision floating-point value `a' to an integer, 4800 | and returns the result as an extended quadruple-precision floating-point 4801 | value. The operation is performed according to the IEC/IEEE Standard for 4802 | Binary Floating-Point Arithmetic. 4803 *----------------------------------------------------------------------------*/ 4804 4805 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4806 { 4807 flag aSign; 4808 int32_t aExp; 4809 uint64_t lastBitMask, roundBitsMask; 4810 floatx80 z; 4811 4812 if (floatx80_invalid_encoding(a)) { 4813 float_raise(float_flag_invalid, status); 4814 return floatx80_default_nan(status); 4815 } 4816 aExp = extractFloatx80Exp( a ); 4817 if ( 0x403E <= aExp ) { 4818 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4819 return propagateFloatx80NaN(a, a, status); 4820 } 4821 return a; 4822 } 4823 if ( aExp < 0x3FFF ) { 4824 if ( ( aExp == 0 ) 4825 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4826 return a; 4827 } 4828 status->float_exception_flags |= float_flag_inexact; 4829 aSign = extractFloatx80Sign( a ); 4830 switch (status->float_rounding_mode) { 4831 case float_round_nearest_even: 4832 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4833 ) { 4834 return 4835 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4836 } 4837 break; 4838 case float_round_ties_away: 4839 if (aExp == 0x3FFE) { 4840 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4841 } 4842 break; 4843 case float_round_down: 4844 return 4845 aSign ? 4846 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4847 : packFloatx80( 0, 0, 0 ); 4848 case float_round_up: 4849 return 4850 aSign ? packFloatx80( 1, 0, 0 ) 4851 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4852 } 4853 return packFloatx80( aSign, 0, 0 ); 4854 } 4855 lastBitMask = 1; 4856 lastBitMask <<= 0x403E - aExp; 4857 roundBitsMask = lastBitMask - 1; 4858 z = a; 4859 switch (status->float_rounding_mode) { 4860 case float_round_nearest_even: 4861 z.low += lastBitMask>>1; 4862 if ((z.low & roundBitsMask) == 0) { 4863 z.low &= ~lastBitMask; 4864 } 4865 break; 4866 case float_round_ties_away: 4867 z.low += lastBitMask >> 1; 4868 break; 4869 case float_round_to_zero: 4870 break; 4871 case float_round_up: 4872 if (!extractFloatx80Sign(z)) { 4873 z.low += roundBitsMask; 4874 } 4875 break; 4876 case float_round_down: 4877 if (extractFloatx80Sign(z)) { 4878 z.low += roundBitsMask; 4879 } 4880 break; 4881 default: 4882 abort(); 4883 } 4884 z.low &= ~ roundBitsMask; 4885 if ( z.low == 0 ) { 4886 ++z.high; 4887 z.low = LIT64( 0x8000000000000000 ); 4888 } 4889 if (z.low != a.low) { 4890 status->float_exception_flags |= float_flag_inexact; 4891 } 4892 return z; 4893 4894 } 4895 4896 /*---------------------------------------------------------------------------- 4897 | Returns the result of adding the absolute values of the extended double- 4898 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4899 | negated before being returned. `zSign' is ignored if the result is a NaN. 4900 | The addition is performed according to the IEC/IEEE Standard for Binary 4901 | Floating-Point Arithmetic. 4902 *----------------------------------------------------------------------------*/ 4903 4904 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4905 float_status *status) 4906 { 4907 int32_t aExp, bExp, zExp; 4908 uint64_t aSig, bSig, zSig0, zSig1; 4909 int32_t expDiff; 4910 4911 aSig = extractFloatx80Frac( a ); 4912 aExp = extractFloatx80Exp( a ); 4913 bSig = extractFloatx80Frac( b ); 4914 bExp = extractFloatx80Exp( b ); 4915 expDiff = aExp - bExp; 4916 if ( 0 < expDiff ) { 4917 if ( aExp == 0x7FFF ) { 4918 if ((uint64_t)(aSig << 1)) { 4919 return propagateFloatx80NaN(a, b, status); 4920 } 4921 return a; 4922 } 4923 if ( bExp == 0 ) --expDiff; 4924 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4925 zExp = aExp; 4926 } 4927 else if ( expDiff < 0 ) { 4928 if ( bExp == 0x7FFF ) { 4929 if ((uint64_t)(bSig << 1)) { 4930 return propagateFloatx80NaN(a, b, status); 4931 } 4932 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4933 } 4934 if ( aExp == 0 ) ++expDiff; 4935 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4936 zExp = bExp; 4937 } 4938 else { 4939 if ( aExp == 0x7FFF ) { 4940 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4941 return propagateFloatx80NaN(a, b, status); 4942 } 4943 return a; 4944 } 4945 zSig1 = 0; 4946 zSig0 = aSig + bSig; 4947 if ( aExp == 0 ) { 4948 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4949 goto roundAndPack; 4950 } 4951 zExp = aExp; 4952 goto shiftRight1; 4953 } 4954 zSig0 = aSig + bSig; 4955 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4956 shiftRight1: 4957 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4958 zSig0 |= LIT64( 0x8000000000000000 ); 4959 ++zExp; 4960 roundAndPack: 4961 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4962 zSign, zExp, zSig0, zSig1, status); 4963 } 4964 4965 /*---------------------------------------------------------------------------- 4966 | Returns the result of subtracting the absolute values of the extended 4967 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4968 | difference is negated before being returned. `zSign' is ignored if the 4969 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4970 | Standard for Binary Floating-Point Arithmetic. 4971 *----------------------------------------------------------------------------*/ 4972 4973 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4974 float_status *status) 4975 { 4976 int32_t aExp, bExp, zExp; 4977 uint64_t aSig, bSig, zSig0, zSig1; 4978 int32_t expDiff; 4979 4980 aSig = extractFloatx80Frac( a ); 4981 aExp = extractFloatx80Exp( a ); 4982 bSig = extractFloatx80Frac( b ); 4983 bExp = extractFloatx80Exp( b ); 4984 expDiff = aExp - bExp; 4985 if ( 0 < expDiff ) goto aExpBigger; 4986 if ( expDiff < 0 ) goto bExpBigger; 4987 if ( aExp == 0x7FFF ) { 4988 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4989 return propagateFloatx80NaN(a, b, status); 4990 } 4991 float_raise(float_flag_invalid, status); 4992 return floatx80_default_nan(status); 4993 } 4994 if ( aExp == 0 ) { 4995 aExp = 1; 4996 bExp = 1; 4997 } 4998 zSig1 = 0; 4999 if ( bSig < aSig ) goto aBigger; 5000 if ( aSig < bSig ) goto bBigger; 5001 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5002 bExpBigger: 5003 if ( bExp == 0x7FFF ) { 5004 if ((uint64_t)(bSig << 1)) { 5005 return propagateFloatx80NaN(a, b, status); 5006 } 5007 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5008 } 5009 if ( aExp == 0 ) ++expDiff; 5010 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5011 bBigger: 5012 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5013 zExp = bExp; 5014 zSign ^= 1; 5015 goto normalizeRoundAndPack; 5016 aExpBigger: 5017 if ( aExp == 0x7FFF ) { 5018 if ((uint64_t)(aSig << 1)) { 5019 return propagateFloatx80NaN(a, b, status); 5020 } 5021 return a; 5022 } 5023 if ( bExp == 0 ) --expDiff; 5024 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5025 aBigger: 5026 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5027 zExp = aExp; 5028 normalizeRoundAndPack: 5029 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5030 zSign, zExp, zSig0, zSig1, status); 5031 } 5032 5033 /*---------------------------------------------------------------------------- 5034 | Returns the result of adding the extended double-precision floating-point 5035 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5036 | Standard for Binary Floating-Point Arithmetic. 5037 *----------------------------------------------------------------------------*/ 5038 5039 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5040 { 5041 flag aSign, bSign; 5042 5043 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5044 float_raise(float_flag_invalid, status); 5045 return floatx80_default_nan(status); 5046 } 5047 aSign = extractFloatx80Sign( a ); 5048 bSign = extractFloatx80Sign( b ); 5049 if ( aSign == bSign ) { 5050 return addFloatx80Sigs(a, b, aSign, status); 5051 } 5052 else { 5053 return subFloatx80Sigs(a, b, aSign, status); 5054 } 5055 5056 } 5057 5058 /*---------------------------------------------------------------------------- 5059 | Returns the result of subtracting the extended double-precision floating- 5060 | point values `a' and `b'. The operation is performed according to the 5061 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5062 *----------------------------------------------------------------------------*/ 5063 5064 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5065 { 5066 flag aSign, bSign; 5067 5068 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5069 float_raise(float_flag_invalid, status); 5070 return floatx80_default_nan(status); 5071 } 5072 aSign = extractFloatx80Sign( a ); 5073 bSign = extractFloatx80Sign( b ); 5074 if ( aSign == bSign ) { 5075 return subFloatx80Sigs(a, b, aSign, status); 5076 } 5077 else { 5078 return addFloatx80Sigs(a, b, aSign, status); 5079 } 5080 5081 } 5082 5083 /*---------------------------------------------------------------------------- 5084 | Returns the result of multiplying the extended double-precision floating- 5085 | point values `a' and `b'. The operation is performed according to the 5086 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5087 *----------------------------------------------------------------------------*/ 5088 5089 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5090 { 5091 flag aSign, bSign, zSign; 5092 int32_t aExp, bExp, zExp; 5093 uint64_t aSig, bSig, zSig0, zSig1; 5094 5095 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5096 float_raise(float_flag_invalid, status); 5097 return floatx80_default_nan(status); 5098 } 5099 aSig = extractFloatx80Frac( a ); 5100 aExp = extractFloatx80Exp( a ); 5101 aSign = extractFloatx80Sign( a ); 5102 bSig = extractFloatx80Frac( b ); 5103 bExp = extractFloatx80Exp( b ); 5104 bSign = extractFloatx80Sign( b ); 5105 zSign = aSign ^ bSign; 5106 if ( aExp == 0x7FFF ) { 5107 if ( (uint64_t) ( aSig<<1 ) 5108 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5109 return propagateFloatx80NaN(a, b, status); 5110 } 5111 if ( ( bExp | bSig ) == 0 ) goto invalid; 5112 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5113 } 5114 if ( bExp == 0x7FFF ) { 5115 if ((uint64_t)(bSig << 1)) { 5116 return propagateFloatx80NaN(a, b, status); 5117 } 5118 if ( ( aExp | aSig ) == 0 ) { 5119 invalid: 5120 float_raise(float_flag_invalid, status); 5121 return floatx80_default_nan(status); 5122 } 5123 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5124 } 5125 if ( aExp == 0 ) { 5126 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5127 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5128 } 5129 if ( bExp == 0 ) { 5130 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5131 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5132 } 5133 zExp = aExp + bExp - 0x3FFE; 5134 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5135 if ( 0 < (int64_t) zSig0 ) { 5136 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5137 --zExp; 5138 } 5139 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5140 zSign, zExp, zSig0, zSig1, status); 5141 } 5142 5143 /*---------------------------------------------------------------------------- 5144 | Returns the result of dividing the extended double-precision floating-point 5145 | value `a' by the corresponding value `b'. The operation is performed 5146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5147 *----------------------------------------------------------------------------*/ 5148 5149 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5150 { 5151 flag aSign, bSign, zSign; 5152 int32_t aExp, bExp, zExp; 5153 uint64_t aSig, bSig, zSig0, zSig1; 5154 uint64_t rem0, rem1, rem2, term0, term1, term2; 5155 5156 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5157 float_raise(float_flag_invalid, status); 5158 return floatx80_default_nan(status); 5159 } 5160 aSig = extractFloatx80Frac( a ); 5161 aExp = extractFloatx80Exp( a ); 5162 aSign = extractFloatx80Sign( a ); 5163 bSig = extractFloatx80Frac( b ); 5164 bExp = extractFloatx80Exp( b ); 5165 bSign = extractFloatx80Sign( b ); 5166 zSign = aSign ^ bSign; 5167 if ( aExp == 0x7FFF ) { 5168 if ((uint64_t)(aSig << 1)) { 5169 return propagateFloatx80NaN(a, b, status); 5170 } 5171 if ( bExp == 0x7FFF ) { 5172 if ((uint64_t)(bSig << 1)) { 5173 return propagateFloatx80NaN(a, b, status); 5174 } 5175 goto invalid; 5176 } 5177 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5178 } 5179 if ( bExp == 0x7FFF ) { 5180 if ((uint64_t)(bSig << 1)) { 5181 return propagateFloatx80NaN(a, b, status); 5182 } 5183 return packFloatx80( zSign, 0, 0 ); 5184 } 5185 if ( bExp == 0 ) { 5186 if ( bSig == 0 ) { 5187 if ( ( aExp | aSig ) == 0 ) { 5188 invalid: 5189 float_raise(float_flag_invalid, status); 5190 return floatx80_default_nan(status); 5191 } 5192 float_raise(float_flag_divbyzero, status); 5193 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5194 } 5195 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5196 } 5197 if ( aExp == 0 ) { 5198 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5199 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5200 } 5201 zExp = aExp - bExp + 0x3FFE; 5202 rem1 = 0; 5203 if ( bSig <= aSig ) { 5204 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5205 ++zExp; 5206 } 5207 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5208 mul64To128( bSig, zSig0, &term0, &term1 ); 5209 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5210 while ( (int64_t) rem0 < 0 ) { 5211 --zSig0; 5212 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5213 } 5214 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5215 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5216 mul64To128( bSig, zSig1, &term1, &term2 ); 5217 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5218 while ( (int64_t) rem1 < 0 ) { 5219 --zSig1; 5220 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5221 } 5222 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5223 } 5224 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5225 zSign, zExp, zSig0, zSig1, status); 5226 } 5227 5228 /*---------------------------------------------------------------------------- 5229 | Returns the remainder of the extended double-precision floating-point value 5230 | `a' with respect to the corresponding value `b'. The operation is performed 5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5232 *----------------------------------------------------------------------------*/ 5233 5234 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5235 { 5236 flag aSign, zSign; 5237 int32_t aExp, bExp, expDiff; 5238 uint64_t aSig0, aSig1, bSig; 5239 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5240 5241 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5242 float_raise(float_flag_invalid, status); 5243 return floatx80_default_nan(status); 5244 } 5245 aSig0 = extractFloatx80Frac( a ); 5246 aExp = extractFloatx80Exp( a ); 5247 aSign = extractFloatx80Sign( a ); 5248 bSig = extractFloatx80Frac( b ); 5249 bExp = extractFloatx80Exp( b ); 5250 if ( aExp == 0x7FFF ) { 5251 if ( (uint64_t) ( aSig0<<1 ) 5252 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5253 return propagateFloatx80NaN(a, b, status); 5254 } 5255 goto invalid; 5256 } 5257 if ( bExp == 0x7FFF ) { 5258 if ((uint64_t)(bSig << 1)) { 5259 return propagateFloatx80NaN(a, b, status); 5260 } 5261 return a; 5262 } 5263 if ( bExp == 0 ) { 5264 if ( bSig == 0 ) { 5265 invalid: 5266 float_raise(float_flag_invalid, status); 5267 return floatx80_default_nan(status); 5268 } 5269 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5270 } 5271 if ( aExp == 0 ) { 5272 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5273 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5274 } 5275 bSig |= LIT64( 0x8000000000000000 ); 5276 zSign = aSign; 5277 expDiff = aExp - bExp; 5278 aSig1 = 0; 5279 if ( expDiff < 0 ) { 5280 if ( expDiff < -1 ) return a; 5281 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5282 expDiff = 0; 5283 } 5284 q = ( bSig <= aSig0 ); 5285 if ( q ) aSig0 -= bSig; 5286 expDiff -= 64; 5287 while ( 0 < expDiff ) { 5288 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5289 q = ( 2 < q ) ? q - 2 : 0; 5290 mul64To128( bSig, q, &term0, &term1 ); 5291 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5292 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5293 expDiff -= 62; 5294 } 5295 expDiff += 64; 5296 if ( 0 < expDiff ) { 5297 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5298 q = ( 2 < q ) ? q - 2 : 0; 5299 q >>= 64 - expDiff; 5300 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5301 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5302 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5303 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5304 ++q; 5305 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5306 } 5307 } 5308 else { 5309 term1 = 0; 5310 term0 = bSig; 5311 } 5312 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5313 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5314 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5315 && ( q & 1 ) ) 5316 ) { 5317 aSig0 = alternateASig0; 5318 aSig1 = alternateASig1; 5319 zSign = ! zSign; 5320 } 5321 return 5322 normalizeRoundAndPackFloatx80( 5323 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5324 5325 } 5326 5327 /*---------------------------------------------------------------------------- 5328 | Returns the square root of the extended double-precision floating-point 5329 | value `a'. The operation is performed according to the IEC/IEEE Standard 5330 | for Binary Floating-Point Arithmetic. 5331 *----------------------------------------------------------------------------*/ 5332 5333 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5334 { 5335 flag aSign; 5336 int32_t aExp, zExp; 5337 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5338 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5339 5340 if (floatx80_invalid_encoding(a)) { 5341 float_raise(float_flag_invalid, status); 5342 return floatx80_default_nan(status); 5343 } 5344 aSig0 = extractFloatx80Frac( a ); 5345 aExp = extractFloatx80Exp( a ); 5346 aSign = extractFloatx80Sign( a ); 5347 if ( aExp == 0x7FFF ) { 5348 if ((uint64_t)(aSig0 << 1)) { 5349 return propagateFloatx80NaN(a, a, status); 5350 } 5351 if ( ! aSign ) return a; 5352 goto invalid; 5353 } 5354 if ( aSign ) { 5355 if ( ( aExp | aSig0 ) == 0 ) return a; 5356 invalid: 5357 float_raise(float_flag_invalid, status); 5358 return floatx80_default_nan(status); 5359 } 5360 if ( aExp == 0 ) { 5361 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5362 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5363 } 5364 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5365 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5366 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5367 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5368 doubleZSig0 = zSig0<<1; 5369 mul64To128( zSig0, zSig0, &term0, &term1 ); 5370 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5371 while ( (int64_t) rem0 < 0 ) { 5372 --zSig0; 5373 doubleZSig0 -= 2; 5374 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5375 } 5376 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5377 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5378 if ( zSig1 == 0 ) zSig1 = 1; 5379 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5380 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5381 mul64To128( zSig1, zSig1, &term2, &term3 ); 5382 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5383 while ( (int64_t) rem1 < 0 ) { 5384 --zSig1; 5385 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5386 term3 |= 1; 5387 term2 |= doubleZSig0; 5388 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5389 } 5390 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5391 } 5392 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5393 zSig0 |= doubleZSig0; 5394 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5395 0, zExp, zSig0, zSig1, status); 5396 } 5397 5398 /*---------------------------------------------------------------------------- 5399 | Returns 1 if the extended double-precision floating-point value `a' is equal 5400 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5401 | raised if either operand is a NaN. Otherwise, the comparison is performed 5402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5403 *----------------------------------------------------------------------------*/ 5404 5405 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5406 { 5407 5408 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5409 || (extractFloatx80Exp(a) == 0x7FFF 5410 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5411 || (extractFloatx80Exp(b) == 0x7FFF 5412 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5413 ) { 5414 float_raise(float_flag_invalid, status); 5415 return 0; 5416 } 5417 return 5418 ( a.low == b.low ) 5419 && ( ( a.high == b.high ) 5420 || ( ( a.low == 0 ) 5421 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5422 ); 5423 5424 } 5425 5426 /*---------------------------------------------------------------------------- 5427 | Returns 1 if the extended double-precision floating-point value `a' is 5428 | less than or equal to the corresponding value `b', and 0 otherwise. The 5429 | invalid exception is raised if either operand is a NaN. The comparison is 5430 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5431 | Arithmetic. 5432 *----------------------------------------------------------------------------*/ 5433 5434 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5435 { 5436 flag aSign, bSign; 5437 5438 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5439 || (extractFloatx80Exp(a) == 0x7FFF 5440 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5441 || (extractFloatx80Exp(b) == 0x7FFF 5442 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5443 ) { 5444 float_raise(float_flag_invalid, status); 5445 return 0; 5446 } 5447 aSign = extractFloatx80Sign( a ); 5448 bSign = extractFloatx80Sign( b ); 5449 if ( aSign != bSign ) { 5450 return 5451 aSign 5452 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5453 == 0 ); 5454 } 5455 return 5456 aSign ? le128( b.high, b.low, a.high, a.low ) 5457 : le128( a.high, a.low, b.high, b.low ); 5458 5459 } 5460 5461 /*---------------------------------------------------------------------------- 5462 | Returns 1 if the extended double-precision floating-point value `a' is 5463 | less than the corresponding value `b', and 0 otherwise. The invalid 5464 | exception is raised if either operand is a NaN. The comparison is performed 5465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5466 *----------------------------------------------------------------------------*/ 5467 5468 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5469 { 5470 flag aSign, bSign; 5471 5472 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5473 || (extractFloatx80Exp(a) == 0x7FFF 5474 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5475 || (extractFloatx80Exp(b) == 0x7FFF 5476 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5477 ) { 5478 float_raise(float_flag_invalid, status); 5479 return 0; 5480 } 5481 aSign = extractFloatx80Sign( a ); 5482 bSign = extractFloatx80Sign( b ); 5483 if ( aSign != bSign ) { 5484 return 5485 aSign 5486 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5487 != 0 ); 5488 } 5489 return 5490 aSign ? lt128( b.high, b.low, a.high, a.low ) 5491 : lt128( a.high, a.low, b.high, b.low ); 5492 5493 } 5494 5495 /*---------------------------------------------------------------------------- 5496 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5497 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5498 | either operand is a NaN. The comparison is performed according to the 5499 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5500 *----------------------------------------------------------------------------*/ 5501 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5502 { 5503 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5504 || (extractFloatx80Exp(a) == 0x7FFF 5505 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5506 || (extractFloatx80Exp(b) == 0x7FFF 5507 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5508 ) { 5509 float_raise(float_flag_invalid, status); 5510 return 1; 5511 } 5512 return 0; 5513 } 5514 5515 /*---------------------------------------------------------------------------- 5516 | Returns 1 if the extended double-precision floating-point value `a' is 5517 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5518 | cause an exception. The comparison is performed according to the IEC/IEEE 5519 | Standard for Binary Floating-Point Arithmetic. 5520 *----------------------------------------------------------------------------*/ 5521 5522 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5523 { 5524 5525 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5526 float_raise(float_flag_invalid, status); 5527 return 0; 5528 } 5529 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5530 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5531 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5532 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5533 ) { 5534 if (floatx80_is_signaling_nan(a, status) 5535 || floatx80_is_signaling_nan(b, status)) { 5536 float_raise(float_flag_invalid, status); 5537 } 5538 return 0; 5539 } 5540 return 5541 ( a.low == b.low ) 5542 && ( ( a.high == b.high ) 5543 || ( ( a.low == 0 ) 5544 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5545 ); 5546 5547 } 5548 5549 /*---------------------------------------------------------------------------- 5550 | Returns 1 if the extended double-precision floating-point value `a' is less 5551 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5552 | do not cause an exception. Otherwise, the comparison is performed according 5553 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5554 *----------------------------------------------------------------------------*/ 5555 5556 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5557 { 5558 flag aSign, bSign; 5559 5560 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5561 float_raise(float_flag_invalid, status); 5562 return 0; 5563 } 5564 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5565 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5566 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5567 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5568 ) { 5569 if (floatx80_is_signaling_nan(a, status) 5570 || floatx80_is_signaling_nan(b, status)) { 5571 float_raise(float_flag_invalid, status); 5572 } 5573 return 0; 5574 } 5575 aSign = extractFloatx80Sign( a ); 5576 bSign = extractFloatx80Sign( b ); 5577 if ( aSign != bSign ) { 5578 return 5579 aSign 5580 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5581 == 0 ); 5582 } 5583 return 5584 aSign ? le128( b.high, b.low, a.high, a.low ) 5585 : le128( a.high, a.low, b.high, b.low ); 5586 5587 } 5588 5589 /*---------------------------------------------------------------------------- 5590 | Returns 1 if the extended double-precision floating-point value `a' is less 5591 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5592 | an exception. Otherwise, the comparison is performed according to the 5593 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5594 *----------------------------------------------------------------------------*/ 5595 5596 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5597 { 5598 flag aSign, bSign; 5599 5600 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5601 float_raise(float_flag_invalid, status); 5602 return 0; 5603 } 5604 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5605 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5606 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5607 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5608 ) { 5609 if (floatx80_is_signaling_nan(a, status) 5610 || floatx80_is_signaling_nan(b, status)) { 5611 float_raise(float_flag_invalid, status); 5612 } 5613 return 0; 5614 } 5615 aSign = extractFloatx80Sign( a ); 5616 bSign = extractFloatx80Sign( b ); 5617 if ( aSign != bSign ) { 5618 return 5619 aSign 5620 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5621 != 0 ); 5622 } 5623 return 5624 aSign ? lt128( b.high, b.low, a.high, a.low ) 5625 : lt128( a.high, a.low, b.high, b.low ); 5626 5627 } 5628 5629 /*---------------------------------------------------------------------------- 5630 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5631 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5632 | The comparison is performed according to the IEC/IEEE Standard for Binary 5633 | Floating-Point Arithmetic. 5634 *----------------------------------------------------------------------------*/ 5635 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5636 { 5637 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5638 float_raise(float_flag_invalid, status); 5639 return 1; 5640 } 5641 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5642 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5643 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5644 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5645 ) { 5646 if (floatx80_is_signaling_nan(a, status) 5647 || floatx80_is_signaling_nan(b, status)) { 5648 float_raise(float_flag_invalid, status); 5649 } 5650 return 1; 5651 } 5652 return 0; 5653 } 5654 5655 /*---------------------------------------------------------------------------- 5656 | Returns the result of converting the quadruple-precision floating-point 5657 | value `a' to the 32-bit two's complement integer format. The conversion 5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5659 | Arithmetic---which means in particular that the conversion is rounded 5660 | according to the current rounding mode. If `a' is a NaN, the largest 5661 | positive integer is returned. Otherwise, if the conversion overflows, the 5662 | largest integer with the same sign as `a' is returned. 5663 *----------------------------------------------------------------------------*/ 5664 5665 int32_t float128_to_int32(float128 a, float_status *status) 5666 { 5667 flag aSign; 5668 int32_t aExp, shiftCount; 5669 uint64_t aSig0, aSig1; 5670 5671 aSig1 = extractFloat128Frac1( a ); 5672 aSig0 = extractFloat128Frac0( a ); 5673 aExp = extractFloat128Exp( a ); 5674 aSign = extractFloat128Sign( a ); 5675 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5676 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5677 aSig0 |= ( aSig1 != 0 ); 5678 shiftCount = 0x4028 - aExp; 5679 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5680 return roundAndPackInt32(aSign, aSig0, status); 5681 5682 } 5683 5684 /*---------------------------------------------------------------------------- 5685 | Returns the result of converting the quadruple-precision floating-point 5686 | value `a' to the 32-bit two's complement integer format. The conversion 5687 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5688 | Arithmetic, except that the conversion is always rounded toward zero. If 5689 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5690 | conversion overflows, the largest integer with the same sign as `a' is 5691 | returned. 5692 *----------------------------------------------------------------------------*/ 5693 5694 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5695 { 5696 flag aSign; 5697 int32_t aExp, shiftCount; 5698 uint64_t aSig0, aSig1, savedASig; 5699 int32_t z; 5700 5701 aSig1 = extractFloat128Frac1( a ); 5702 aSig0 = extractFloat128Frac0( a ); 5703 aExp = extractFloat128Exp( a ); 5704 aSign = extractFloat128Sign( a ); 5705 aSig0 |= ( aSig1 != 0 ); 5706 if ( 0x401E < aExp ) { 5707 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5708 goto invalid; 5709 } 5710 else if ( aExp < 0x3FFF ) { 5711 if (aExp || aSig0) { 5712 status->float_exception_flags |= float_flag_inexact; 5713 } 5714 return 0; 5715 } 5716 aSig0 |= LIT64( 0x0001000000000000 ); 5717 shiftCount = 0x402F - aExp; 5718 savedASig = aSig0; 5719 aSig0 >>= shiftCount; 5720 z = aSig0; 5721 if ( aSign ) z = - z; 5722 if ( ( z < 0 ) ^ aSign ) { 5723 invalid: 5724 float_raise(float_flag_invalid, status); 5725 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5726 } 5727 if ( ( aSig0<<shiftCount ) != savedASig ) { 5728 status->float_exception_flags |= float_flag_inexact; 5729 } 5730 return z; 5731 5732 } 5733 5734 /*---------------------------------------------------------------------------- 5735 | Returns the result of converting the quadruple-precision floating-point 5736 | value `a' to the 64-bit two's complement integer format. The conversion 5737 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5738 | Arithmetic---which means in particular that the conversion is rounded 5739 | according to the current rounding mode. If `a' is a NaN, the largest 5740 | positive integer is returned. Otherwise, if the conversion overflows, the 5741 | largest integer with the same sign as `a' is returned. 5742 *----------------------------------------------------------------------------*/ 5743 5744 int64_t float128_to_int64(float128 a, float_status *status) 5745 { 5746 flag aSign; 5747 int32_t aExp, shiftCount; 5748 uint64_t aSig0, aSig1; 5749 5750 aSig1 = extractFloat128Frac1( a ); 5751 aSig0 = extractFloat128Frac0( a ); 5752 aExp = extractFloat128Exp( a ); 5753 aSign = extractFloat128Sign( a ); 5754 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5755 shiftCount = 0x402F - aExp; 5756 if ( shiftCount <= 0 ) { 5757 if ( 0x403E < aExp ) { 5758 float_raise(float_flag_invalid, status); 5759 if ( ! aSign 5760 || ( ( aExp == 0x7FFF ) 5761 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5762 ) 5763 ) { 5764 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5765 } 5766 return (int64_t) LIT64( 0x8000000000000000 ); 5767 } 5768 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5769 } 5770 else { 5771 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5772 } 5773 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5774 5775 } 5776 5777 /*---------------------------------------------------------------------------- 5778 | Returns the result of converting the quadruple-precision floating-point 5779 | value `a' to the 64-bit two's complement integer format. The conversion 5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5781 | Arithmetic, except that the conversion is always rounded toward zero. 5782 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5783 | the conversion overflows, the largest integer with the same sign as `a' is 5784 | returned. 5785 *----------------------------------------------------------------------------*/ 5786 5787 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5788 { 5789 flag aSign; 5790 int32_t aExp, shiftCount; 5791 uint64_t aSig0, aSig1; 5792 int64_t z; 5793 5794 aSig1 = extractFloat128Frac1( a ); 5795 aSig0 = extractFloat128Frac0( a ); 5796 aExp = extractFloat128Exp( a ); 5797 aSign = extractFloat128Sign( a ); 5798 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5799 shiftCount = aExp - 0x402F; 5800 if ( 0 < shiftCount ) { 5801 if ( 0x403E <= aExp ) { 5802 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5803 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5804 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5805 if (aSig1) { 5806 status->float_exception_flags |= float_flag_inexact; 5807 } 5808 } 5809 else { 5810 float_raise(float_flag_invalid, status); 5811 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5812 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5813 } 5814 } 5815 return (int64_t) LIT64( 0x8000000000000000 ); 5816 } 5817 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5818 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5819 status->float_exception_flags |= float_flag_inexact; 5820 } 5821 } 5822 else { 5823 if ( aExp < 0x3FFF ) { 5824 if ( aExp | aSig0 | aSig1 ) { 5825 status->float_exception_flags |= float_flag_inexact; 5826 } 5827 return 0; 5828 } 5829 z = aSig0>>( - shiftCount ); 5830 if ( aSig1 5831 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5832 status->float_exception_flags |= float_flag_inexact; 5833 } 5834 } 5835 if ( aSign ) z = - z; 5836 return z; 5837 5838 } 5839 5840 /*---------------------------------------------------------------------------- 5841 | Returns the result of converting the quadruple-precision floating-point value 5842 | `a' to the 64-bit unsigned integer format. The conversion is 5843 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5844 | Arithmetic---which means in particular that the conversion is rounded 5845 | according to the current rounding mode. If `a' is a NaN, the largest 5846 | positive integer is returned. If the conversion overflows, the 5847 | largest unsigned integer is returned. If 'a' is negative, the value is 5848 | rounded and zero is returned; negative values that do not round to zero 5849 | will raise the inexact exception. 5850 *----------------------------------------------------------------------------*/ 5851 5852 uint64_t float128_to_uint64(float128 a, float_status *status) 5853 { 5854 flag aSign; 5855 int aExp; 5856 int shiftCount; 5857 uint64_t aSig0, aSig1; 5858 5859 aSig0 = extractFloat128Frac0(a); 5860 aSig1 = extractFloat128Frac1(a); 5861 aExp = extractFloat128Exp(a); 5862 aSign = extractFloat128Sign(a); 5863 if (aSign && (aExp > 0x3FFE)) { 5864 float_raise(float_flag_invalid, status); 5865 if (float128_is_any_nan(a)) { 5866 return LIT64(0xFFFFFFFFFFFFFFFF); 5867 } else { 5868 return 0; 5869 } 5870 } 5871 if (aExp) { 5872 aSig0 |= LIT64(0x0001000000000000); 5873 } 5874 shiftCount = 0x402F - aExp; 5875 if (shiftCount <= 0) { 5876 if (0x403E < aExp) { 5877 float_raise(float_flag_invalid, status); 5878 return LIT64(0xFFFFFFFFFFFFFFFF); 5879 } 5880 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5881 } else { 5882 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5883 } 5884 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5885 } 5886 5887 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5888 { 5889 uint64_t v; 5890 signed char current_rounding_mode = status->float_rounding_mode; 5891 5892 set_float_rounding_mode(float_round_to_zero, status); 5893 v = float128_to_uint64(a, status); 5894 set_float_rounding_mode(current_rounding_mode, status); 5895 5896 return v; 5897 } 5898 5899 /*---------------------------------------------------------------------------- 5900 | Returns the result of converting the quadruple-precision floating-point 5901 | value `a' to the 32-bit unsigned integer format. The conversion 5902 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5903 | Arithmetic except that the conversion is always rounded toward zero. 5904 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5905 | if the conversion overflows, the largest unsigned integer is returned. 5906 | If 'a' is negative, the value is rounded and zero is returned; negative 5907 | values that do not round to zero will raise the inexact exception. 5908 *----------------------------------------------------------------------------*/ 5909 5910 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5911 { 5912 uint64_t v; 5913 uint32_t res; 5914 int old_exc_flags = get_float_exception_flags(status); 5915 5916 v = float128_to_uint64_round_to_zero(a, status); 5917 if (v > 0xffffffff) { 5918 res = 0xffffffff; 5919 } else { 5920 return v; 5921 } 5922 set_float_exception_flags(old_exc_flags, status); 5923 float_raise(float_flag_invalid, status); 5924 return res; 5925 } 5926 5927 /*---------------------------------------------------------------------------- 5928 | Returns the result of converting the quadruple-precision floating-point 5929 | value `a' to the single-precision floating-point format. The conversion 5930 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5931 | Arithmetic. 5932 *----------------------------------------------------------------------------*/ 5933 5934 float32 float128_to_float32(float128 a, float_status *status) 5935 { 5936 flag aSign; 5937 int32_t aExp; 5938 uint64_t aSig0, aSig1; 5939 uint32_t zSig; 5940 5941 aSig1 = extractFloat128Frac1( a ); 5942 aSig0 = extractFloat128Frac0( a ); 5943 aExp = extractFloat128Exp( a ); 5944 aSign = extractFloat128Sign( a ); 5945 if ( aExp == 0x7FFF ) { 5946 if ( aSig0 | aSig1 ) { 5947 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5948 } 5949 return packFloat32( aSign, 0xFF, 0 ); 5950 } 5951 aSig0 |= ( aSig1 != 0 ); 5952 shift64RightJamming( aSig0, 18, &aSig0 ); 5953 zSig = aSig0; 5954 if ( aExp || zSig ) { 5955 zSig |= 0x40000000; 5956 aExp -= 0x3F81; 5957 } 5958 return roundAndPackFloat32(aSign, aExp, zSig, status); 5959 5960 } 5961 5962 /*---------------------------------------------------------------------------- 5963 | Returns the result of converting the quadruple-precision floating-point 5964 | value `a' to the double-precision floating-point format. The conversion 5965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5966 | Arithmetic. 5967 *----------------------------------------------------------------------------*/ 5968 5969 float64 float128_to_float64(float128 a, float_status *status) 5970 { 5971 flag aSign; 5972 int32_t aExp; 5973 uint64_t aSig0, aSig1; 5974 5975 aSig1 = extractFloat128Frac1( a ); 5976 aSig0 = extractFloat128Frac0( a ); 5977 aExp = extractFloat128Exp( a ); 5978 aSign = extractFloat128Sign( a ); 5979 if ( aExp == 0x7FFF ) { 5980 if ( aSig0 | aSig1 ) { 5981 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5982 } 5983 return packFloat64( aSign, 0x7FF, 0 ); 5984 } 5985 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5986 aSig0 |= ( aSig1 != 0 ); 5987 if ( aExp || aSig0 ) { 5988 aSig0 |= LIT64( 0x4000000000000000 ); 5989 aExp -= 0x3C01; 5990 } 5991 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5992 5993 } 5994 5995 /*---------------------------------------------------------------------------- 5996 | Returns the result of converting the quadruple-precision floating-point 5997 | value `a' to the extended double-precision floating-point format. The 5998 | conversion is performed according to the IEC/IEEE Standard for Binary 5999 | Floating-Point Arithmetic. 6000 *----------------------------------------------------------------------------*/ 6001 6002 floatx80 float128_to_floatx80(float128 a, float_status *status) 6003 { 6004 flag aSign; 6005 int32_t aExp; 6006 uint64_t aSig0, aSig1; 6007 6008 aSig1 = extractFloat128Frac1( a ); 6009 aSig0 = extractFloat128Frac0( a ); 6010 aExp = extractFloat128Exp( a ); 6011 aSign = extractFloat128Sign( a ); 6012 if ( aExp == 0x7FFF ) { 6013 if ( aSig0 | aSig1 ) { 6014 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6015 } 6016 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6017 } 6018 if ( aExp == 0 ) { 6019 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6020 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6021 } 6022 else { 6023 aSig0 |= LIT64( 0x0001000000000000 ); 6024 } 6025 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6026 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6027 6028 } 6029 6030 /*---------------------------------------------------------------------------- 6031 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6032 | returns the result as a quadruple-precision floating-point value. The 6033 | operation is performed according to the IEC/IEEE Standard for Binary 6034 | Floating-Point Arithmetic. 6035 *----------------------------------------------------------------------------*/ 6036 6037 float128 float128_round_to_int(float128 a, float_status *status) 6038 { 6039 flag aSign; 6040 int32_t aExp; 6041 uint64_t lastBitMask, roundBitsMask; 6042 float128 z; 6043 6044 aExp = extractFloat128Exp( a ); 6045 if ( 0x402F <= aExp ) { 6046 if ( 0x406F <= aExp ) { 6047 if ( ( aExp == 0x7FFF ) 6048 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6049 ) { 6050 return propagateFloat128NaN(a, a, status); 6051 } 6052 return a; 6053 } 6054 lastBitMask = 1; 6055 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6056 roundBitsMask = lastBitMask - 1; 6057 z = a; 6058 switch (status->float_rounding_mode) { 6059 case float_round_nearest_even: 6060 if ( lastBitMask ) { 6061 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6062 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6063 } 6064 else { 6065 if ( (int64_t) z.low < 0 ) { 6066 ++z.high; 6067 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6068 } 6069 } 6070 break; 6071 case float_round_ties_away: 6072 if (lastBitMask) { 6073 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6074 } else { 6075 if ((int64_t) z.low < 0) { 6076 ++z.high; 6077 } 6078 } 6079 break; 6080 case float_round_to_zero: 6081 break; 6082 case float_round_up: 6083 if (!extractFloat128Sign(z)) { 6084 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6085 } 6086 break; 6087 case float_round_down: 6088 if (extractFloat128Sign(z)) { 6089 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6090 } 6091 break; 6092 default: 6093 abort(); 6094 } 6095 z.low &= ~ roundBitsMask; 6096 } 6097 else { 6098 if ( aExp < 0x3FFF ) { 6099 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6100 status->float_exception_flags |= float_flag_inexact; 6101 aSign = extractFloat128Sign( a ); 6102 switch (status->float_rounding_mode) { 6103 case float_round_nearest_even: 6104 if ( ( aExp == 0x3FFE ) 6105 && ( extractFloat128Frac0( a ) 6106 | extractFloat128Frac1( a ) ) 6107 ) { 6108 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6109 } 6110 break; 6111 case float_round_ties_away: 6112 if (aExp == 0x3FFE) { 6113 return packFloat128(aSign, 0x3FFF, 0, 0); 6114 } 6115 break; 6116 case float_round_down: 6117 return 6118 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6119 : packFloat128( 0, 0, 0, 0 ); 6120 case float_round_up: 6121 return 6122 aSign ? packFloat128( 1, 0, 0, 0 ) 6123 : packFloat128( 0, 0x3FFF, 0, 0 ); 6124 } 6125 return packFloat128( aSign, 0, 0, 0 ); 6126 } 6127 lastBitMask = 1; 6128 lastBitMask <<= 0x402F - aExp; 6129 roundBitsMask = lastBitMask - 1; 6130 z.low = 0; 6131 z.high = a.high; 6132 switch (status->float_rounding_mode) { 6133 case float_round_nearest_even: 6134 z.high += lastBitMask>>1; 6135 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6136 z.high &= ~ lastBitMask; 6137 } 6138 break; 6139 case float_round_ties_away: 6140 z.high += lastBitMask>>1; 6141 break; 6142 case float_round_to_zero: 6143 break; 6144 case float_round_up: 6145 if (!extractFloat128Sign(z)) { 6146 z.high |= ( a.low != 0 ); 6147 z.high += roundBitsMask; 6148 } 6149 break; 6150 case float_round_down: 6151 if (extractFloat128Sign(z)) { 6152 z.high |= (a.low != 0); 6153 z.high += roundBitsMask; 6154 } 6155 break; 6156 default: 6157 abort(); 6158 } 6159 z.high &= ~ roundBitsMask; 6160 } 6161 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6162 status->float_exception_flags |= float_flag_inexact; 6163 } 6164 return z; 6165 6166 } 6167 6168 /*---------------------------------------------------------------------------- 6169 | Returns the result of adding the absolute values of the quadruple-precision 6170 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6171 | before being returned. `zSign' is ignored if the result is a NaN. 6172 | The addition is performed according to the IEC/IEEE Standard for Binary 6173 | Floating-Point Arithmetic. 6174 *----------------------------------------------------------------------------*/ 6175 6176 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6177 float_status *status) 6178 { 6179 int32_t aExp, bExp, zExp; 6180 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6181 int32_t expDiff; 6182 6183 aSig1 = extractFloat128Frac1( a ); 6184 aSig0 = extractFloat128Frac0( a ); 6185 aExp = extractFloat128Exp( a ); 6186 bSig1 = extractFloat128Frac1( b ); 6187 bSig0 = extractFloat128Frac0( b ); 6188 bExp = extractFloat128Exp( b ); 6189 expDiff = aExp - bExp; 6190 if ( 0 < expDiff ) { 6191 if ( aExp == 0x7FFF ) { 6192 if (aSig0 | aSig1) { 6193 return propagateFloat128NaN(a, b, status); 6194 } 6195 return a; 6196 } 6197 if ( bExp == 0 ) { 6198 --expDiff; 6199 } 6200 else { 6201 bSig0 |= LIT64( 0x0001000000000000 ); 6202 } 6203 shift128ExtraRightJamming( 6204 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6205 zExp = aExp; 6206 } 6207 else if ( expDiff < 0 ) { 6208 if ( bExp == 0x7FFF ) { 6209 if (bSig0 | bSig1) { 6210 return propagateFloat128NaN(a, b, status); 6211 } 6212 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6213 } 6214 if ( aExp == 0 ) { 6215 ++expDiff; 6216 } 6217 else { 6218 aSig0 |= LIT64( 0x0001000000000000 ); 6219 } 6220 shift128ExtraRightJamming( 6221 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6222 zExp = bExp; 6223 } 6224 else { 6225 if ( aExp == 0x7FFF ) { 6226 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6227 return propagateFloat128NaN(a, b, status); 6228 } 6229 return a; 6230 } 6231 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6232 if ( aExp == 0 ) { 6233 if (status->flush_to_zero) { 6234 if (zSig0 | zSig1) { 6235 float_raise(float_flag_output_denormal, status); 6236 } 6237 return packFloat128(zSign, 0, 0, 0); 6238 } 6239 return packFloat128( zSign, 0, zSig0, zSig1 ); 6240 } 6241 zSig2 = 0; 6242 zSig0 |= LIT64( 0x0002000000000000 ); 6243 zExp = aExp; 6244 goto shiftRight1; 6245 } 6246 aSig0 |= LIT64( 0x0001000000000000 ); 6247 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6248 --zExp; 6249 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6250 ++zExp; 6251 shiftRight1: 6252 shift128ExtraRightJamming( 6253 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6254 roundAndPack: 6255 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6256 6257 } 6258 6259 /*---------------------------------------------------------------------------- 6260 | Returns the result of subtracting the absolute values of the quadruple- 6261 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6262 | difference is negated before being returned. `zSign' is ignored if the 6263 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6264 | Standard for Binary Floating-Point Arithmetic. 6265 *----------------------------------------------------------------------------*/ 6266 6267 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6268 float_status *status) 6269 { 6270 int32_t aExp, bExp, zExp; 6271 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6272 int32_t expDiff; 6273 6274 aSig1 = extractFloat128Frac1( a ); 6275 aSig0 = extractFloat128Frac0( a ); 6276 aExp = extractFloat128Exp( a ); 6277 bSig1 = extractFloat128Frac1( b ); 6278 bSig0 = extractFloat128Frac0( b ); 6279 bExp = extractFloat128Exp( b ); 6280 expDiff = aExp - bExp; 6281 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6282 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6283 if ( 0 < expDiff ) goto aExpBigger; 6284 if ( expDiff < 0 ) goto bExpBigger; 6285 if ( aExp == 0x7FFF ) { 6286 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6287 return propagateFloat128NaN(a, b, status); 6288 } 6289 float_raise(float_flag_invalid, status); 6290 return float128_default_nan(status); 6291 } 6292 if ( aExp == 0 ) { 6293 aExp = 1; 6294 bExp = 1; 6295 } 6296 if ( bSig0 < aSig0 ) goto aBigger; 6297 if ( aSig0 < bSig0 ) goto bBigger; 6298 if ( bSig1 < aSig1 ) goto aBigger; 6299 if ( aSig1 < bSig1 ) goto bBigger; 6300 return packFloat128(status->float_rounding_mode == float_round_down, 6301 0, 0, 0); 6302 bExpBigger: 6303 if ( bExp == 0x7FFF ) { 6304 if (bSig0 | bSig1) { 6305 return propagateFloat128NaN(a, b, status); 6306 } 6307 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6308 } 6309 if ( aExp == 0 ) { 6310 ++expDiff; 6311 } 6312 else { 6313 aSig0 |= LIT64( 0x4000000000000000 ); 6314 } 6315 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6316 bSig0 |= LIT64( 0x4000000000000000 ); 6317 bBigger: 6318 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6319 zExp = bExp; 6320 zSign ^= 1; 6321 goto normalizeRoundAndPack; 6322 aExpBigger: 6323 if ( aExp == 0x7FFF ) { 6324 if (aSig0 | aSig1) { 6325 return propagateFloat128NaN(a, b, status); 6326 } 6327 return a; 6328 } 6329 if ( bExp == 0 ) { 6330 --expDiff; 6331 } 6332 else { 6333 bSig0 |= LIT64( 0x4000000000000000 ); 6334 } 6335 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6336 aSig0 |= LIT64( 0x4000000000000000 ); 6337 aBigger: 6338 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6339 zExp = aExp; 6340 normalizeRoundAndPack: 6341 --zExp; 6342 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6343 status); 6344 6345 } 6346 6347 /*---------------------------------------------------------------------------- 6348 | Returns the result of adding the quadruple-precision floating-point values 6349 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6350 | for Binary Floating-Point Arithmetic. 6351 *----------------------------------------------------------------------------*/ 6352 6353 float128 float128_add(float128 a, float128 b, float_status *status) 6354 { 6355 flag aSign, bSign; 6356 6357 aSign = extractFloat128Sign( a ); 6358 bSign = extractFloat128Sign( b ); 6359 if ( aSign == bSign ) { 6360 return addFloat128Sigs(a, b, aSign, status); 6361 } 6362 else { 6363 return subFloat128Sigs(a, b, aSign, status); 6364 } 6365 6366 } 6367 6368 /*---------------------------------------------------------------------------- 6369 | Returns the result of subtracting the quadruple-precision floating-point 6370 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6371 | Standard for Binary Floating-Point Arithmetic. 6372 *----------------------------------------------------------------------------*/ 6373 6374 float128 float128_sub(float128 a, float128 b, float_status *status) 6375 { 6376 flag aSign, bSign; 6377 6378 aSign = extractFloat128Sign( a ); 6379 bSign = extractFloat128Sign( b ); 6380 if ( aSign == bSign ) { 6381 return subFloat128Sigs(a, b, aSign, status); 6382 } 6383 else { 6384 return addFloat128Sigs(a, b, aSign, status); 6385 } 6386 6387 } 6388 6389 /*---------------------------------------------------------------------------- 6390 | Returns the result of multiplying the quadruple-precision floating-point 6391 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6392 | Standard for Binary Floating-Point Arithmetic. 6393 *----------------------------------------------------------------------------*/ 6394 6395 float128 float128_mul(float128 a, float128 b, float_status *status) 6396 { 6397 flag aSign, bSign, zSign; 6398 int32_t aExp, bExp, zExp; 6399 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6400 6401 aSig1 = extractFloat128Frac1( a ); 6402 aSig0 = extractFloat128Frac0( a ); 6403 aExp = extractFloat128Exp( a ); 6404 aSign = extractFloat128Sign( a ); 6405 bSig1 = extractFloat128Frac1( b ); 6406 bSig0 = extractFloat128Frac0( b ); 6407 bExp = extractFloat128Exp( b ); 6408 bSign = extractFloat128Sign( b ); 6409 zSign = aSign ^ bSign; 6410 if ( aExp == 0x7FFF ) { 6411 if ( ( aSig0 | aSig1 ) 6412 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6413 return propagateFloat128NaN(a, b, status); 6414 } 6415 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6416 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6417 } 6418 if ( bExp == 0x7FFF ) { 6419 if (bSig0 | bSig1) { 6420 return propagateFloat128NaN(a, b, status); 6421 } 6422 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6423 invalid: 6424 float_raise(float_flag_invalid, status); 6425 return float128_default_nan(status); 6426 } 6427 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6428 } 6429 if ( aExp == 0 ) { 6430 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6431 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6432 } 6433 if ( bExp == 0 ) { 6434 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6435 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6436 } 6437 zExp = aExp + bExp - 0x4000; 6438 aSig0 |= LIT64( 0x0001000000000000 ); 6439 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6440 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6441 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6442 zSig2 |= ( zSig3 != 0 ); 6443 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6444 shift128ExtraRightJamming( 6445 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6446 ++zExp; 6447 } 6448 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6449 6450 } 6451 6452 /*---------------------------------------------------------------------------- 6453 | Returns the result of dividing the quadruple-precision floating-point value 6454 | `a' by the corresponding value `b'. The operation is performed according to 6455 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6456 *----------------------------------------------------------------------------*/ 6457 6458 float128 float128_div(float128 a, float128 b, float_status *status) 6459 { 6460 flag aSign, bSign, zSign; 6461 int32_t aExp, bExp, zExp; 6462 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6463 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6464 6465 aSig1 = extractFloat128Frac1( a ); 6466 aSig0 = extractFloat128Frac0( a ); 6467 aExp = extractFloat128Exp( a ); 6468 aSign = extractFloat128Sign( a ); 6469 bSig1 = extractFloat128Frac1( b ); 6470 bSig0 = extractFloat128Frac0( b ); 6471 bExp = extractFloat128Exp( b ); 6472 bSign = extractFloat128Sign( b ); 6473 zSign = aSign ^ bSign; 6474 if ( aExp == 0x7FFF ) { 6475 if (aSig0 | aSig1) { 6476 return propagateFloat128NaN(a, b, status); 6477 } 6478 if ( bExp == 0x7FFF ) { 6479 if (bSig0 | bSig1) { 6480 return propagateFloat128NaN(a, b, status); 6481 } 6482 goto invalid; 6483 } 6484 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6485 } 6486 if ( bExp == 0x7FFF ) { 6487 if (bSig0 | bSig1) { 6488 return propagateFloat128NaN(a, b, status); 6489 } 6490 return packFloat128( zSign, 0, 0, 0 ); 6491 } 6492 if ( bExp == 0 ) { 6493 if ( ( bSig0 | bSig1 ) == 0 ) { 6494 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6495 invalid: 6496 float_raise(float_flag_invalid, status); 6497 return float128_default_nan(status); 6498 } 6499 float_raise(float_flag_divbyzero, status); 6500 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6501 } 6502 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6503 } 6504 if ( aExp == 0 ) { 6505 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6506 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6507 } 6508 zExp = aExp - bExp + 0x3FFD; 6509 shortShift128Left( 6510 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6511 shortShift128Left( 6512 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6513 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6514 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6515 ++zExp; 6516 } 6517 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6518 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6519 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6520 while ( (int64_t) rem0 < 0 ) { 6521 --zSig0; 6522 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6523 } 6524 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6525 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6526 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6527 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6528 while ( (int64_t) rem1 < 0 ) { 6529 --zSig1; 6530 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6531 } 6532 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6533 } 6534 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6535 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6536 6537 } 6538 6539 /*---------------------------------------------------------------------------- 6540 | Returns the remainder of the quadruple-precision floating-point value `a' 6541 | with respect to the corresponding value `b'. The operation is performed 6542 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6543 *----------------------------------------------------------------------------*/ 6544 6545 float128 float128_rem(float128 a, float128 b, float_status *status) 6546 { 6547 flag aSign, zSign; 6548 int32_t aExp, bExp, expDiff; 6549 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6550 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6551 int64_t sigMean0; 6552 6553 aSig1 = extractFloat128Frac1( a ); 6554 aSig0 = extractFloat128Frac0( a ); 6555 aExp = extractFloat128Exp( a ); 6556 aSign = extractFloat128Sign( a ); 6557 bSig1 = extractFloat128Frac1( b ); 6558 bSig0 = extractFloat128Frac0( b ); 6559 bExp = extractFloat128Exp( b ); 6560 if ( aExp == 0x7FFF ) { 6561 if ( ( aSig0 | aSig1 ) 6562 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6563 return propagateFloat128NaN(a, b, status); 6564 } 6565 goto invalid; 6566 } 6567 if ( bExp == 0x7FFF ) { 6568 if (bSig0 | bSig1) { 6569 return propagateFloat128NaN(a, b, status); 6570 } 6571 return a; 6572 } 6573 if ( bExp == 0 ) { 6574 if ( ( bSig0 | bSig1 ) == 0 ) { 6575 invalid: 6576 float_raise(float_flag_invalid, status); 6577 return float128_default_nan(status); 6578 } 6579 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6580 } 6581 if ( aExp == 0 ) { 6582 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6583 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6584 } 6585 expDiff = aExp - bExp; 6586 if ( expDiff < -1 ) return a; 6587 shortShift128Left( 6588 aSig0 | LIT64( 0x0001000000000000 ), 6589 aSig1, 6590 15 - ( expDiff < 0 ), 6591 &aSig0, 6592 &aSig1 6593 ); 6594 shortShift128Left( 6595 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6596 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6597 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6598 expDiff -= 64; 6599 while ( 0 < expDiff ) { 6600 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6601 q = ( 4 < q ) ? q - 4 : 0; 6602 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6603 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6604 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6605 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6606 expDiff -= 61; 6607 } 6608 if ( -64 < expDiff ) { 6609 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6610 q = ( 4 < q ) ? q - 4 : 0; 6611 q >>= - expDiff; 6612 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6613 expDiff += 52; 6614 if ( expDiff < 0 ) { 6615 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6616 } 6617 else { 6618 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6619 } 6620 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6621 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6622 } 6623 else { 6624 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6625 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6626 } 6627 do { 6628 alternateASig0 = aSig0; 6629 alternateASig1 = aSig1; 6630 ++q; 6631 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6632 } while ( 0 <= (int64_t) aSig0 ); 6633 add128( 6634 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6635 if ( ( sigMean0 < 0 ) 6636 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6637 aSig0 = alternateASig0; 6638 aSig1 = alternateASig1; 6639 } 6640 zSign = ( (int64_t) aSig0 < 0 ); 6641 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6642 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6643 status); 6644 } 6645 6646 /*---------------------------------------------------------------------------- 6647 | Returns the square root of the quadruple-precision floating-point value `a'. 6648 | The operation is performed according to the IEC/IEEE Standard for Binary 6649 | Floating-Point Arithmetic. 6650 *----------------------------------------------------------------------------*/ 6651 6652 float128 float128_sqrt(float128 a, float_status *status) 6653 { 6654 flag aSign; 6655 int32_t aExp, zExp; 6656 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6657 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6658 6659 aSig1 = extractFloat128Frac1( a ); 6660 aSig0 = extractFloat128Frac0( a ); 6661 aExp = extractFloat128Exp( a ); 6662 aSign = extractFloat128Sign( a ); 6663 if ( aExp == 0x7FFF ) { 6664 if (aSig0 | aSig1) { 6665 return propagateFloat128NaN(a, a, status); 6666 } 6667 if ( ! aSign ) return a; 6668 goto invalid; 6669 } 6670 if ( aSign ) { 6671 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6672 invalid: 6673 float_raise(float_flag_invalid, status); 6674 return float128_default_nan(status); 6675 } 6676 if ( aExp == 0 ) { 6677 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6678 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6679 } 6680 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6681 aSig0 |= LIT64( 0x0001000000000000 ); 6682 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6683 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6684 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6685 doubleZSig0 = zSig0<<1; 6686 mul64To128( zSig0, zSig0, &term0, &term1 ); 6687 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6688 while ( (int64_t) rem0 < 0 ) { 6689 --zSig0; 6690 doubleZSig0 -= 2; 6691 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6692 } 6693 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6694 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6695 if ( zSig1 == 0 ) zSig1 = 1; 6696 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6697 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6698 mul64To128( zSig1, zSig1, &term2, &term3 ); 6699 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6700 while ( (int64_t) rem1 < 0 ) { 6701 --zSig1; 6702 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6703 term3 |= 1; 6704 term2 |= doubleZSig0; 6705 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6706 } 6707 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6708 } 6709 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6710 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6711 6712 } 6713 6714 /*---------------------------------------------------------------------------- 6715 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6716 | the corresponding value `b', and 0 otherwise. The invalid exception is 6717 | raised if either operand is a NaN. Otherwise, the comparison is performed 6718 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6719 *----------------------------------------------------------------------------*/ 6720 6721 int float128_eq(float128 a, float128 b, float_status *status) 6722 { 6723 6724 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6725 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6726 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6727 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6728 ) { 6729 float_raise(float_flag_invalid, status); 6730 return 0; 6731 } 6732 return 6733 ( a.low == b.low ) 6734 && ( ( a.high == b.high ) 6735 || ( ( a.low == 0 ) 6736 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6737 ); 6738 6739 } 6740 6741 /*---------------------------------------------------------------------------- 6742 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6743 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6744 | exception is raised if either operand is a NaN. The comparison is performed 6745 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6746 *----------------------------------------------------------------------------*/ 6747 6748 int float128_le(float128 a, float128 b, float_status *status) 6749 { 6750 flag aSign, bSign; 6751 6752 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6753 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6754 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6755 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6756 ) { 6757 float_raise(float_flag_invalid, status); 6758 return 0; 6759 } 6760 aSign = extractFloat128Sign( a ); 6761 bSign = extractFloat128Sign( b ); 6762 if ( aSign != bSign ) { 6763 return 6764 aSign 6765 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6766 == 0 ); 6767 } 6768 return 6769 aSign ? le128( b.high, b.low, a.high, a.low ) 6770 : le128( a.high, a.low, b.high, b.low ); 6771 6772 } 6773 6774 /*---------------------------------------------------------------------------- 6775 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6776 | the corresponding value `b', and 0 otherwise. The invalid exception is 6777 | raised if either operand is a NaN. The comparison is performed according 6778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6779 *----------------------------------------------------------------------------*/ 6780 6781 int float128_lt(float128 a, float128 b, float_status *status) 6782 { 6783 flag aSign, bSign; 6784 6785 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6786 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6787 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6788 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6789 ) { 6790 float_raise(float_flag_invalid, status); 6791 return 0; 6792 } 6793 aSign = extractFloat128Sign( a ); 6794 bSign = extractFloat128Sign( b ); 6795 if ( aSign != bSign ) { 6796 return 6797 aSign 6798 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6799 != 0 ); 6800 } 6801 return 6802 aSign ? lt128( b.high, b.low, a.high, a.low ) 6803 : lt128( a.high, a.low, b.high, b.low ); 6804 6805 } 6806 6807 /*---------------------------------------------------------------------------- 6808 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6809 | be compared, and 0 otherwise. The invalid exception is raised if either 6810 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6811 | Standard for Binary Floating-Point Arithmetic. 6812 *----------------------------------------------------------------------------*/ 6813 6814 int float128_unordered(float128 a, float128 b, float_status *status) 6815 { 6816 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6817 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6818 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6819 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6820 ) { 6821 float_raise(float_flag_invalid, status); 6822 return 1; 6823 } 6824 return 0; 6825 } 6826 6827 /*---------------------------------------------------------------------------- 6828 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6829 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6830 | exception. The comparison is performed according to the IEC/IEEE Standard 6831 | for Binary Floating-Point Arithmetic. 6832 *----------------------------------------------------------------------------*/ 6833 6834 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6835 { 6836 6837 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6838 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6839 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6840 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6841 ) { 6842 if (float128_is_signaling_nan(a, status) 6843 || float128_is_signaling_nan(b, status)) { 6844 float_raise(float_flag_invalid, status); 6845 } 6846 return 0; 6847 } 6848 return 6849 ( a.low == b.low ) 6850 && ( ( a.high == b.high ) 6851 || ( ( a.low == 0 ) 6852 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6853 ); 6854 6855 } 6856 6857 /*---------------------------------------------------------------------------- 6858 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6859 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6860 | cause an exception. Otherwise, the comparison is performed according to the 6861 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6862 *----------------------------------------------------------------------------*/ 6863 6864 int float128_le_quiet(float128 a, float128 b, float_status *status) 6865 { 6866 flag aSign, bSign; 6867 6868 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6869 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6870 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6871 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6872 ) { 6873 if (float128_is_signaling_nan(a, status) 6874 || float128_is_signaling_nan(b, status)) { 6875 float_raise(float_flag_invalid, status); 6876 } 6877 return 0; 6878 } 6879 aSign = extractFloat128Sign( a ); 6880 bSign = extractFloat128Sign( b ); 6881 if ( aSign != bSign ) { 6882 return 6883 aSign 6884 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6885 == 0 ); 6886 } 6887 return 6888 aSign ? le128( b.high, b.low, a.high, a.low ) 6889 : le128( a.high, a.low, b.high, b.low ); 6890 6891 } 6892 6893 /*---------------------------------------------------------------------------- 6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6895 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6896 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6897 | Standard for Binary Floating-Point Arithmetic. 6898 *----------------------------------------------------------------------------*/ 6899 6900 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6901 { 6902 flag aSign, bSign; 6903 6904 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6905 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6906 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6907 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6908 ) { 6909 if (float128_is_signaling_nan(a, status) 6910 || float128_is_signaling_nan(b, status)) { 6911 float_raise(float_flag_invalid, status); 6912 } 6913 return 0; 6914 } 6915 aSign = extractFloat128Sign( a ); 6916 bSign = extractFloat128Sign( b ); 6917 if ( aSign != bSign ) { 6918 return 6919 aSign 6920 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6921 != 0 ); 6922 } 6923 return 6924 aSign ? lt128( b.high, b.low, a.high, a.low ) 6925 : lt128( a.high, a.low, b.high, b.low ); 6926 6927 } 6928 6929 /*---------------------------------------------------------------------------- 6930 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6931 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6932 | comparison is performed according to the IEC/IEEE Standard for Binary 6933 | Floating-Point Arithmetic. 6934 *----------------------------------------------------------------------------*/ 6935 6936 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6937 { 6938 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6939 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6940 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6941 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6942 ) { 6943 if (float128_is_signaling_nan(a, status) 6944 || float128_is_signaling_nan(b, status)) { 6945 float_raise(float_flag_invalid, status); 6946 } 6947 return 1; 6948 } 6949 return 0; 6950 } 6951 6952 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6953 int is_quiet, float_status *status) 6954 { 6955 flag aSign, bSign; 6956 6957 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6958 float_raise(float_flag_invalid, status); 6959 return float_relation_unordered; 6960 } 6961 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6962 ( extractFloatx80Frac( a )<<1 ) ) || 6963 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6964 ( extractFloatx80Frac( b )<<1 ) )) { 6965 if (!is_quiet || 6966 floatx80_is_signaling_nan(a, status) || 6967 floatx80_is_signaling_nan(b, status)) { 6968 float_raise(float_flag_invalid, status); 6969 } 6970 return float_relation_unordered; 6971 } 6972 aSign = extractFloatx80Sign( a ); 6973 bSign = extractFloatx80Sign( b ); 6974 if ( aSign != bSign ) { 6975 6976 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6977 ( ( a.low | b.low ) == 0 ) ) { 6978 /* zero case */ 6979 return float_relation_equal; 6980 } else { 6981 return 1 - (2 * aSign); 6982 } 6983 } else { 6984 if (a.low == b.low && a.high == b.high) { 6985 return float_relation_equal; 6986 } else { 6987 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6988 } 6989 } 6990 } 6991 6992 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6993 { 6994 return floatx80_compare_internal(a, b, 0, status); 6995 } 6996 6997 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6998 { 6999 return floatx80_compare_internal(a, b, 1, status); 7000 } 7001 7002 static inline int float128_compare_internal(float128 a, float128 b, 7003 int is_quiet, float_status *status) 7004 { 7005 flag aSign, bSign; 7006 7007 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7008 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7009 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7010 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7011 if (!is_quiet || 7012 float128_is_signaling_nan(a, status) || 7013 float128_is_signaling_nan(b, status)) { 7014 float_raise(float_flag_invalid, status); 7015 } 7016 return float_relation_unordered; 7017 } 7018 aSign = extractFloat128Sign( a ); 7019 bSign = extractFloat128Sign( b ); 7020 if ( aSign != bSign ) { 7021 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7022 /* zero case */ 7023 return float_relation_equal; 7024 } else { 7025 return 1 - (2 * aSign); 7026 } 7027 } else { 7028 if (a.low == b.low && a.high == b.high) { 7029 return float_relation_equal; 7030 } else { 7031 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7032 } 7033 } 7034 } 7035 7036 int float128_compare(float128 a, float128 b, float_status *status) 7037 { 7038 return float128_compare_internal(a, b, 0, status); 7039 } 7040 7041 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7042 { 7043 return float128_compare_internal(a, b, 1, status); 7044 } 7045 7046 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7047 { 7048 flag aSign; 7049 int32_t aExp; 7050 uint64_t aSig; 7051 7052 if (floatx80_invalid_encoding(a)) { 7053 float_raise(float_flag_invalid, status); 7054 return floatx80_default_nan(status); 7055 } 7056 aSig = extractFloatx80Frac( a ); 7057 aExp = extractFloatx80Exp( a ); 7058 aSign = extractFloatx80Sign( a ); 7059 7060 if ( aExp == 0x7FFF ) { 7061 if ( aSig<<1 ) { 7062 return propagateFloatx80NaN(a, a, status); 7063 } 7064 return a; 7065 } 7066 7067 if (aExp == 0) { 7068 if (aSig == 0) { 7069 return a; 7070 } 7071 aExp++; 7072 } 7073 7074 if (n > 0x10000) { 7075 n = 0x10000; 7076 } else if (n < -0x10000) { 7077 n = -0x10000; 7078 } 7079 7080 aExp += n; 7081 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7082 aSign, aExp, aSig, 0, status); 7083 } 7084 7085 float128 float128_scalbn(float128 a, int n, float_status *status) 7086 { 7087 flag aSign; 7088 int32_t aExp; 7089 uint64_t aSig0, aSig1; 7090 7091 aSig1 = extractFloat128Frac1( a ); 7092 aSig0 = extractFloat128Frac0( a ); 7093 aExp = extractFloat128Exp( a ); 7094 aSign = extractFloat128Sign( a ); 7095 if ( aExp == 0x7FFF ) { 7096 if ( aSig0 | aSig1 ) { 7097 return propagateFloat128NaN(a, a, status); 7098 } 7099 return a; 7100 } 7101 if (aExp != 0) { 7102 aSig0 |= LIT64( 0x0001000000000000 ); 7103 } else if (aSig0 == 0 && aSig1 == 0) { 7104 return a; 7105 } else { 7106 aExp++; 7107 } 7108 7109 if (n > 0x10000) { 7110 n = 0x10000; 7111 } else if (n < -0x10000) { 7112 n = -0x10000; 7113 } 7114 7115 aExp += n - 1; 7116 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7117 , status); 7118 7119 } 7120