1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/ 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 626 { 627 const int sign_pos = fmt.frac_size + fmt.exp_size; 628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 629 return deposit64(ret, sign_pos, 1, p.sign); 630 } 631 632 static inline float16 float16_pack_raw(FloatParts p) 633 { 634 return make_float16(pack_raw(float16_params, p)); 635 } 636 637 static inline float32 float32_pack_raw(FloatParts p) 638 { 639 return make_float32(pack_raw(float32_params, p)); 640 } 641 642 static inline float64 float64_pack_raw(FloatParts p) 643 { 644 return make_float64(pack_raw(float64_params, p)); 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Functions and definitions to determine: (1) whether tininess for underflow 649 | is detected before or after rounding by default, (2) what (if anything) 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 652 | are propagated from function inputs to output. These details are target- 653 | specific. 654 *----------------------------------------------------------------------------*/ 655 #include "softfloat-specialize.h" 656 657 /* Canonicalize EXP and FRAC, setting CLS. */ 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 659 float_status *status) 660 { 661 if (part.exp == parm->exp_max && !parm->arm_althp) { 662 if (part.frac == 0) { 663 part.cls = float_class_inf; 664 } else { 665 part.frac <<= parm->frac_shift; 666 part.cls = (parts_is_snan_frac(part.frac, status) 667 ? 
float_class_snan : float_class_qnan); 668 } 669 } else if (part.exp == 0) { 670 if (likely(part.frac == 0)) { 671 part.cls = float_class_zero; 672 } else if (status->flush_inputs_to_zero) { 673 float_raise(float_flag_input_denormal, status); 674 part.cls = float_class_zero; 675 part.frac = 0; 676 } else { 677 int shift = clz64(part.frac) - 1; 678 part.cls = float_class_normal; 679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 680 part.frac <<= shift; 681 } 682 } else { 683 part.cls = float_class_normal; 684 part.exp -= parm->exp_bias; 685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 686 } 687 return part; 688 } 689 690 /* Round and uncanonicalize a floating-point number by parts. There 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the 692 * fraction; these bits will be removed. The exponent will be biased 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 694 */ 695 696 static FloatParts round_canonical(FloatParts p, float_status *s, 697 const FloatFmt *parm) 698 { 699 const uint64_t frac_lsb = parm->frac_lsb; 700 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 701 const uint64_t round_mask = parm->round_mask; 702 const uint64_t roundeven_mask = parm->roundeven_mask; 703 const int exp_max = parm->exp_max; 704 const int frac_shift = parm->frac_shift; 705 uint64_t frac, inc; 706 int exp, flags = 0; 707 bool overflow_norm; 708 709 frac = p.frac; 710 exp = p.exp; 711 712 switch (p.cls) { 713 case float_class_normal: 714 switch (s->float_rounding_mode) { 715 case float_round_nearest_even: 716 overflow_norm = false; 717 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 718 break; 719 case float_round_ties_away: 720 overflow_norm = false; 721 inc = frac_lsbm1; 722 break; 723 case float_round_to_zero: 724 overflow_norm = true; 725 inc = 0; 726 break; 727 case float_round_up: 728 inc = p.sign ? 0 : round_mask; 729 overflow_norm = p.sign; 730 break; 731 case float_round_down: 732 inc = p.sign ? 
round_mask : 0; 733 overflow_norm = !p.sign; 734 break; 735 case float_round_to_odd: 736 overflow_norm = true; 737 inc = frac & frac_lsb ? 0 : round_mask; 738 break; 739 default: 740 g_assert_not_reached(); 741 } 742 743 exp += parm->exp_bias; 744 if (likely(exp > 0)) { 745 if (frac & round_mask) { 746 flags |= float_flag_inexact; 747 frac += inc; 748 if (frac & DECOMPOSED_OVERFLOW_BIT) { 749 frac >>= 1; 750 exp++; 751 } 752 } 753 frac >>= frac_shift; 754 755 if (parm->arm_althp) { 756 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 757 if (unlikely(exp > exp_max)) { 758 /* Overflow. Return the maximum normal. */ 759 flags = float_flag_invalid; 760 exp = exp_max; 761 frac = -1; 762 } 763 } else if (unlikely(exp >= exp_max)) { 764 flags |= float_flag_overflow | float_flag_inexact; 765 if (overflow_norm) { 766 exp = exp_max - 1; 767 frac = -1; 768 } else { 769 p.cls = float_class_inf; 770 goto do_inf; 771 } 772 } 773 } else if (s->flush_to_zero) { 774 flags |= float_flag_output_denormal; 775 p.cls = float_class_zero; 776 goto do_zero; 777 } else { 778 bool is_tiny = (s->float_detect_tininess 779 == float_tininess_before_rounding) 780 || (exp < 0) 781 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 782 783 shift64RightJamming(frac, 1 - exp, &frac); 784 if (frac & round_mask) { 785 /* Need to recompute round-to-even. */ 786 switch (s->float_rounding_mode) { 787 case float_round_nearest_even: 788 inc = ((frac & roundeven_mask) != frac_lsbm1 789 ? frac_lsbm1 : 0); 790 break; 791 case float_round_to_odd: 792 inc = frac & frac_lsb ? 0 : round_mask; 793 break; 794 } 795 flags |= float_flag_inexact; 796 frac += inc; 797 } 798 799 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 800 frac >>= frac_shift; 801 802 if (is_tiny && (flags & float_flag_inexact)) { 803 flags |= float_flag_underflow; 804 } 805 if (exp == 0 && frac == 0) { 806 p.cls = float_class_zero; 807 } 808 } 809 break; 810 811 case float_class_zero: 812 do_zero: 813 exp = 0; 814 frac = 0; 815 break; 816 817 case float_class_inf: 818 do_inf: 819 assert(!parm->arm_althp); 820 exp = exp_max; 821 frac = 0; 822 break; 823 824 case float_class_qnan: 825 case float_class_snan: 826 assert(!parm->arm_althp); 827 exp = exp_max; 828 frac >>= parm->frac_shift; 829 break; 830 831 default: 832 g_assert_not_reached(); 833 } 834 835 float_raise(flags, s); 836 p.exp = exp; 837 p.frac = frac; 838 return p; 839 } 840 841 /* Explicit FloatFmt version */ 842 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 843 const FloatFmt *params) 844 { 845 return sf_canonicalize(float16_unpack_raw(f), params, s); 846 } 847 848 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 849 { 850 return float16a_unpack_canonical(f, s, &float16_params); 851 } 852 853 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 854 const FloatFmt *params) 855 { 856 return float16_pack_raw(round_canonical(p, s, params)); 857 } 858 859 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 860 { 861 return float16a_round_pack_canonical(p, s, &float16_params); 862 } 863 864 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 865 { 866 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 867 } 868 869 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 870 { 871 return float32_pack_raw(round_canonical(p, s, &float32_params)); 872 } 873 874 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 875 { 876 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 877 } 878 879 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 880 { 881 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 882 } 883 884 static FloatParts return_nan(FloatParts a, float_status *s) 885 { 886 switch (a.cls) { 887 case float_class_snan: 888 s->float_exception_flags |= float_flag_invalid; 889 a = parts_silence_nan(a, s); 890 /* fall through */ 891 case float_class_qnan: 892 if (s->default_nan_mode) { 893 return parts_default_nan(s); 894 } 895 break; 896 897 default: 898 g_assert_not_reached(); 899 } 900 return a; 901 } 902 903 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 904 { 905 if (is_snan(a.cls) || is_snan(b.cls)) { 906 s->float_exception_flags |= float_flag_invalid; 907 } 908 909 if (s->default_nan_mode) { 910 return parts_default_nan(s); 911 } else { 912 if (pickNaN(a.cls, b.cls, 913 a.frac > b.frac || 914 (a.frac == b.frac && a.sign < b.sign))) { 915 a = b; 916 } 917 if (is_snan(a.cls)) { 918 return parts_silence_nan(a, s); 919 } 920 } 921 return a; 922 } 923 924 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 925 bool inf_zero, float_status *s) 926 { 927 int which; 928 929 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 930 s->float_exception_flags |= float_flag_invalid; 931 } 932 933 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 934 935 if (s->default_nan_mode) { 936 /* Note that this check is after pickNaNMulAdd so that function 937 * has an opportunity to set the Invalid flag. 938 */ 939 which = 3; 940 } 941 942 switch (which) { 943 case 0: 944 break; 945 case 1: 946 a = b; 947 break; 948 case 2: 949 a = c; 950 break; 951 case 3: 952 return parts_default_nan(s); 953 default: 954 g_assert_not_reached(); 955 } 956 957 if (is_snan(a.cls)) { 958 return parts_silence_nan(a, s); 959 } 960 return a; 961 } 962 963 /* 964 * Returns the result of adding or subtracting the values of the 965 * floating-point values `a' and `b'. 
The operation is performed 966 * according to the IEC/IEEE Standard for Binary Floating-Point 967 * Arithmetic. 968 */ 969 970 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 971 float_status *s) 972 { 973 bool a_sign = a.sign; 974 bool b_sign = b.sign ^ subtract; 975 976 if (a_sign != b_sign) { 977 /* Subtraction */ 978 979 if (a.cls == float_class_normal && b.cls == float_class_normal) { 980 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 981 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 982 a.frac = a.frac - b.frac; 983 } else { 984 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 985 a.frac = b.frac - a.frac; 986 a.exp = b.exp; 987 a_sign ^= 1; 988 } 989 990 if (a.frac == 0) { 991 a.cls = float_class_zero; 992 a.sign = s->float_rounding_mode == float_round_down; 993 } else { 994 int shift = clz64(a.frac) - 1; 995 a.frac = a.frac << shift; 996 a.exp = a.exp - shift; 997 a.sign = a_sign; 998 } 999 return a; 1000 } 1001 if (is_nan(a.cls) || is_nan(b.cls)) { 1002 return pick_nan(a, b, s); 1003 } 1004 if (a.cls == float_class_inf) { 1005 if (b.cls == float_class_inf) { 1006 float_raise(float_flag_invalid, s); 1007 return parts_default_nan(s); 1008 } 1009 return a; 1010 } 1011 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1012 a.sign = s->float_rounding_mode == float_round_down; 1013 return a; 1014 } 1015 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1016 b.sign = a_sign ^ 1; 1017 return b; 1018 } 1019 if (b.cls == float_class_zero) { 1020 return a; 1021 } 1022 } else { 1023 /* Addition */ 1024 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1025 if (a.exp > b.exp) { 1026 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1027 } else if (a.exp < b.exp) { 1028 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1029 a.exp = b.exp; 1030 } 1031 a.frac += b.frac; 1032 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1033 shift64RightJamming(a.frac, 1, &a.frac); 1034 a.exp += 1; 
1035 } 1036 return a; 1037 } 1038 if (is_nan(a.cls) || is_nan(b.cls)) { 1039 return pick_nan(a, b, s); 1040 } 1041 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1042 return a; 1043 } 1044 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1045 b.sign = b_sign; 1046 return b; 1047 } 1048 } 1049 g_assert_not_reached(); 1050 } 1051 1052 /* 1053 * Returns the result of adding or subtracting the floating-point 1054 * values `a' and `b'. The operation is performed according to the 1055 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1056 */ 1057 1058 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1059 { 1060 FloatParts pa = float16_unpack_canonical(a, status); 1061 FloatParts pb = float16_unpack_canonical(b, status); 1062 FloatParts pr = addsub_floats(pa, pb, false, status); 1063 1064 return float16_round_pack_canonical(pr, status); 1065 } 1066 1067 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1068 { 1069 FloatParts pa = float16_unpack_canonical(a, status); 1070 FloatParts pb = float16_unpack_canonical(b, status); 1071 FloatParts pr = addsub_floats(pa, pb, true, status); 1072 1073 return float16_round_pack_canonical(pr, status); 1074 } 1075 1076 static float32 QEMU_SOFTFLOAT_ATTR 1077 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1078 { 1079 FloatParts pa = float32_unpack_canonical(a, status); 1080 FloatParts pb = float32_unpack_canonical(b, status); 1081 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1082 1083 return float32_round_pack_canonical(pr, status); 1084 } 1085 1086 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1087 { 1088 return soft_f32_addsub(a, b, false, status); 1089 } 1090 1091 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1092 { 1093 return soft_f32_addsub(a, b, true, status); 1094 } 1095 1096 static float64 QEMU_SOFTFLOAT_ATTR 1097 soft_f64_addsub(float64 
a, float64 b, bool subtract, float_status *status) 1098 { 1099 FloatParts pa = float64_unpack_canonical(a, status); 1100 FloatParts pb = float64_unpack_canonical(b, status); 1101 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1102 1103 return float64_round_pack_canonical(pr, status); 1104 } 1105 1106 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1107 { 1108 return soft_f64_addsub(a, b, false, status); 1109 } 1110 1111 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1112 { 1113 return soft_f64_addsub(a, b, true, status); 1114 } 1115 1116 static float hard_f32_add(float a, float b) 1117 { 1118 return a + b; 1119 } 1120 1121 static float hard_f32_sub(float a, float b) 1122 { 1123 return a - b; 1124 } 1125 1126 static double hard_f64_add(double a, double b) 1127 { 1128 return a + b; 1129 } 1130 1131 static double hard_f64_sub(double a, double b) 1132 { 1133 return a - b; 1134 } 1135 1136 static bool f32_addsub_post(union_float32 a, union_float32 b) 1137 { 1138 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1139 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1140 } 1141 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1142 } 1143 1144 static bool f64_addsub_post(union_float64 a, union_float64 b) 1145 { 1146 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1147 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1148 } else { 1149 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1150 } 1151 } 1152 1153 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1154 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1155 { 1156 return float32_gen2(a, b, s, hard, soft, 1157 f32_is_zon2, f32_addsub_post, NULL, NULL); 1158 } 1159 1160 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1161 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1162 { 1163 return float64_gen2(a, b, s, hard, soft, 1164 f64_is_zon2, f64_addsub_post, NULL, NULL); 1165 } 1166 1167 float32 
QEMU_FLATTEN 1168 float32_add(float32 a, float32 b, float_status *s) 1169 { 1170 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1171 } 1172 1173 float32 QEMU_FLATTEN 1174 float32_sub(float32 a, float32 b, float_status *s) 1175 { 1176 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1177 } 1178 1179 float64 QEMU_FLATTEN 1180 float64_add(float64 a, float64 b, float_status *s) 1181 { 1182 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1183 } 1184 1185 float64 QEMU_FLATTEN 1186 float64_sub(float64 a, float64 b, float_status *s) 1187 { 1188 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1189 } 1190 1191 /* 1192 * Returns the result of multiplying the floating-point values `a' and 1193 * `b'. The operation is performed according to the IEC/IEEE Standard 1194 * for Binary Floating-Point Arithmetic. 1195 */ 1196 1197 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1198 { 1199 bool sign = a.sign ^ b.sign; 1200 1201 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1202 uint64_t hi, lo; 1203 int exp = a.exp + b.exp; 1204 1205 mul64To128(a.frac, b.frac, &hi, &lo); 1206 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1207 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1208 shift64RightJamming(lo, 1, &lo); 1209 exp += 1; 1210 } 1211 1212 /* Re-use a */ 1213 a.exp = exp; 1214 a.sign = sign; 1215 a.frac = lo; 1216 return a; 1217 } 1218 /* handle all the NaN cases */ 1219 if (is_nan(a.cls) || is_nan(b.cls)) { 1220 return pick_nan(a, b, s); 1221 } 1222 /* Inf * Zero == NaN */ 1223 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1224 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1225 s->float_exception_flags |= float_flag_invalid; 1226 return parts_default_nan(s); 1227 } 1228 /* Multiply by 0 or Inf */ 1229 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1230 a.sign = sign; 1231 return a; 1232 } 1233 if (b.cls == float_class_inf || b.cls == 
float_class_zero) { 1234 b.sign = sign; 1235 return b; 1236 } 1237 g_assert_not_reached(); 1238 } 1239 1240 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1241 { 1242 FloatParts pa = float16_unpack_canonical(a, status); 1243 FloatParts pb = float16_unpack_canonical(b, status); 1244 FloatParts pr = mul_floats(pa, pb, status); 1245 1246 return float16_round_pack_canonical(pr, status); 1247 } 1248 1249 static float32 QEMU_SOFTFLOAT_ATTR 1250 soft_f32_mul(float32 a, float32 b, float_status *status) 1251 { 1252 FloatParts pa = float32_unpack_canonical(a, status); 1253 FloatParts pb = float32_unpack_canonical(b, status); 1254 FloatParts pr = mul_floats(pa, pb, status); 1255 1256 return float32_round_pack_canonical(pr, status); 1257 } 1258 1259 static float64 QEMU_SOFTFLOAT_ATTR 1260 soft_f64_mul(float64 a, float64 b, float_status *status) 1261 { 1262 FloatParts pa = float64_unpack_canonical(a, status); 1263 FloatParts pb = float64_unpack_canonical(b, status); 1264 FloatParts pr = mul_floats(pa, pb, status); 1265 1266 return float64_round_pack_canonical(pr, status); 1267 } 1268 1269 static float hard_f32_mul(float a, float b) 1270 { 1271 return a * b; 1272 } 1273 1274 static double hard_f64_mul(double a, double b) 1275 { 1276 return a * b; 1277 } 1278 1279 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1280 { 1281 return float32_is_zero(a.s) || float32_is_zero(b.s); 1282 } 1283 1284 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1285 { 1286 return float64_is_zero(a.s) || float64_is_zero(b.s); 1287 } 1288 1289 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1290 { 1291 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1292 1293 return float32_set_sign(float32_zero, signbit); 1294 } 1295 1296 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1297 { 1298 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1299 1300 return float64_set_sign(float64_zero, signbit); 
1301 } 1302 1303 float32 QEMU_FLATTEN 1304 float32_mul(float32 a, float32 b, float_status *s) 1305 { 1306 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1307 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1308 } 1309 1310 float64 QEMU_FLATTEN 1311 float64_mul(float64 a, float64 b, float_status *s) 1312 { 1313 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1314 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1315 } 1316 1317 /* 1318 * Returns the result of multiplying the floating-point values `a' and 1319 * `b' then adding 'c', with no intermediate rounding step after the 1320 * multiplication. The operation is performed according to the 1321 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1322 * The flags argument allows the caller to select negation of the 1323 * addend, the intermediate product, or the final result. (The 1324 * difference between this and having the caller do a separate 1325 * negation is that negating externally will flip the sign bit on 1326 * NaNs.) 1327 */ 1328 1329 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1330 int flags, float_status *s) 1331 { 1332 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1333 ((1 << float_class_inf) | (1 << float_class_zero)); 1334 bool p_sign; 1335 bool sign_flip = flags & float_muladd_negate_result; 1336 FloatClass p_class; 1337 uint64_t hi, lo; 1338 int p_exp; 1339 1340 /* It is implementation-defined whether the cases of (0,inf,qnan) 1341 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1342 * they return if they do), so we have to hand this information 1343 * off to the target-specific pick-a-NaN routine. 
1344 */ 1345 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1346 return pick_nan_muladd(a, b, c, inf_zero, s); 1347 } 1348 1349 if (inf_zero) { 1350 s->float_exception_flags |= float_flag_invalid; 1351 return parts_default_nan(s); 1352 } 1353 1354 if (flags & float_muladd_negate_c) { 1355 c.sign ^= 1; 1356 } 1357 1358 p_sign = a.sign ^ b.sign; 1359 1360 if (flags & float_muladd_negate_product) { 1361 p_sign ^= 1; 1362 } 1363 1364 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1365 p_class = float_class_inf; 1366 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1367 p_class = float_class_zero; 1368 } else { 1369 p_class = float_class_normal; 1370 } 1371 1372 if (c.cls == float_class_inf) { 1373 if (p_class == float_class_inf && p_sign != c.sign) { 1374 s->float_exception_flags |= float_flag_invalid; 1375 return parts_default_nan(s); 1376 } else { 1377 a.cls = float_class_inf; 1378 a.sign = c.sign ^ sign_flip; 1379 return a; 1380 } 1381 } 1382 1383 if (p_class == float_class_inf) { 1384 a.cls = float_class_inf; 1385 a.sign = p_sign ^ sign_flip; 1386 return a; 1387 } 1388 1389 if (p_class == float_class_zero) { 1390 if (c.cls == float_class_zero) { 1391 if (p_sign != c.sign) { 1392 p_sign = s->float_rounding_mode == float_round_down; 1393 } 1394 c.sign = p_sign; 1395 } else if (flags & float_muladd_halve_result) { 1396 c.exp -= 1; 1397 } 1398 c.sign ^= sign_flip; 1399 return c; 1400 } 1401 1402 /* a & b should be normals now... */ 1403 assert(a.cls == float_class_normal && 1404 b.cls == float_class_normal); 1405 1406 p_exp = a.exp + b.exp; 1407 1408 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1409 * result. 
1410 */ 1411 mul64To128(a.frac, b.frac, &hi, &lo); 1412 /* binary point now at bit 124 */ 1413 1414 /* check for overflow */ 1415 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1416 shift128RightJamming(hi, lo, 1, &hi, &lo); 1417 p_exp += 1; 1418 } 1419 1420 /* + add/sub */ 1421 if (c.cls == float_class_zero) { 1422 /* move binary point back to 62 */ 1423 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1424 } else { 1425 int exp_diff = p_exp - c.exp; 1426 if (p_sign == c.sign) { 1427 /* Addition */ 1428 if (exp_diff <= 0) { 1429 shift128RightJamming(hi, lo, 1430 DECOMPOSED_BINARY_POINT - exp_diff, 1431 &hi, &lo); 1432 lo += c.frac; 1433 p_exp = c.exp; 1434 } else { 1435 uint64_t c_hi, c_lo; 1436 /* shift c to the same binary point as the product (124) */ 1437 c_hi = c.frac >> 2; 1438 c_lo = 0; 1439 shift128RightJamming(c_hi, c_lo, 1440 exp_diff, 1441 &c_hi, &c_lo); 1442 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1443 /* move binary point back to 62 */ 1444 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1445 } 1446 1447 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1448 shift64RightJamming(lo, 1, &lo); 1449 p_exp += 1; 1450 } 1451 1452 } else { 1453 /* Subtraction */ 1454 uint64_t c_hi, c_lo; 1455 /* make C binary point match product at bit 124 */ 1456 c_hi = c.frac >> 2; 1457 c_lo = 0; 1458 1459 if (exp_diff <= 0) { 1460 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1461 if (exp_diff == 0 1462 && 1463 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1465 } else { 1466 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1467 p_sign ^= 1; 1468 p_exp = c.exp; 1469 } 1470 } else { 1471 shift128RightJamming(c_hi, c_lo, 1472 exp_diff, 1473 &c_hi, &c_lo); 1474 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1475 } 1476 1477 if (hi == 0 && lo == 0) { 1478 a.cls = float_class_zero; 1479 a.sign = s->float_rounding_mode == float_round_down; 1480 a.sign ^= sign_flip; 1481 return a; 1482 } else { 1483 int shift; 1484 if (hi 
!= 0) { 1485 shift = clz64(hi); 1486 } else { 1487 shift = clz64(lo) + 64; 1488 } 1489 /* Normalizing to a binary point of 124 is the 1490 correct adjust for the exponent. However since we're 1491 shifting, we might as well put the binary point back 1492 at 62 where we really want it. Therefore shift as 1493 if we're leaving 1 bit at the top of the word, but 1494 adjust the exponent as if we're leaving 3 bits. */ 1495 shift -= 1; 1496 if (shift >= 64) { 1497 lo = lo << (shift - 64); 1498 } else { 1499 hi = (hi << shift) | (lo >> (64 - shift)); 1500 lo = hi | ((lo << shift) != 0); 1501 } 1502 p_exp -= shift - 2; 1503 } 1504 } 1505 } 1506 1507 if (flags & float_muladd_halve_result) { 1508 p_exp -= 1; 1509 } 1510 1511 /* finally prepare our result */ 1512 a.cls = float_class_normal; 1513 a.sign = p_sign ^ sign_flip; 1514 a.exp = p_exp; 1515 a.frac = lo; 1516 1517 return a; 1518 } 1519 1520 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1521 int flags, float_status *status) 1522 { 1523 FloatParts pa = float16_unpack_canonical(a, status); 1524 FloatParts pb = float16_unpack_canonical(b, status); 1525 FloatParts pc = float16_unpack_canonical(c, status); 1526 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1527 1528 return float16_round_pack_canonical(pr, status); 1529 } 1530 1531 static float32 QEMU_SOFTFLOAT_ATTR 1532 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1533 float_status *status) 1534 { 1535 FloatParts pa = float32_unpack_canonical(a, status); 1536 FloatParts pb = float32_unpack_canonical(b, status); 1537 FloatParts pc = float32_unpack_canonical(c, status); 1538 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1539 1540 return float32_round_pack_canonical(pr, status); 1541 } 1542 1543 static float64 QEMU_SOFTFLOAT_ATTR 1544 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1545 float_status *status) 1546 { 1547 FloatParts pa = float64_unpack_canonical(a, status); 1548 FloatParts pb = 
float64_unpack_canonical(b, status); 1549 FloatParts pc = float64_unpack_canonical(c, status); 1550 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1551 1552 return float64_round_pack_canonical(pr, status); 1553 } 1554 1555 static bool force_soft_fma; 1556 1557 float32 QEMU_FLATTEN 1558 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1559 { 1560 union_float32 ua, ub, uc, ur; 1561 1562 ua.s = xa; 1563 ub.s = xb; 1564 uc.s = xc; 1565 1566 if (unlikely(!can_use_fpu(s))) { 1567 goto soft; 1568 } 1569 if (unlikely(flags & float_muladd_halve_result)) { 1570 goto soft; 1571 } 1572 1573 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1574 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1575 goto soft; 1576 } 1577 1578 if (unlikely(force_soft_fma)) { 1579 goto soft; 1580 } 1581 1582 /* 1583 * When (a || b) == 0, there's no need to check for under/over flow, 1584 * since we know the addend is (normal || 0) and the product is 0. 1585 */ 1586 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1587 union_float32 up; 1588 bool prod_sign; 1589 1590 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1591 prod_sign ^= !!(flags & float_muladd_negate_product); 1592 up.s = float32_set_sign(float32_zero, prod_sign); 1593 1594 if (flags & float_muladd_negate_c) { 1595 uc.h = -uc.h; 1596 } 1597 ur.h = up.h + uc.h; 1598 } else { 1599 if (flags & float_muladd_negate_product) { 1600 ua.h = -ua.h; 1601 } 1602 if (flags & float_muladd_negate_c) { 1603 uc.h = -uc.h; 1604 } 1605 1606 ur.h = fmaf(ua.h, ub.h, uc.h); 1607 1608 if (unlikely(f32_is_inf(ur))) { 1609 s->float_exception_flags |= float_flag_overflow; 1610 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1611 goto soft; 1612 } 1613 } 1614 if (flags & float_muladd_negate_result) { 1615 return float32_chs(ur.s); 1616 } 1617 return ur.s; 1618 1619 soft: 1620 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1621 } 1622 1623 float64 QEMU_FLATTEN 1624 float64_muladd(float64 xa, float64 xb, float64 xc, 
int flags, float_status *s) 1625 { 1626 union_float64 ua, ub, uc, ur; 1627 1628 ua.s = xa; 1629 ub.s = xb; 1630 uc.s = xc; 1631 1632 if (unlikely(!can_use_fpu(s))) { 1633 goto soft; 1634 } 1635 if (unlikely(flags & float_muladd_halve_result)) { 1636 goto soft; 1637 } 1638 1639 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1640 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1641 goto soft; 1642 } 1643 1644 if (unlikely(force_soft_fma)) { 1645 goto soft; 1646 } 1647 1648 /* 1649 * When (a || b) == 0, there's no need to check for under/over flow, 1650 * since we know the addend is (normal || 0) and the product is 0. 1651 */ 1652 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1653 union_float64 up; 1654 bool prod_sign; 1655 1656 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1657 prod_sign ^= !!(flags & float_muladd_negate_product); 1658 up.s = float64_set_sign(float64_zero, prod_sign); 1659 1660 if (flags & float_muladd_negate_c) { 1661 uc.h = -uc.h; 1662 } 1663 ur.h = up.h + uc.h; 1664 } else { 1665 if (flags & float_muladd_negate_product) { 1666 ua.h = -ua.h; 1667 } 1668 if (flags & float_muladd_negate_c) { 1669 uc.h = -uc.h; 1670 } 1671 1672 ur.h = fma(ua.h, ub.h, uc.h); 1673 1674 if (unlikely(f64_is_inf(ur))) { 1675 s->float_exception_flags |= float_flag_overflow; 1676 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1677 goto soft; 1678 } 1679 } 1680 if (flags & float_muladd_negate_result) { 1681 return float64_chs(ur.s); 1682 } 1683 return ur.s; 1684 1685 soft: 1686 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1687 } 1688 1689 /* 1690 * Returns the result of dividing the floating-point value `a' by the 1691 * corresponding value `b'. The operation is performed according to 1692 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1693 */ 1694 1695 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1696 { 1697 bool sign = a.sign ^ b.sign; 1698 1699 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1700 uint64_t n0, n1, q, r; 1701 int exp = a.exp - b.exp; 1702 1703 /* 1704 * We want a 2*N / N-bit division to produce exactly an N-bit 1705 * result, so that we do not lose any precision and so that we 1706 * do not have to renormalize afterward. If A.frac < B.frac, 1707 * then division would produce an (N-1)-bit result; shift A left 1708 * by one to produce the an N-bit result, and decrement the 1709 * exponent to match. 1710 * 1711 * The udiv_qrnnd algorithm that we're using requires normalization, 1712 * i.e. the msb of the denominator must be set. Since we know that 1713 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1714 * by one (more), and the remainder must be shifted right by one. 1715 */ 1716 if (a.frac < b.frac) { 1717 exp -= 1; 1718 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1719 } else { 1720 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1721 } 1722 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1723 1724 /* 1725 * Set lsb if there is a remainder, to set inexact. 1726 * As mentioned above, to find the actual value of the remainder we 1727 * would need to shift right, but (1) we are only concerned about 1728 * non-zero-ness, and (2) the remainder will always be even because 1729 * both inputs to the division primitive are even. 
1730 */ 1731 a.frac = q | (r != 0); 1732 a.sign = sign; 1733 a.exp = exp; 1734 return a; 1735 } 1736 /* handle all the NaN cases */ 1737 if (is_nan(a.cls) || is_nan(b.cls)) { 1738 return pick_nan(a, b, s); 1739 } 1740 /* 0/0 or Inf/Inf */ 1741 if (a.cls == b.cls 1742 && 1743 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1744 s->float_exception_flags |= float_flag_invalid; 1745 return parts_default_nan(s); 1746 } 1747 /* Inf / x or 0 / x */ 1748 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1749 a.sign = sign; 1750 return a; 1751 } 1752 /* Div 0 => Inf */ 1753 if (b.cls == float_class_zero) { 1754 s->float_exception_flags |= float_flag_divbyzero; 1755 a.cls = float_class_inf; 1756 a.sign = sign; 1757 return a; 1758 } 1759 /* Div by Inf */ 1760 if (b.cls == float_class_inf) { 1761 a.cls = float_class_zero; 1762 a.sign = sign; 1763 return a; 1764 } 1765 g_assert_not_reached(); 1766 } 1767 1768 float16 float16_div(float16 a, float16 b, float_status *status) 1769 { 1770 FloatParts pa = float16_unpack_canonical(a, status); 1771 FloatParts pb = float16_unpack_canonical(b, status); 1772 FloatParts pr = div_floats(pa, pb, status); 1773 1774 return float16_round_pack_canonical(pr, status); 1775 } 1776 1777 static float32 QEMU_SOFTFLOAT_ATTR 1778 soft_f32_div(float32 a, float32 b, float_status *status) 1779 { 1780 FloatParts pa = float32_unpack_canonical(a, status); 1781 FloatParts pb = float32_unpack_canonical(b, status); 1782 FloatParts pr = div_floats(pa, pb, status); 1783 1784 return float32_round_pack_canonical(pr, status); 1785 } 1786 1787 static float64 QEMU_SOFTFLOAT_ATTR 1788 soft_f64_div(float64 a, float64 b, float_status *status) 1789 { 1790 FloatParts pa = float64_unpack_canonical(a, status); 1791 FloatParts pb = float64_unpack_canonical(b, status); 1792 FloatParts pr = div_floats(pa, pb, status); 1793 1794 return float64_round_pack_canonical(pr, status); 1795 } 1796 1797 static float hard_f32_div(float a, float b) 1798 { 1799 return a 
/ b; 1800 } 1801 1802 static double hard_f64_div(double a, double b) 1803 { 1804 return a / b; 1805 } 1806 1807 static bool f32_div_pre(union_float32 a, union_float32 b) 1808 { 1809 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1810 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1811 fpclassify(b.h) == FP_NORMAL; 1812 } 1813 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1814 } 1815 1816 static bool f64_div_pre(union_float64 a, union_float64 b) 1817 { 1818 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1819 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1820 fpclassify(b.h) == FP_NORMAL; 1821 } 1822 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1823 } 1824 1825 static bool f32_div_post(union_float32 a, union_float32 b) 1826 { 1827 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1828 return fpclassify(a.h) != FP_ZERO; 1829 } 1830 return !float32_is_zero(a.s); 1831 } 1832 1833 static bool f64_div_post(union_float64 a, union_float64 b) 1834 { 1835 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1836 return fpclassify(a.h) != FP_ZERO; 1837 } 1838 return !float64_is_zero(a.s); 1839 } 1840 1841 float32 QEMU_FLATTEN 1842 float32_div(float32 a, float32 b, float_status *s) 1843 { 1844 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1845 f32_div_pre, f32_div_post, NULL, NULL); 1846 } 1847 1848 float64 QEMU_FLATTEN 1849 float64_div(float64 a, float64 b, float_status *s) 1850 { 1851 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1852 f64_div_pre, f64_div_post, NULL, NULL); 1853 } 1854 1855 /* 1856 * Float to Float conversions 1857 * 1858 * Returns the result of converting one float format to another. The 1859 * conversion is performed according to the IEC/IEEE Standard for 1860 * Binary Floating-Point Arithmetic. 1861 * 1862 * The float_to_float helper only needs to take care of raising 1863 * invalid exceptions and handling the conversion on NaNs. 
1864 */ 1865 1866 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1867 float_status *s) 1868 { 1869 if (dstf->arm_althp) { 1870 switch (a.cls) { 1871 case float_class_qnan: 1872 case float_class_snan: 1873 /* There is no NaN in the destination format. Raise Invalid 1874 * and return a zero with the sign of the input NaN. 1875 */ 1876 s->float_exception_flags |= float_flag_invalid; 1877 a.cls = float_class_zero; 1878 a.frac = 0; 1879 a.exp = 0; 1880 break; 1881 1882 case float_class_inf: 1883 /* There is no Inf in the destination format. Raise Invalid 1884 * and return the maximum normal with the correct sign. 1885 */ 1886 s->float_exception_flags |= float_flag_invalid; 1887 a.cls = float_class_normal; 1888 a.exp = dstf->exp_max; 1889 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1890 break; 1891 1892 default: 1893 break; 1894 } 1895 } else if (is_nan(a.cls)) { 1896 if (is_snan(a.cls)) { 1897 s->float_exception_flags |= float_flag_invalid; 1898 a = parts_silence_nan(a, s); 1899 } 1900 if (s->default_nan_mode) { 1901 return parts_default_nan(s); 1902 } 1903 } 1904 return a; 1905 } 1906 1907 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1908 { 1909 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1910 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1911 FloatParts pr = float_to_float(p, &float32_params, s); 1912 return float32_round_pack_canonical(pr, s); 1913 } 1914 1915 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1916 { 1917 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1918 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1919 FloatParts pr = float_to_float(p, &float64_params, s); 1920 return float64_round_pack_canonical(pr, s); 1921 } 1922 1923 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1924 { 1925 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1926 FloatParts p = float32_unpack_canonical(a, s); 1927 FloatParts pr = float_to_float(p, fmt16, s); 1928 return float16a_round_pack_canonical(pr, s, fmt16); 1929 } 1930 1931 float64 float32_to_float64(float32 a, float_status *s) 1932 { 1933 FloatParts p = float32_unpack_canonical(a, s); 1934 FloatParts pr = float_to_float(p, &float64_params, s); 1935 return float64_round_pack_canonical(pr, s); 1936 } 1937 1938 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1939 { 1940 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1941 FloatParts p = float64_unpack_canonical(a, s); 1942 FloatParts pr = float_to_float(p, fmt16, s); 1943 return float16a_round_pack_canonical(pr, s, fmt16); 1944 } 1945 1946 float32 float64_to_float32(float64 a, float_status *s) 1947 { 1948 FloatParts p = float64_unpack_canonical(a, s); 1949 FloatParts pr = float_to_float(p, &float32_params, s); 1950 return float32_round_pack_canonical(pr, s); 1951 } 1952 1953 /* 1954 * Rounds the floating-point value `a' to an integer, and returns the 1955 * result as a floating-point value. The operation is performed 1956 * according to the IEC/IEEE Standard for Binary Floating-Point 1957 * Arithmetic. 
1958 */ 1959 1960 static FloatParts round_to_int(FloatParts a, int rmode, 1961 int scale, float_status *s) 1962 { 1963 switch (a.cls) { 1964 case float_class_qnan: 1965 case float_class_snan: 1966 return return_nan(a, s); 1967 1968 case float_class_zero: 1969 case float_class_inf: 1970 /* already "integral" */ 1971 break; 1972 1973 case float_class_normal: 1974 scale = MIN(MAX(scale, -0x10000), 0x10000); 1975 a.exp += scale; 1976 1977 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1978 /* already integral */ 1979 break; 1980 } 1981 if (a.exp < 0) { 1982 bool one; 1983 /* all fractional */ 1984 s->float_exception_flags |= float_flag_inexact; 1985 switch (rmode) { 1986 case float_round_nearest_even: 1987 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1988 break; 1989 case float_round_ties_away: 1990 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1991 break; 1992 case float_round_to_zero: 1993 one = false; 1994 break; 1995 case float_round_up: 1996 one = !a.sign; 1997 break; 1998 case float_round_down: 1999 one = a.sign; 2000 break; 2001 case float_round_to_odd: 2002 one = true; 2003 break; 2004 default: 2005 g_assert_not_reached(); 2006 } 2007 2008 if (one) { 2009 a.frac = DECOMPOSED_IMPLICIT_BIT; 2010 a.exp = 0; 2011 } else { 2012 a.cls = float_class_zero; 2013 } 2014 } else { 2015 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2016 uint64_t frac_lsbm1 = frac_lsb >> 1; 2017 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2018 uint64_t rnd_mask = rnd_even_mask >> 1; 2019 uint64_t inc; 2020 2021 switch (rmode) { 2022 case float_round_nearest_even: 2023 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2024 break; 2025 case float_round_ties_away: 2026 inc = frac_lsbm1; 2027 break; 2028 case float_round_to_zero: 2029 inc = 0; 2030 break; 2031 case float_round_up: 2032 inc = a.sign ? 0 : rnd_mask; 2033 break; 2034 case float_round_down: 2035 inc = a.sign ? 
rnd_mask : 0; 2036 break; 2037 case float_round_to_odd: 2038 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2039 break; 2040 default: 2041 g_assert_not_reached(); 2042 } 2043 2044 if (a.frac & rnd_mask) { 2045 s->float_exception_flags |= float_flag_inexact; 2046 a.frac += inc; 2047 a.frac &= ~rnd_mask; 2048 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2049 a.frac >>= 1; 2050 a.exp++; 2051 } 2052 } 2053 } 2054 break; 2055 default: 2056 g_assert_not_reached(); 2057 } 2058 return a; 2059 } 2060 2061 float16 float16_round_to_int(float16 a, float_status *s) 2062 { 2063 FloatParts pa = float16_unpack_canonical(a, s); 2064 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2065 return float16_round_pack_canonical(pr, s); 2066 } 2067 2068 float32 float32_round_to_int(float32 a, float_status *s) 2069 { 2070 FloatParts pa = float32_unpack_canonical(a, s); 2071 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2072 return float32_round_pack_canonical(pr, s); 2073 } 2074 2075 float64 float64_round_to_int(float64 a, float_status *s) 2076 { 2077 FloatParts pa = float64_unpack_canonical(a, s); 2078 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2079 return float64_round_pack_canonical(pr, s); 2080 } 2081 2082 /* 2083 * Returns the result of converting the floating-point value `a' to 2084 * the two's complement integer format. The conversion is performed 2085 * according to the IEC/IEEE Standard for Binary Floating-Point 2086 * Arithmetic---which means in particular that the conversion is 2087 * rounded according to the current rounding mode. If `a' is a NaN, 2088 * the largest positive integer is returned. Otherwise, if the 2089 * conversion overflows, the largest integer with the same sign as `a' 2090 * is returned. 
2091 */ 2092 2093 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2094 int64_t min, int64_t max, 2095 float_status *s) 2096 { 2097 uint64_t r; 2098 int orig_flags = get_float_exception_flags(s); 2099 FloatParts p = round_to_int(in, rmode, scale, s); 2100 2101 switch (p.cls) { 2102 case float_class_snan: 2103 case float_class_qnan: 2104 s->float_exception_flags = orig_flags | float_flag_invalid; 2105 return max; 2106 case float_class_inf: 2107 s->float_exception_flags = orig_flags | float_flag_invalid; 2108 return p.sign ? min : max; 2109 case float_class_zero: 2110 return 0; 2111 case float_class_normal: 2112 if (p.exp < DECOMPOSED_BINARY_POINT) { 2113 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2114 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2115 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2116 } else { 2117 r = UINT64_MAX; 2118 } 2119 if (p.sign) { 2120 if (r <= -(uint64_t) min) { 2121 return -r; 2122 } else { 2123 s->float_exception_flags = orig_flags | float_flag_invalid; 2124 return min; 2125 } 2126 } else { 2127 if (r <= max) { 2128 return r; 2129 } else { 2130 s->float_exception_flags = orig_flags | float_flag_invalid; 2131 return max; 2132 } 2133 } 2134 default: 2135 g_assert_not_reached(); 2136 } 2137 } 2138 2139 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2140 float_status *s) 2141 { 2142 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2143 rmode, scale, INT16_MIN, INT16_MAX, s); 2144 } 2145 2146 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2147 float_status *s) 2148 { 2149 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2150 rmode, scale, INT32_MIN, INT32_MAX, s); 2151 } 2152 2153 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2154 float_status *s) 2155 { 2156 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2157 rmode, scale, INT64_MIN, INT64_MAX, s); 2158 } 2159 2160 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 2161 float_status *s) 2162 { 2163 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2164 rmode, scale, INT16_MIN, INT16_MAX, s); 2165 } 2166 2167 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2168 float_status *s) 2169 { 2170 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2171 rmode, scale, INT32_MIN, INT32_MAX, s); 2172 } 2173 2174 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2175 float_status *s) 2176 { 2177 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2178 rmode, scale, INT64_MIN, INT64_MAX, s); 2179 } 2180 2181 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2182 float_status *s) 2183 { 2184 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2185 rmode, scale, INT16_MIN, INT16_MAX, s); 2186 } 2187 2188 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2189 float_status *s) 2190 { 2191 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2192 rmode, scale, INT32_MIN, INT32_MAX, s); 2193 } 2194 2195 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2196 float_status *s) 2197 { 2198 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2199 rmode, scale, INT64_MIN, INT64_MAX, s); 2200 } 2201 2202 int16_t float16_to_int16(float16 a, float_status *s) 2203 { 2204 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2205 } 2206 2207 int32_t float16_to_int32(float16 a, float_status *s) 2208 { 2209 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2210 } 2211 2212 int64_t float16_to_int64(float16 a, float_status *s) 2213 { 2214 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2215 } 2216 2217 int16_t float32_to_int16(float32 a, float_status *s) 2218 { 2219 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2220 } 2221 2222 int32_t float32_to_int32(float32 a, float_status *s) 2223 { 2224 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2225 } 2226 2227 int64_t float32_to_int64(float32 a, float_status *s) 2228 { 2229 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2230 } 2231 2232 int16_t float64_to_int16(float64 a, float_status *s) 2233 { 2234 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2235 } 2236 2237 int32_t float64_to_int32(float64 a, float_status *s) 2238 { 2239 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2240 } 2241 2242 int64_t float64_to_int64(float64 a, float_status *s) 2243 { 2244 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2245 } 2246 2247 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2248 { 2249 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2250 } 2251 2252 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2253 { 2254 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2255 } 2256 2257 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2258 { 2259 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2260 } 2261 2262 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2263 { 2264 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2265 } 2266 2267 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2268 { 2269 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2270 } 2271 2272 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2273 { 2274 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2275 } 2276 2277 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2278 { 2279 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2280 } 2281 2282 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2283 { 2284 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2285 } 2286 2287 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2288 { 2289 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2290 } 2291 2292 /* 2293 * Returns the result of converting the floating-point value `a' to 2294 * the unsigned integer format. The conversion is performed according 2295 * to the IEC/IEEE Standard for Binary Floating-Point 2296 * Arithmetic---which means in particular that the conversion is 2297 * rounded according to the current rounding mode. If `a' is a NaN, 2298 * the largest unsigned integer is returned. Otherwise, if the 2299 * conversion overflows, the largest unsigned integer is returned. If 2300 * the 'a' is negative, the result is rounded and zero is returned; 2301 * values that do not round to zero will raise the inexact exception 2302 * flag. 2303 */ 2304 2305 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2306 uint64_t max, float_status *s) 2307 { 2308 int orig_flags = get_float_exception_flags(s); 2309 FloatParts p = round_to_int(in, rmode, scale, s); 2310 uint64_t r; 2311 2312 switch (p.cls) { 2313 case float_class_snan: 2314 case float_class_qnan: 2315 s->float_exception_flags = orig_flags | float_flag_invalid; 2316 return max; 2317 case float_class_inf: 2318 s->float_exception_flags = orig_flags | float_flag_invalid; 2319 return p.sign ? 0 : max; 2320 case float_class_zero: 2321 return 0; 2322 case float_class_normal: 2323 if (p.sign) { 2324 s->float_exception_flags = orig_flags | float_flag_invalid; 2325 return 0; 2326 } 2327 2328 if (p.exp < DECOMPOSED_BINARY_POINT) { 2329 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2330 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2331 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2332 } else { 2333 s->float_exception_flags = orig_flags | float_flag_invalid; 2334 return max; 2335 } 2336 2337 /* For uint64 this will never trip, but if p.exp is too large 2338 * to shift a decomposed fraction we shall have exited via the 2339 * 3rd leg above. 
2340 */ 2341 if (r > max) { 2342 s->float_exception_flags = orig_flags | float_flag_invalid; 2343 return max; 2344 } 2345 return r; 2346 default: 2347 g_assert_not_reached(); 2348 } 2349 } 2350 2351 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2352 float_status *s) 2353 { 2354 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2355 rmode, scale, UINT16_MAX, s); 2356 } 2357 2358 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2359 float_status *s) 2360 { 2361 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2362 rmode, scale, UINT32_MAX, s); 2363 } 2364 2365 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2366 float_status *s) 2367 { 2368 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2369 rmode, scale, UINT64_MAX, s); 2370 } 2371 2372 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2373 float_status *s) 2374 { 2375 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2376 rmode, scale, UINT16_MAX, s); 2377 } 2378 2379 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2380 float_status *s) 2381 { 2382 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2383 rmode, scale, UINT32_MAX, s); 2384 } 2385 2386 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2387 float_status *s) 2388 { 2389 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2390 rmode, scale, UINT64_MAX, s); 2391 } 2392 2393 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2394 float_status *s) 2395 { 2396 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2397 rmode, scale, UINT16_MAX, s); 2398 } 2399 2400 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2401 float_status *s) 2402 { 2403 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2404 rmode, scale, UINT32_MAX, s); 2405 } 2406 2407 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2408 
float_status *s) 2409 { 2410 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2411 rmode, scale, UINT64_MAX, s); 2412 } 2413 2414 uint16_t float16_to_uint16(float16 a, float_status *s) 2415 { 2416 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2417 } 2418 2419 uint32_t float16_to_uint32(float16 a, float_status *s) 2420 { 2421 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2422 } 2423 2424 uint64_t float16_to_uint64(float16 a, float_status *s) 2425 { 2426 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2427 } 2428 2429 uint16_t float32_to_uint16(float32 a, float_status *s) 2430 { 2431 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2432 } 2433 2434 uint32_t float32_to_uint32(float32 a, float_status *s) 2435 { 2436 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2437 } 2438 2439 uint64_t float32_to_uint64(float32 a, float_status *s) 2440 { 2441 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2442 } 2443 2444 uint16_t float64_to_uint16(float64 a, float_status *s) 2445 { 2446 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2447 } 2448 2449 uint32_t float64_to_uint32(float64 a, float_status *s) 2450 { 2451 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2452 } 2453 2454 uint64_t float64_to_uint64(float64 a, float_status *s) 2455 { 2456 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2457 } 2458 2459 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2460 { 2461 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2462 } 2463 2464 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2465 { 2466 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2467 } 2468 2469 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2470 { 2471 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2472 } 2473 2474 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2475 { 2476 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2477 } 2478 2479 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2480 { 2481 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2482 } 2483 2484 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2485 { 2486 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2487 } 2488 2489 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2490 { 2491 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2492 } 2493 2494 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2495 { 2496 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2497 } 2498 2499 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2500 { 2501 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2502 } 2503 2504 /* 2505 * Integer to float conversions 2506 * 2507 * Returns the result of converting the two's complement integer `a' 2508 * to the floating-point format. The conversion is performed according 2509 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2510 */ 2511 2512 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2513 { 2514 FloatParts r = { .sign = false }; 2515 2516 if (a == 0) { 2517 r.cls = float_class_zero; 2518 } else { 2519 uint64_t f = a; 2520 int shift; 2521 2522 r.cls = float_class_normal; 2523 if (a < 0) { 2524 f = -f; 2525 r.sign = true; 2526 } 2527 shift = clz64(f) - 1; 2528 scale = MIN(MAX(scale, -0x10000), 0x10000); 2529 2530 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2531 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2532 } 2533 2534 return r; 2535 } 2536 2537 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2538 { 2539 FloatParts pa = int_to_float(a, scale, status); 2540 return float16_round_pack_canonical(pa, status); 2541 } 2542 2543 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2544 { 2545 return int64_to_float16_scalbn(a, scale, status); 2546 } 2547 2548 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2549 { 2550 return int64_to_float16_scalbn(a, scale, status); 2551 } 2552 2553 float16 int64_to_float16(int64_t a, float_status *status) 2554 { 2555 return int64_to_float16_scalbn(a, 0, status); 2556 } 2557 2558 float16 int32_to_float16(int32_t a, float_status *status) 2559 { 2560 return int64_to_float16_scalbn(a, 0, status); 2561 } 2562 2563 float16 int16_to_float16(int16_t a, float_status *status) 2564 { 2565 return int64_to_float16_scalbn(a, 0, status); 2566 } 2567 2568 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2569 { 2570 FloatParts pa = int_to_float(a, scale, status); 2571 return float32_round_pack_canonical(pa, status); 2572 } 2573 2574 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2575 { 2576 return int64_to_float32_scalbn(a, scale, status); 2577 } 2578 2579 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2580 { 2581 return int64_to_float32_scalbn(a, scale, status); 2582 } 2583 2584 float32 int64_to_float32(int64_t a, float_status *status) 2585 { 2586 return int64_to_float32_scalbn(a, 0, status); 2587 } 2588 2589 float32 int32_to_float32(int32_t a, float_status *status) 2590 { 2591 return int64_to_float32_scalbn(a, 0, status); 2592 } 2593 2594 float32 int16_to_float32(int16_t a, float_status *status) 2595 { 2596 return int64_to_float32_scalbn(a, 0, status); 2597 } 2598 2599 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2600 { 
2601 FloatParts pa = int_to_float(a, scale, status); 2602 return float64_round_pack_canonical(pa, status); 2603 } 2604 2605 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2606 { 2607 return int64_to_float64_scalbn(a, scale, status); 2608 } 2609 2610 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2611 { 2612 return int64_to_float64_scalbn(a, scale, status); 2613 } 2614 2615 float64 int64_to_float64(int64_t a, float_status *status) 2616 { 2617 return int64_to_float64_scalbn(a, 0, status); 2618 } 2619 2620 float64 int32_to_float64(int32_t a, float_status *status) 2621 { 2622 return int64_to_float64_scalbn(a, 0, status); 2623 } 2624 2625 float64 int16_to_float64(int16_t a, float_status *status) 2626 { 2627 return int64_to_float64_scalbn(a, 0, status); 2628 } 2629 2630 2631 /* 2632 * Unsigned Integer to float conversions 2633 * 2634 * Returns the result of converting the unsigned integer `a' to the 2635 * floating-point format. The conversion is performed according to the 2636 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2637 */ 2638 2639 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2640 { 2641 FloatParts r = { .sign = false }; 2642 2643 if (a == 0) { 2644 r.cls = float_class_zero; 2645 } else { 2646 scale = MIN(MAX(scale, -0x10000), 0x10000); 2647 r.cls = float_class_normal; 2648 if ((int64_t)a < 0) { 2649 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2650 shift64RightJamming(a, 1, &a); 2651 r.frac = a; 2652 } else { 2653 int shift = clz64(a) - 1; 2654 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2655 r.frac = a << shift; 2656 } 2657 } 2658 2659 return r; 2660 } 2661 2662 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2663 { 2664 FloatParts pa = uint_to_float(a, scale, status); 2665 return float16_round_pack_canonical(pa, status); 2666 } 2667 2668 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2669 { 2670 return uint64_to_float16_scalbn(a, scale, status); 2671 } 2672 2673 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2674 { 2675 return uint64_to_float16_scalbn(a, scale, status); 2676 } 2677 2678 float16 uint64_to_float16(uint64_t a, float_status *status) 2679 { 2680 return uint64_to_float16_scalbn(a, 0, status); 2681 } 2682 2683 float16 uint32_to_float16(uint32_t a, float_status *status) 2684 { 2685 return uint64_to_float16_scalbn(a, 0, status); 2686 } 2687 2688 float16 uint16_to_float16(uint16_t a, float_status *status) 2689 { 2690 return uint64_to_float16_scalbn(a, 0, status); 2691 } 2692 2693 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2694 { 2695 FloatParts pa = uint_to_float(a, scale, status); 2696 return float32_round_pack_canonical(pa, status); 2697 } 2698 2699 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2700 { 2701 return uint64_to_float32_scalbn(a, scale, status); 2702 } 2703 2704 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2705 { 2706 return 
uint64_to_float32_scalbn(a, scale, status); 2707 } 2708 2709 float32 uint64_to_float32(uint64_t a, float_status *status) 2710 { 2711 return uint64_to_float32_scalbn(a, 0, status); 2712 } 2713 2714 float32 uint32_to_float32(uint32_t a, float_status *status) 2715 { 2716 return uint64_to_float32_scalbn(a, 0, status); 2717 } 2718 2719 float32 uint16_to_float32(uint16_t a, float_status *status) 2720 { 2721 return uint64_to_float32_scalbn(a, 0, status); 2722 } 2723 2724 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2725 { 2726 FloatParts pa = uint_to_float(a, scale, status); 2727 return float64_round_pack_canonical(pa, status); 2728 } 2729 2730 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2731 { 2732 return uint64_to_float64_scalbn(a, scale, status); 2733 } 2734 2735 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2736 { 2737 return uint64_to_float64_scalbn(a, scale, status); 2738 } 2739 2740 float64 uint64_to_float64(uint64_t a, float_status *status) 2741 { 2742 return uint64_to_float64_scalbn(a, 0, status); 2743 } 2744 2745 float64 uint32_to_float64(uint32_t a, float_status *status) 2746 { 2747 return uint64_to_float64_scalbn(a, 0, status); 2748 } 2749 2750 float64 uint16_to_float64(uint16_t a, float_status *status) 2751 { 2752 return uint64_to_float64_scalbn(a, 0, status); 2753 } 2754 2755 /* Float Min/Max */ 2756 /* min() and max() functions. These can't be implemented as 2757 * 'compare and pick one input' because that would mishandle 2758 * NaNs and +0 vs -0. 2759 * 2760 * minnum() and maxnum() functions. These are similar to the min() 2761 * and max() functions but if one of the arguments is a QNaN and 2762 * the other is numerical then the numerical argument is returned. 2763 * SNaNs will get quietened before being returned. 2764 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2765 * and maxNum() operations. 
min() and max() are the typical min/max 2766 * semantics provided by many CPUs which predate that specification. 2767 * 2768 * minnummag() and maxnummag() functions correspond to minNumMag() 2769 * and minNumMag() from the IEEE-754 2008. 2770 */ 2771 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2772 bool ieee, bool ismag, float_status *s) 2773 { 2774 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2775 if (ieee) { 2776 /* Takes two floating-point values `a' and `b', one of 2777 * which is a NaN, and returns the appropriate NaN 2778 * result. If either `a' or `b' is a signaling NaN, 2779 * the invalid exception is raised. 2780 */ 2781 if (is_snan(a.cls) || is_snan(b.cls)) { 2782 return pick_nan(a, b, s); 2783 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2784 return b; 2785 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2786 return a; 2787 } 2788 } 2789 return pick_nan(a, b, s); 2790 } else { 2791 int a_exp, b_exp; 2792 2793 switch (a.cls) { 2794 case float_class_normal: 2795 a_exp = a.exp; 2796 break; 2797 case float_class_inf: 2798 a_exp = INT_MAX; 2799 break; 2800 case float_class_zero: 2801 a_exp = INT_MIN; 2802 break; 2803 default: 2804 g_assert_not_reached(); 2805 break; 2806 } 2807 switch (b.cls) { 2808 case float_class_normal: 2809 b_exp = b.exp; 2810 break; 2811 case float_class_inf: 2812 b_exp = INT_MAX; 2813 break; 2814 case float_class_zero: 2815 b_exp = INT_MIN; 2816 break; 2817 default: 2818 g_assert_not_reached(); 2819 break; 2820 } 2821 2822 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2823 bool a_less = a_exp < b_exp; 2824 if (a_exp == b_exp) { 2825 a_less = a.frac < b.frac; 2826 } 2827 return a_less ^ ismin ? b : a; 2828 } 2829 2830 if (a.sign == b.sign) { 2831 bool a_less = a_exp < b_exp; 2832 if (a_exp == b_exp) { 2833 a_less = a.frac < b.frac; 2834 } 2835 return a.sign ^ a_less ^ ismin ? b : a; 2836 } else { 2837 return a.sign ^ ismin ? 
b : a; 2838 } 2839 } 2840 } 2841 2842 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2843 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2844 float_status *s) \ 2845 { \ 2846 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2847 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2848 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2849 \ 2850 return float ## sz ## _round_pack_canonical(pr, s); \ 2851 } 2852 2853 MINMAX(16, min, true, false, false) 2854 MINMAX(16, minnum, true, true, false) 2855 MINMAX(16, minnummag, true, true, true) 2856 MINMAX(16, max, false, false, false) 2857 MINMAX(16, maxnum, false, true, false) 2858 MINMAX(16, maxnummag, false, true, true) 2859 2860 MINMAX(32, min, true, false, false) 2861 MINMAX(32, minnum, true, true, false) 2862 MINMAX(32, minnummag, true, true, true) 2863 MINMAX(32, max, false, false, false) 2864 MINMAX(32, maxnum, false, true, false) 2865 MINMAX(32, maxnummag, false, true, true) 2866 2867 MINMAX(64, min, true, false, false) 2868 MINMAX(64, minnum, true, true, false) 2869 MINMAX(64, minnummag, true, true, true) 2870 MINMAX(64, max, false, false, false) 2871 MINMAX(64, maxnum, false, true, false) 2872 MINMAX(64, maxnummag, false, true, true) 2873 2874 #undef MINMAX 2875 2876 /* Floating point compare */ 2877 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2878 float_status *s) 2879 { 2880 if (is_nan(a.cls) || is_nan(b.cls)) { 2881 if (!is_quiet || 2882 a.cls == float_class_snan || 2883 b.cls == float_class_snan) { 2884 s->float_exception_flags |= float_flag_invalid; 2885 } 2886 return float_relation_unordered; 2887 } 2888 2889 if (a.cls == float_class_zero) { 2890 if (b.cls == float_class_zero) { 2891 return float_relation_equal; 2892 } 2893 return b.sign ? float_relation_greater : float_relation_less; 2894 } else if (b.cls == float_class_zero) { 2895 return a.sign ? 
float_relation_less : float_relation_greater; 2896 } 2897 2898 /* The only really important thing about infinity is its sign. If 2899 * both are infinities the sign marks the smallest of the two. 2900 */ 2901 if (a.cls == float_class_inf) { 2902 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2903 return float_relation_equal; 2904 } 2905 return a.sign ? float_relation_less : float_relation_greater; 2906 } else if (b.cls == float_class_inf) { 2907 return b.sign ? float_relation_greater : float_relation_less; 2908 } 2909 2910 if (a.sign != b.sign) { 2911 return a.sign ? float_relation_less : float_relation_greater; 2912 } 2913 2914 if (a.exp == b.exp) { 2915 if (a.frac == b.frac) { 2916 return float_relation_equal; 2917 } 2918 if (a.sign) { 2919 return a.frac > b.frac ? 2920 float_relation_less : float_relation_greater; 2921 } else { 2922 return a.frac > b.frac ? 2923 float_relation_greater : float_relation_less; 2924 } 2925 } else { 2926 if (a.sign) { 2927 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2928 } else { 2929 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 2930 } 2931 } 2932 } 2933 2934 #define COMPARE(name, attr, sz) \ 2935 static int attr \ 2936 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2937 { \ 2938 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2939 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2940 return compare_floats(pa, pb, is_quiet, s); \ 2941 } 2942 2943 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2944 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2945 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2946 2947 #undef COMPARE 2948 2949 int float16_compare(float16 a, float16 b, float_status *s) 2950 { 2951 return soft_f16_compare(a, b, false, s); 2952 } 2953 2954 int float16_compare_quiet(float16 a, float16 b, float_status *s) 2955 { 2956 return soft_f16_compare(a, b, true, s); 2957 } 2958 2959 static int QEMU_FLATTEN 2960 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2961 { 2962 union_float32 ua, ub; 2963 2964 ua.s = xa; 2965 ub.s = xb; 2966 2967 if (QEMU_NO_HARDFLOAT) { 2968 goto soft; 2969 } 2970 2971 float32_input_flush2(&ua.s, &ub.s, s); 2972 if (isgreaterequal(ua.h, ub.h)) { 2973 if (isgreater(ua.h, ub.h)) { 2974 return float_relation_greater; 2975 } 2976 return float_relation_equal; 2977 } 2978 if (likely(isless(ua.h, ub.h))) { 2979 return float_relation_less; 2980 } 2981 /* The only condition remaining is unordered. 2982 * Fall through to set flags. 
2983 */ 2984 soft: 2985 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2986 } 2987 2988 int float32_compare(float32 a, float32 b, float_status *s) 2989 { 2990 return f32_compare(a, b, false, s); 2991 } 2992 2993 int float32_compare_quiet(float32 a, float32 b, float_status *s) 2994 { 2995 return f32_compare(a, b, true, s); 2996 } 2997 2998 static int QEMU_FLATTEN 2999 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 3000 { 3001 union_float64 ua, ub; 3002 3003 ua.s = xa; 3004 ub.s = xb; 3005 3006 if (QEMU_NO_HARDFLOAT) { 3007 goto soft; 3008 } 3009 3010 float64_input_flush2(&ua.s, &ub.s, s); 3011 if (isgreaterequal(ua.h, ub.h)) { 3012 if (isgreater(ua.h, ub.h)) { 3013 return float_relation_greater; 3014 } 3015 return float_relation_equal; 3016 } 3017 if (likely(isless(ua.h, ub.h))) { 3018 return float_relation_less; 3019 } 3020 /* The only condition remaining is unordered. 3021 * Fall through to set flags. 3022 */ 3023 soft: 3024 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3025 } 3026 3027 int float64_compare(float64 a, float64 b, float_status *s) 3028 { 3029 return f64_compare(a, b, false, s); 3030 } 3031 3032 int float64_compare_quiet(float64 a, float64 b, float_status *s) 3033 { 3034 return f64_compare(a, b, true, s); 3035 } 3036 3037 /* Multiply A by 2 raised to the power N. */ 3038 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3039 { 3040 if (unlikely(is_nan(a.cls))) { 3041 return return_nan(a, s); 3042 } 3043 if (a.cls == float_class_normal) { 3044 /* The largest float type (even though not supported by FloatParts) 3045 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3046 * still allows rounding to infinity, without allowing overflow 3047 * within the int32_t that backs FloatParts.exp. 
3048 */ 3049 n = MIN(MAX(n, -0x10000), 0x10000); 3050 a.exp += n; 3051 } 3052 return a; 3053 } 3054 3055 float16 float16_scalbn(float16 a, int n, float_status *status) 3056 { 3057 FloatParts pa = float16_unpack_canonical(a, status); 3058 FloatParts pr = scalbn_decomposed(pa, n, status); 3059 return float16_round_pack_canonical(pr, status); 3060 } 3061 3062 float32 float32_scalbn(float32 a, int n, float_status *status) 3063 { 3064 FloatParts pa = float32_unpack_canonical(a, status); 3065 FloatParts pr = scalbn_decomposed(pa, n, status); 3066 return float32_round_pack_canonical(pr, status); 3067 } 3068 3069 float64 float64_scalbn(float64 a, int n, float_status *status) 3070 { 3071 FloatParts pa = float64_unpack_canonical(a, status); 3072 FloatParts pr = scalbn_decomposed(pa, n, status); 3073 return float64_round_pack_canonical(pr, status); 3074 } 3075 3076 /* 3077 * Square Root 3078 * 3079 * The old softfloat code did an approximation step before zeroing in 3080 * on the final result. However for simpleness we just compute the 3081 * square root by iterating down from the implicit bit to enough extra 3082 * bits to ensure we get a correctly rounded result. 3083 * 3084 * This does mean however the calculation is slower than before, 3085 * especially for 64 bit floats. 3086 */ 3087 3088 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3089 { 3090 uint64_t a_frac, r_frac, s_frac; 3091 int bit, last_bit; 3092 3093 if (is_nan(a.cls)) { 3094 return return_nan(a, s); 3095 } 3096 if (a.cls == float_class_zero) { 3097 return a; /* sqrt(+-0) = +-0 */ 3098 } 3099 if (a.sign) { 3100 s->float_exception_flags |= float_flag_invalid; 3101 return parts_default_nan(s); 3102 } 3103 if (a.cls == float_class_inf) { 3104 return a; /* sqrt(+inf) = +inf */ 3105 } 3106 3107 assert(a.cls == float_class_normal); 3108 3109 /* We need two overflow bits at the top. Adding room for that is a 3110 * right shift. 
If the exponent is odd, we can discard the low bit 3111 * by multiplying the fraction by 2; that's a left shift. Combine 3112 * those and we shift right if the exponent is even. 3113 */ 3114 a_frac = a.frac; 3115 if (!(a.exp & 1)) { 3116 a_frac >>= 1; 3117 } 3118 a.exp >>= 1; 3119 3120 /* Bit-by-bit computation of sqrt. */ 3121 r_frac = 0; 3122 s_frac = 0; 3123 3124 /* Iterate from implicit bit down to the 3 extra bits to compute a 3125 * properly rounded result. Remember we've inserted one more bit 3126 * at the top, so these positions are one less. 3127 */ 3128 bit = DECOMPOSED_BINARY_POINT - 1; 3129 last_bit = MAX(p->frac_shift - 4, 0); 3130 do { 3131 uint64_t q = 1ULL << bit; 3132 uint64_t t_frac = s_frac + q; 3133 if (t_frac <= a_frac) { 3134 s_frac = t_frac + q; 3135 a_frac -= t_frac; 3136 r_frac += q; 3137 } 3138 a_frac <<= 1; 3139 } while (--bit >= last_bit); 3140 3141 /* Undo the right shift done above. If there is any remaining 3142 * fraction, the result is inexact. Set the sticky bit. 
3143 */ 3144 a.frac = (r_frac << 1) + (a_frac != 0); 3145 3146 return a; 3147 } 3148 3149 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3150 { 3151 FloatParts pa = float16_unpack_canonical(a, status); 3152 FloatParts pr = sqrt_float(pa, status, &float16_params); 3153 return float16_round_pack_canonical(pr, status); 3154 } 3155 3156 static float32 QEMU_SOFTFLOAT_ATTR 3157 soft_f32_sqrt(float32 a, float_status *status) 3158 { 3159 FloatParts pa = float32_unpack_canonical(a, status); 3160 FloatParts pr = sqrt_float(pa, status, &float32_params); 3161 return float32_round_pack_canonical(pr, status); 3162 } 3163 3164 static float64 QEMU_SOFTFLOAT_ATTR 3165 soft_f64_sqrt(float64 a, float_status *status) 3166 { 3167 FloatParts pa = float64_unpack_canonical(a, status); 3168 FloatParts pr = sqrt_float(pa, status, &float64_params); 3169 return float64_round_pack_canonical(pr, status); 3170 } 3171 3172 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3173 { 3174 union_float32 ua, ur; 3175 3176 ua.s = xa; 3177 if (unlikely(!can_use_fpu(s))) { 3178 goto soft; 3179 } 3180 3181 float32_input_flush1(&ua.s, s); 3182 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3183 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3184 fpclassify(ua.h) == FP_ZERO) || 3185 signbit(ua.h))) { 3186 goto soft; 3187 } 3188 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3189 float32_is_neg(ua.s))) { 3190 goto soft; 3191 } 3192 ur.h = sqrtf(ua.h); 3193 return ur.s; 3194 3195 soft: 3196 return soft_f32_sqrt(ua.s, s); 3197 } 3198 3199 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3200 { 3201 union_float64 ua, ur; 3202 3203 ua.s = xa; 3204 if (unlikely(!can_use_fpu(s))) { 3205 goto soft; 3206 } 3207 3208 float64_input_flush1(&ua.s, s); 3209 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3210 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3211 fpclassify(ua.h) == FP_ZERO) || 3212 signbit(ua.h))) { 3213 goto soft; 3214 } 3215 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3216 float64_is_neg(ua.s))) { 3217 goto soft; 3218 } 3219 ur.h = sqrt(ua.h); 3220 return ur.s; 3221 3222 soft: 3223 return soft_f64_sqrt(ua.s, s); 3224 } 3225 3226 /*---------------------------------------------------------------------------- 3227 | The pattern for a default generated NaN. 3228 *----------------------------------------------------------------------------*/ 3229 3230 float16 float16_default_nan(float_status *status) 3231 { 3232 FloatParts p = parts_default_nan(status); 3233 p.frac >>= float16_params.frac_shift; 3234 return float16_pack_raw(p); 3235 } 3236 3237 float32 float32_default_nan(float_status *status) 3238 { 3239 FloatParts p = parts_default_nan(status); 3240 p.frac >>= float32_params.frac_shift; 3241 return float32_pack_raw(p); 3242 } 3243 3244 float64 float64_default_nan(float_status *status) 3245 { 3246 FloatParts p = parts_default_nan(status); 3247 p.frac >>= float64_params.frac_shift; 3248 return float64_pack_raw(p); 3249 } 3250 3251 float128 float128_default_nan(float_status *status) 3252 { 3253 FloatParts p = parts_default_nan(status); 3254 float128 r; 3255 3256 /* Extrapolate from the choices made by parts_default_nan to fill 3257 * in the quad-floating format. If the low bit is set, assume we 3258 * want to set all non-snan bits. 3259 */ 3260 r.low = -(p.frac & 1); 3261 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3262 r.high |= LIT64(0x7FFF000000000000); 3263 r.high |= (uint64_t)p.sign << 63; 3264 3265 return r; 3266 } 3267 3268 /*---------------------------------------------------------------------------- 3269 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3270 *----------------------------------------------------------------------------*/ 3271 3272 float16 float16_silence_nan(float16 a, float_status *status) 3273 { 3274 FloatParts p = float16_unpack_raw(a); 3275 p.frac <<= float16_params.frac_shift; 3276 p = parts_silence_nan(p, status); 3277 p.frac >>= float16_params.frac_shift; 3278 return float16_pack_raw(p); 3279 } 3280 3281 float32 float32_silence_nan(float32 a, float_status *status) 3282 { 3283 FloatParts p = float32_unpack_raw(a); 3284 p.frac <<= float32_params.frac_shift; 3285 p = parts_silence_nan(p, status); 3286 p.frac >>= float32_params.frac_shift; 3287 return float32_pack_raw(p); 3288 } 3289 3290 float64 float64_silence_nan(float64 a, float_status *status) 3291 { 3292 FloatParts p = float64_unpack_raw(a); 3293 p.frac <<= float64_params.frac_shift; 3294 p = parts_silence_nan(p, status); 3295 p.frac >>= float64_params.frac_shift; 3296 return float64_pack_raw(p); 3297 } 3298 3299 /*---------------------------------------------------------------------------- 3300 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3301 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3302 | input. If `zSign' is 1, the input is negated before being converted to an 3303 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3304 | is simply rounded to an integer, with the inexact exception raised if the 3305 | input cannot be represented exactly as an integer. However, if the fixed- 3306 | point input is too large, the invalid exception is raised and the largest 3307 | positive or negative integer is returned. 
3308 *----------------------------------------------------------------------------*/ 3309 3310 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3311 { 3312 int8_t roundingMode; 3313 flag roundNearestEven; 3314 int8_t roundIncrement, roundBits; 3315 int32_t z; 3316 3317 roundingMode = status->float_rounding_mode; 3318 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3319 switch (roundingMode) { 3320 case float_round_nearest_even: 3321 case float_round_ties_away: 3322 roundIncrement = 0x40; 3323 break; 3324 case float_round_to_zero: 3325 roundIncrement = 0; 3326 break; 3327 case float_round_up: 3328 roundIncrement = zSign ? 0 : 0x7f; 3329 break; 3330 case float_round_down: 3331 roundIncrement = zSign ? 0x7f : 0; 3332 break; 3333 case float_round_to_odd: 3334 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3335 break; 3336 default: 3337 abort(); 3338 } 3339 roundBits = absZ & 0x7F; 3340 absZ = ( absZ + roundIncrement )>>7; 3341 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3342 z = absZ; 3343 if ( zSign ) z = - z; 3344 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3345 float_raise(float_flag_invalid, status); 3346 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3347 } 3348 if (roundBits) { 3349 status->float_exception_flags |= float_flag_inexact; 3350 } 3351 return z; 3352 3353 } 3354 3355 /*---------------------------------------------------------------------------- 3356 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3357 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3358 | and returns the properly rounded 64-bit integer corresponding to the input. 3359 | If `zSign' is 1, the input is negated before being converted to an integer. 3360 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3361 | the inexact exception raised if the input cannot be represented exactly as 3362 | an integer. 
However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

/*
 * Implementation notes:
 *  - `absZ0' is the integer part and `absZ1' the fraction; bit 63 of
 *    `absZ1' is the "one half" bit.
 *  - The sign is applied in unsigned arithmetic.  A rounded magnitude of
 *    exactly 2^63 with `zSign' set must yield INT64_MIN, and producing it by
 *    negating a signed INT64_MIN would be undefined behaviour in ISO C (it
 *    only works when the compiler is told to wrap signed overflow).
 */
static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
                               float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the fraction is one half or more. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* The integer part wrapped past 2^64 - 1. */
            goto overflow;
        }
        /* Exact tie under nearest-even: clear bit 0 so the result is even. */
        absZ0 &= ~(((uint64_t) (absZ1 << 1) == 0) & roundNearestEven);
    }

    /* Apply the sign modulo 2^64; never negate a signed INT64_MIN. */
    z = (int64_t)(zSign ? (uint64_t)0 - absZ0 : absZ0);

    /* Sign of the packed result disagrees with the requested sign. */
    if (z && ((z < 0) ^ zSign)) {
        goto overflow;
    }
    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

 overflow:
    float_raise(float_flag_invalid, status);
    return zSign ? (int64_t) LIT64(0x8000000000000000)
                 : LIT64(0x7FFFFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input.
Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/*
 * Note: the declared return type is int64_t although the value is an
 * unsigned 64-bit quantity; the saturated result 0xFFFFFFFFFFFFFFFF is
 * therefore passed back through a signed type.  A negative input that
 * rounds to a non-zero magnitude raises invalid and returns 0.
 */
static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
                                uint64_t absZ1, float_status *status)
{
    const int8_t mode = status->float_rounding_mode;
    const flag tiesToEven = (mode == float_round_nearest_even);
    flag bump;

    switch (mode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the fraction is one half or more. */
        bump = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        bump = 0;
        break;
    case float_round_up:
        bump = !zSign && absZ1;
        break;
    case float_round_down:
        bump = zSign && absZ1;
        break;
    case float_round_to_odd:
        bump = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }

    if (bump) {
        absZ0 += 1;
        if (absZ0 == 0) {
            /* The integer part wrapped: saturate. */
            float_raise(float_flag_invalid, status);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        /* Exact tie under nearest-even: force the result even. */
        if (tiesToEven && (uint64_t)(absZ1 << 1) == 0) {
            absZ0 &= ~(uint64_t)1;
        }
    }

    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
3478 *----------------------------------------------------------------------------*/ 3479 float32 float32_squash_input_denormal(float32 a, float_status *status) 3480 { 3481 if (status->flush_inputs_to_zero) { 3482 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3483 float_raise(float_flag_input_denormal, status); 3484 return make_float32(float32_val(a) & 0x80000000); 3485 } 3486 } 3487 return a; 3488 } 3489 3490 /*---------------------------------------------------------------------------- 3491 | Normalizes the subnormal single-precision floating-point value represented 3492 | by the denormalized significand `aSig'. The normalized exponent and 3493 | significand are stored at the locations pointed to by `zExpPtr' and 3494 | `zSigPtr', respectively. 3495 *----------------------------------------------------------------------------*/ 3496 3497 static void 3498 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3499 { 3500 int8_t shiftCount; 3501 3502 shiftCount = clz32(aSig) - 8; 3503 *zSigPtr = aSig<<shiftCount; 3504 *zExpPtr = 1 - shiftCount; 3505 3506 } 3507 3508 /*---------------------------------------------------------------------------- 3509 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3510 | and significand `zSig', and returns the proper single-precision floating- 3511 | point value corresponding to the abstract input. Ordinarily, the abstract 3512 | value is simply rounded and packed into the single-precision format, with 3513 | the inexact exception raised if the abstract input cannot be represented 3514 | exactly. However, if the abstract value is too large, the overflow and 3515 | inexact exceptions are raised and an infinity or maximal finite value is 3516 | returned. 
If the abstract value is too small, the input value is rounded to 3517 | a subnormal number, and the underflow and inexact exceptions are raised if 3518 | the abstract input cannot be represented exactly as a subnormal single- 3519 | precision floating-point number. 3520 | The input significand `zSig' has its binary point between bits 30 3521 | and 29, which is 7 bits to the left of the usual location. This shifted 3522 | significand must be normalized or smaller. If `zSig' is not normalized, 3523 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3524 | and it must not require rounding. In the usual case that `zSig' is 3525 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3526 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3527 | Binary Floating-Point Arithmetic. 3528 *----------------------------------------------------------------------------*/ 3529 3530 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3531 float_status *status) 3532 { 3533 int8_t roundingMode; 3534 flag roundNearestEven; 3535 int8_t roundIncrement, roundBits; 3536 flag isTiny; 3537 3538 roundingMode = status->float_rounding_mode; 3539 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3540 switch (roundingMode) { 3541 case float_round_nearest_even: 3542 case float_round_ties_away: 3543 roundIncrement = 0x40; 3544 break; 3545 case float_round_to_zero: 3546 roundIncrement = 0; 3547 break; 3548 case float_round_up: 3549 roundIncrement = zSign ? 0 : 0x7f; 3550 break; 3551 case float_round_down: 3552 roundIncrement = zSign ? 0x7f : 0; 3553 break; 3554 case float_round_to_odd: 3555 roundIncrement = zSig & 0x80 ? 
0 : 0x7f; 3556 break; 3557 default: 3558 abort(); 3559 break; 3560 } 3561 roundBits = zSig & 0x7F; 3562 if ( 0xFD <= (uint16_t) zExp ) { 3563 if ( ( 0xFD < zExp ) 3564 || ( ( zExp == 0xFD ) 3565 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3566 ) { 3567 bool overflow_to_inf = roundingMode != float_round_to_odd && 3568 roundIncrement != 0; 3569 float_raise(float_flag_overflow | float_flag_inexact, status); 3570 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 3571 } 3572 if ( zExp < 0 ) { 3573 if (status->flush_to_zero) { 3574 float_raise(float_flag_output_denormal, status); 3575 return packFloat32(zSign, 0, 0); 3576 } 3577 isTiny = 3578 (status->float_detect_tininess 3579 == float_tininess_before_rounding) 3580 || ( zExp < -1 ) 3581 || ( zSig + roundIncrement < 0x80000000 ); 3582 shift32RightJamming( zSig, - zExp, &zSig ); 3583 zExp = 0; 3584 roundBits = zSig & 0x7F; 3585 if (isTiny && roundBits) { 3586 float_raise(float_flag_underflow, status); 3587 } 3588 if (roundingMode == float_round_to_odd) { 3589 /* 3590 * For round-to-odd case, the roundIncrement depends on 3591 * zSig which just changed. 3592 */ 3593 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 3594 } 3595 } 3596 } 3597 if (roundBits) { 3598 status->float_exception_flags |= float_flag_inexact; 3599 } 3600 zSig = ( zSig + roundIncrement )>>7; 3601 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3602 if ( zSig == 0 ) zExp = 0; 3603 return packFloat32( zSign, zExp, zSig ); 3604 3605 } 3606 3607 /*---------------------------------------------------------------------------- 3608 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3609 | and significand `zSig', and returns the proper single-precision floating- 3610 | point value corresponding to the abstract input. This routine is just like 3611 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 
3612 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3613 | floating-point exponent. 3614 *----------------------------------------------------------------------------*/ 3615 3616 static float32 3617 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3618 float_status *status) 3619 { 3620 int8_t shiftCount; 3621 3622 shiftCount = clz32(zSig) - 1; 3623 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3624 status); 3625 3626 } 3627 3628 /*---------------------------------------------------------------------------- 3629 | If `a' is denormal and we are in flush-to-zero mode then set the 3630 | input-denormal exception and return zero. Otherwise just return the value. 3631 *----------------------------------------------------------------------------*/ 3632 float64 float64_squash_input_denormal(float64 a, float_status *status) 3633 { 3634 if (status->flush_inputs_to_zero) { 3635 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3636 float_raise(float_flag_input_denormal, status); 3637 return make_float64(float64_val(a) & (1ULL << 63)); 3638 } 3639 } 3640 return a; 3641 } 3642 3643 /*---------------------------------------------------------------------------- 3644 | Normalizes the subnormal double-precision floating-point value represented 3645 | by the denormalized significand `aSig'. The normalized exponent and 3646 | significand are stored at the locations pointed to by `zExpPtr' and 3647 | `zSigPtr', respectively. 
3648 *----------------------------------------------------------------------------*/ 3649 3650 static void 3651 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3652 { 3653 int8_t shiftCount; 3654 3655 shiftCount = clz64(aSig) - 11; 3656 *zSigPtr = aSig<<shiftCount; 3657 *zExpPtr = 1 - shiftCount; 3658 3659 } 3660 3661 /*---------------------------------------------------------------------------- 3662 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3663 | double-precision floating-point value, returning the result. After being 3664 | shifted into the proper positions, the three fields are simply added 3665 | together to form the result. This means that any integer portion of `zSig' 3666 | will be added into the exponent. Since a properly normalized significand 3667 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3668 | than the desired result exponent whenever `zSig' is a complete, normalized 3669 | significand. 3670 *----------------------------------------------------------------------------*/ 3671 3672 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3673 { 3674 3675 return make_float64( 3676 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3677 3678 } 3679 3680 /*---------------------------------------------------------------------------- 3681 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3682 | and significand `zSig', and returns the proper double-precision floating- 3683 | point value corresponding to the abstract input. Ordinarily, the abstract 3684 | value is simply rounded and packed into the double-precision format, with 3685 | the inexact exception raised if the abstract input cannot be represented 3686 | exactly. However, if the abstract value is too large, the overflow and 3687 | inexact exceptions are raised and an infinity or maximal finite value is 3688 | returned. 
If the abstract value is too small, the input value is rounded to 3689 | a subnormal number, and the underflow and inexact exceptions are raised if 3690 | the abstract input cannot be represented exactly as a subnormal double- 3691 | precision floating-point number. 3692 | The input significand `zSig' has its binary point between bits 62 3693 | and 61, which is 10 bits to the left of the usual location. This shifted 3694 | significand must be normalized or smaller. If `zSig' is not normalized, 3695 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3696 | and it must not require rounding. In the usual case that `zSig' is 3697 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3698 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3699 | Binary Floating-Point Arithmetic. 3700 *----------------------------------------------------------------------------*/ 3701 3702 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3703 float_status *status) 3704 { 3705 int8_t roundingMode; 3706 flag roundNearestEven; 3707 int roundIncrement, roundBits; 3708 flag isTiny; 3709 3710 roundingMode = status->float_rounding_mode; 3711 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3712 switch (roundingMode) { 3713 case float_round_nearest_even: 3714 case float_round_ties_away: 3715 roundIncrement = 0x200; 3716 break; 3717 case float_round_to_zero: 3718 roundIncrement = 0; 3719 break; 3720 case float_round_up: 3721 roundIncrement = zSign ? 0 : 0x3ff; 3722 break; 3723 case float_round_down: 3724 roundIncrement = zSign ? 0x3ff : 0; 3725 break; 3726 case float_round_to_odd: 3727 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3728 break; 3729 default: 3730 abort(); 3731 } 3732 roundBits = zSig & 0x3FF; 3733 if ( 0x7FD <= (uint16_t) zExp ) { 3734 if ( ( 0x7FD < zExp ) 3735 || ( ( zExp == 0x7FD ) 3736 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3737 ) { 3738 bool overflow_to_inf = roundingMode != float_round_to_odd && 3739 roundIncrement != 0; 3740 float_raise(float_flag_overflow | float_flag_inexact, status); 3741 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3742 } 3743 if ( zExp < 0 ) { 3744 if (status->flush_to_zero) { 3745 float_raise(float_flag_output_denormal, status); 3746 return packFloat64(zSign, 0, 0); 3747 } 3748 isTiny = 3749 (status->float_detect_tininess 3750 == float_tininess_before_rounding) 3751 || ( zExp < -1 ) 3752 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3753 shift64RightJamming( zSig, - zExp, &zSig ); 3754 zExp = 0; 3755 roundBits = zSig & 0x3FF; 3756 if (isTiny && roundBits) { 3757 float_raise(float_flag_underflow, status); 3758 } 3759 if (roundingMode == float_round_to_odd) { 3760 /* 3761 * For round-to-odd case, the roundIncrement depends on 3762 * zSig which just changed. 3763 */ 3764 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3765 } 3766 } 3767 } 3768 if (roundBits) { 3769 status->float_exception_flags |= float_flag_inexact; 3770 } 3771 zSig = ( zSig + roundIncrement )>>10; 3772 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3773 if ( zSig == 0 ) zExp = 0; 3774 return packFloat64( zSign, zExp, zSig ); 3775 3776 } 3777 3778 /*---------------------------------------------------------------------------- 3779 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3780 | and significand `zSig', and returns the proper double-precision floating- 3781 | point value corresponding to the abstract input. This routine is just like 3782 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3783 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3784 | floating-point exponent. 3785 *----------------------------------------------------------------------------*/ 3786 3787 static float64 3788 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3789 float_status *status) 3790 { 3791 int8_t shiftCount; 3792 3793 shiftCount = clz64(zSig) - 1; 3794 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3795 status); 3796 3797 } 3798 3799 /*---------------------------------------------------------------------------- 3800 | Normalizes the subnormal extended double-precision floating-point value 3801 | represented by the denormalized significand `aSig'. The normalized exponent 3802 | and significand are stored at the locations pointed to by `zExpPtr' and 3803 | `zSigPtr', respectively. 3804 *----------------------------------------------------------------------------*/ 3805 3806 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3807 uint64_t *zSigPtr) 3808 { 3809 int8_t shiftCount; 3810 3811 shiftCount = clz64(aSig); 3812 *zSigPtr = aSig<<shiftCount; 3813 *zExpPtr = 1 - shiftCount; 3814 } 3815 3816 /*---------------------------------------------------------------------------- 3817 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3818 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3819 | and returns the proper extended double-precision floating-point value 3820 | corresponding to the abstract input. Ordinarily, the abstract value is 3821 | rounded and packed into the extended double-precision format, with the 3822 | inexact exception raised if the abstract input cannot be represented 3823 | exactly. However, if the abstract value is too large, the overflow and 3824 | inexact exceptions are raised and an infinity or maximal finite value is 3825 | returned. 
If the abstract value is too small, the input value is rounded to 3826 | a subnormal number, and the underflow and inexact exceptions are raised if 3827 | the abstract input cannot be represented exactly as a subnormal extended 3828 | double-precision floating-point number. 3829 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3830 | number of bits as single or double precision, respectively. Otherwise, the 3831 | result is rounded to the full precision of the extended double-precision 3832 | format. 3833 | The input significand must be normalized or smaller. If the input 3834 | significand is not normalized, `zExp' must be 0; in that case, the result 3835 | returned is a subnormal number, and it must not require rounding. The 3836 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3837 | Floating-Point Arithmetic. 3838 *----------------------------------------------------------------------------*/ 3839 3840 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3841 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3842 float_status *status) 3843 { 3844 int8_t roundingMode; 3845 flag roundNearestEven, increment, isTiny; 3846 int64_t roundIncrement, roundMask, roundBits; 3847 3848 roundingMode = status->float_rounding_mode; 3849 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3850 if ( roundingPrecision == 80 ) goto precision80; 3851 if ( roundingPrecision == 64 ) { 3852 roundIncrement = LIT64( 0x0000000000000400 ); 3853 roundMask = LIT64( 0x00000000000007FF ); 3854 } 3855 else if ( roundingPrecision == 32 ) { 3856 roundIncrement = LIT64( 0x0000008000000000 ); 3857 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3858 } 3859 else { 3860 goto precision80; 3861 } 3862 zSig0 |= ( zSig1 != 0 ); 3863 switch (roundingMode) { 3864 case float_round_nearest_even: 3865 case float_round_ties_away: 3866 break; 3867 case float_round_to_zero: 3868 roundIncrement = 0; 3869 break; 3870 case float_round_up: 3871 
roundIncrement = zSign ? 0 : roundMask; 3872 break; 3873 case float_round_down: 3874 roundIncrement = zSign ? roundMask : 0; 3875 break; 3876 default: 3877 abort(); 3878 } 3879 roundBits = zSig0 & roundMask; 3880 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3881 if ( ( 0x7FFE < zExp ) 3882 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3883 ) { 3884 goto overflow; 3885 } 3886 if ( zExp <= 0 ) { 3887 if (status->flush_to_zero) { 3888 float_raise(float_flag_output_denormal, status); 3889 return packFloatx80(zSign, 0, 0); 3890 } 3891 isTiny = 3892 (status->float_detect_tininess 3893 == float_tininess_before_rounding) 3894 || ( zExp < 0 ) 3895 || ( zSig0 <= zSig0 + roundIncrement ); 3896 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3897 zExp = 0; 3898 roundBits = zSig0 & roundMask; 3899 if (isTiny && roundBits) { 3900 float_raise(float_flag_underflow, status); 3901 } 3902 if (roundBits) { 3903 status->float_exception_flags |= float_flag_inexact; 3904 } 3905 zSig0 += roundIncrement; 3906 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3907 roundIncrement = roundMask + 1; 3908 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3909 roundMask |= roundIncrement; 3910 } 3911 zSig0 &= ~ roundMask; 3912 return packFloatx80( zSign, zExp, zSig0 ); 3913 } 3914 } 3915 if (roundBits) { 3916 status->float_exception_flags |= float_flag_inexact; 3917 } 3918 zSig0 += roundIncrement; 3919 if ( zSig0 < roundIncrement ) { 3920 ++zExp; 3921 zSig0 = LIT64( 0x8000000000000000 ); 3922 } 3923 roundIncrement = roundMask + 1; 3924 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3925 roundMask |= roundIncrement; 3926 } 3927 zSig0 &= ~ roundMask; 3928 if ( zSig0 == 0 ) zExp = 0; 3929 return packFloatx80( zSign, zExp, zSig0 ); 3930 precision80: 3931 switch (roundingMode) { 3932 case float_round_nearest_even: 3933 case float_round_ties_away: 3934 increment = ((int64_t)zSig1 < 0); 3935 break; 3936 case float_round_to_zero: 3937 increment = 0; 3938 break; 3939 case 
float_round_up: 3940 increment = !zSign && zSig1; 3941 break; 3942 case float_round_down: 3943 increment = zSign && zSig1; 3944 break; 3945 default: 3946 abort(); 3947 } 3948 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3949 if ( ( 0x7FFE < zExp ) 3950 || ( ( zExp == 0x7FFE ) 3951 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3952 && increment 3953 ) 3954 ) { 3955 roundMask = 0; 3956 overflow: 3957 float_raise(float_flag_overflow | float_flag_inexact, status); 3958 if ( ( roundingMode == float_round_to_zero ) 3959 || ( zSign && ( roundingMode == float_round_up ) ) 3960 || ( ! zSign && ( roundingMode == float_round_down ) ) 3961 ) { 3962 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3963 } 3964 return packFloatx80(zSign, 3965 floatx80_infinity_high, 3966 floatx80_infinity_low); 3967 } 3968 if ( zExp <= 0 ) { 3969 isTiny = 3970 (status->float_detect_tininess 3971 == float_tininess_before_rounding) 3972 || ( zExp < 0 ) 3973 || ! increment 3974 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3975 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3976 zExp = 0; 3977 if (isTiny && zSig1) { 3978 float_raise(float_flag_underflow, status); 3979 } 3980 if (zSig1) { 3981 status->float_exception_flags |= float_flag_inexact; 3982 } 3983 switch (roundingMode) { 3984 case float_round_nearest_even: 3985 case float_round_ties_away: 3986 increment = ((int64_t)zSig1 < 0); 3987 break; 3988 case float_round_to_zero: 3989 increment = 0; 3990 break; 3991 case float_round_up: 3992 increment = !zSign && zSig1; 3993 break; 3994 case float_round_down: 3995 increment = zSign && zSig1; 3996 break; 3997 default: 3998 abort(); 3999 } 4000 if ( increment ) { 4001 ++zSig0; 4002 zSig0 &= 4003 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 4004 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4005 } 4006 return packFloatx80( zSign, zExp, zSig0 ); 4007 } 4008 } 4009 if (zSig1) { 4010 status->float_exception_flags |= float_flag_inexact; 4011 } 4012 if ( increment ) { 4013 ++zSig0; 4014 
if ( zSig0 == 0 ) { 4015 ++zExp; 4016 zSig0 = LIT64( 0x8000000000000000 ); 4017 } 4018 else { 4019 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 4020 } 4021 } 4022 else { 4023 if ( zSig0 == 0 ) zExp = 0; 4024 } 4025 return packFloatx80( zSign, zExp, zSig0 ); 4026 4027 } 4028 4029 /*---------------------------------------------------------------------------- 4030 | Takes an abstract floating-point value having sign `zSign', exponent 4031 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4032 | and returns the proper extended double-precision floating-point value 4033 | corresponding to the abstract input. This routine is just like 4034 | `roundAndPackFloatx80' except that the input significand does not have to be 4035 | normalized. 4036 *----------------------------------------------------------------------------*/ 4037 4038 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4039 flag zSign, int32_t zExp, 4040 uint64_t zSig0, uint64_t zSig1, 4041 float_status *status) 4042 { 4043 int8_t shiftCount; 4044 4045 if ( zSig0 == 0 ) { 4046 zSig0 = zSig1; 4047 zSig1 = 0; 4048 zExp -= 64; 4049 } 4050 shiftCount = clz64(zSig0); 4051 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4052 zExp -= shiftCount; 4053 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4054 zSig0, zSig1, status); 4055 4056 } 4057 4058 /*---------------------------------------------------------------------------- 4059 | Returns the least-significant 64 fraction bits of the quadruple-precision 4060 | floating-point value `a'. 4061 *----------------------------------------------------------------------------*/ 4062 4063 static inline uint64_t extractFloat128Frac1( float128 a ) 4064 { 4065 4066 return a.low; 4067 4068 } 4069 4070 /*---------------------------------------------------------------------------- 4071 | Returns the most-significant 48 fraction bits of the quadruple-precision 4072 | floating-point value `a'. 
4073 *----------------------------------------------------------------------------*/ 4074 4075 static inline uint64_t extractFloat128Frac0( float128 a ) 4076 { 4077 4078 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 4079 4080 } 4081 4082 /*---------------------------------------------------------------------------- 4083 | Returns the exponent bits of the quadruple-precision floating-point value 4084 | `a'. 4085 *----------------------------------------------------------------------------*/ 4086 4087 static inline int32_t extractFloat128Exp( float128 a ) 4088 { 4089 4090 return ( a.high>>48 ) & 0x7FFF; 4091 4092 } 4093 4094 /*---------------------------------------------------------------------------- 4095 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4096 *----------------------------------------------------------------------------*/ 4097 4098 static inline flag extractFloat128Sign( float128 a ) 4099 { 4100 4101 return a.high>>63; 4102 4103 } 4104 4105 /*---------------------------------------------------------------------------- 4106 | Normalizes the subnormal quadruple-precision floating-point value 4107 | represented by the denormalized significand formed by the concatenation of 4108 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4109 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4110 | significand are stored at the location pointed to by `zSig0Ptr', and the 4111 | least significant 64 bits of the normalized significand are stored at the 4112 | location pointed to by `zSig1Ptr'. 
4113 *----------------------------------------------------------------------------*/ 4114 4115 static void 4116 normalizeFloat128Subnormal( 4117 uint64_t aSig0, 4118 uint64_t aSig1, 4119 int32_t *zExpPtr, 4120 uint64_t *zSig0Ptr, 4121 uint64_t *zSig1Ptr 4122 ) 4123 { 4124 int8_t shiftCount; 4125 4126 if ( aSig0 == 0 ) { 4127 shiftCount = clz64(aSig1) - 15; 4128 if ( shiftCount < 0 ) { 4129 *zSig0Ptr = aSig1>>( - shiftCount ); 4130 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4131 } 4132 else { 4133 *zSig0Ptr = aSig1<<shiftCount; 4134 *zSig1Ptr = 0; 4135 } 4136 *zExpPtr = - shiftCount - 63; 4137 } 4138 else { 4139 shiftCount = clz64(aSig0) - 15; 4140 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4141 *zExpPtr = 1 - shiftCount; 4142 } 4143 4144 } 4145 4146 /*---------------------------------------------------------------------------- 4147 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4148 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4149 | floating-point value, returning the result. After being shifted into the 4150 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4151 | added together to form the most significant 32 bits of the result. This 4152 | means that any integer portion of `zSig0' will be added into the exponent. 4153 | Since a properly normalized significand will have an integer portion equal 4154 | to 1, the `zExp' input should be 1 less than the desired result exponent 4155 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4156 | significand. 
4157 *----------------------------------------------------------------------------*/ 4158 4159 static inline float128 4160 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 4161 { 4162 float128 z; 4163 4164 z.low = zSig1; 4165 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 4166 return z; 4167 4168 } 4169 4170 /*---------------------------------------------------------------------------- 4171 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4172 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4173 | and `zSig2', and returns the proper quadruple-precision floating-point value 4174 | corresponding to the abstract input. Ordinarily, the abstract value is 4175 | simply rounded and packed into the quadruple-precision format, with the 4176 | inexact exception raised if the abstract input cannot be represented 4177 | exactly. However, if the abstract value is too large, the overflow and 4178 | inexact exceptions are raised and an infinity or maximal finite value is 4179 | returned. If the abstract value is too small, the input value is rounded to 4180 | a subnormal number, and the underflow and inexact exceptions are raised if 4181 | the abstract input cannot be represented exactly as a subnormal quadruple- 4182 | precision floating-point number. 4183 | The input significand must be normalized or smaller. If the input 4184 | significand is not normalized, `zExp' must be 0; in that case, the result 4185 | returned is a subnormal number, and it must not require rounding. In the 4186 | usual case that the input significand is normalized, `zExp' must be 1 less 4187 | than the ``true'' floating-point exponent. The handling of underflow and 4188 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide from the extension word zSig2 whether the magnitude has to be
     * rounded up by one unit in the last place. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig2 set: the discarded part is at least half an ulp. */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the kept low bit is even and bits were lost. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned compare is true both for zExp >= 0x7FFD and for negative
     * zExp, so one test guards the overflow and the underflow handling. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /* Overflow: either the exponent is already too large, or the
             * all-ones significand is about to be incremented. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign (and round-to-odd)
             * return the largest finite value instead of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                /* Flush-to-zero: report the denormal result, return a
                 * signed zero. */
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess must be evaluated on the significand as it is before
             * the denormalizing shift below. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /* Denormalize: shift right by -zExp, jamming lost bits into
             * zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig1/zSig2, so the rounding decision is
             * taken again on the shifted value. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* zSig2 + zSig2 == 0 holds when zSig2 has nothing but (at most) its
         * top bit set; in nearest-even mode that is an exact tie, so the low
         * bit is cleared to land on the even neighbour. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A significand of zero packs as a true zero. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
4314 *----------------------------------------------------------------------------*/ 4315 4316 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4317 uint64_t zSig0, uint64_t zSig1, 4318 float_status *status) 4319 { 4320 int8_t shiftCount; 4321 uint64_t zSig2; 4322 4323 if ( zSig0 == 0 ) { 4324 zSig0 = zSig1; 4325 zSig1 = 0; 4326 zExp -= 64; 4327 } 4328 shiftCount = clz64(zSig0) - 15; 4329 if ( 0 <= shiftCount ) { 4330 zSig2 = 0; 4331 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4332 } 4333 else { 4334 shift128ExtraRightJamming( 4335 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4336 } 4337 zExp -= shiftCount; 4338 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4339 4340 } 4341 4342 4343 /*---------------------------------------------------------------------------- 4344 | Returns the result of converting the 32-bit two's complement integer `a' 4345 | to the extended double-precision floating-point format. The conversion 4346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4347 | Arithmetic. 4348 *----------------------------------------------------------------------------*/ 4349 4350 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4351 { 4352 flag zSign; 4353 uint32_t absA; 4354 int8_t shiftCount; 4355 uint64_t zSig; 4356 4357 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4358 zSign = ( a < 0 ); 4359 absA = zSign ? - a : a; 4360 shiftCount = clz32(absA) + 32; 4361 zSig = absA; 4362 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4363 4364 } 4365 4366 /*---------------------------------------------------------------------------- 4367 | Returns the result of converting the 32-bit two's complement integer `a' to 4368 | the quadruple-precision floating-point format. The conversion is performed 4369 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4370 *----------------------------------------------------------------------------*/ 4371 4372 float128 int32_to_float128(int32_t a, float_status *status) 4373 { 4374 flag zSign; 4375 uint32_t absA; 4376 int8_t shiftCount; 4377 uint64_t zSig0; 4378 4379 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4380 zSign = ( a < 0 ); 4381 absA = zSign ? - a : a; 4382 shiftCount = clz32(absA) + 17; 4383 zSig0 = absA; 4384 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4385 4386 } 4387 4388 /*---------------------------------------------------------------------------- 4389 | Returns the result of converting the 64-bit two's complement integer `a' 4390 | to the extended double-precision floating-point format. The conversion 4391 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4392 | Arithmetic. 4393 *----------------------------------------------------------------------------*/ 4394 4395 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4396 { 4397 flag zSign; 4398 uint64_t absA; 4399 int8_t shiftCount; 4400 4401 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4402 zSign = ( a < 0 ); 4403 absA = zSign ? - a : a; 4404 shiftCount = clz64(absA); 4405 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4406 4407 } 4408 4409 /*---------------------------------------------------------------------------- 4410 | Returns the result of converting the 64-bit two's complement integer `a' to 4411 | the quadruple-precision floating-point format. The conversion is performed 4412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4413 *----------------------------------------------------------------------------*/ 4414 4415 float128 int64_to_float128(int64_t a, float_status *status) 4416 { 4417 flag zSign; 4418 uint64_t absA; 4419 int8_t shiftCount; 4420 int32_t zExp; 4421 uint64_t zSig0, zSig1; 4422 4423 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4424 zSign = ( a < 0 ); 4425 absA = zSign ? - a : a; 4426 shiftCount = clz64(absA) + 49; 4427 zExp = 0x406E - shiftCount; 4428 if ( 64 <= shiftCount ) { 4429 zSig1 = 0; 4430 zSig0 = absA; 4431 shiftCount -= 64; 4432 } 4433 else { 4434 zSig1 = absA; 4435 zSig0 = 0; 4436 } 4437 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4438 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4439 4440 } 4441 4442 /*---------------------------------------------------------------------------- 4443 | Returns the result of converting the 64-bit unsigned integer `a' 4444 | to the quadruple-precision floating-point format. The conversion is performed 4445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4446 *----------------------------------------------------------------------------*/ 4447 4448 float128 uint64_to_float128(uint64_t a, float_status *status) 4449 { 4450 if (a == 0) { 4451 return float128_zero; 4452 } 4453 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4454 } 4455 4456 /*---------------------------------------------------------------------------- 4457 | Returns the result of converting the single-precision floating-point value 4458 | `a' to the extended double-precision floating-point format. The conversion 4459 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4460 | Arithmetic. 
4461 *----------------------------------------------------------------------------*/ 4462 4463 floatx80 float32_to_floatx80(float32 a, float_status *status) 4464 { 4465 flag aSign; 4466 int aExp; 4467 uint32_t aSig; 4468 4469 a = float32_squash_input_denormal(a, status); 4470 aSig = extractFloat32Frac( a ); 4471 aExp = extractFloat32Exp( a ); 4472 aSign = extractFloat32Sign( a ); 4473 if ( aExp == 0xFF ) { 4474 if (aSig) { 4475 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4476 } 4477 return packFloatx80(aSign, 4478 floatx80_infinity_high, 4479 floatx80_infinity_low); 4480 } 4481 if ( aExp == 0 ) { 4482 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4483 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4484 } 4485 aSig |= 0x00800000; 4486 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4487 4488 } 4489 4490 /*---------------------------------------------------------------------------- 4491 | Returns the result of converting the single-precision floating-point value 4492 | `a' to the double-precision floating-point format. The conversion is 4493 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4494 | Arithmetic. 
4495 *----------------------------------------------------------------------------*/ 4496 4497 float128 float32_to_float128(float32 a, float_status *status) 4498 { 4499 flag aSign; 4500 int aExp; 4501 uint32_t aSig; 4502 4503 a = float32_squash_input_denormal(a, status); 4504 aSig = extractFloat32Frac( a ); 4505 aExp = extractFloat32Exp( a ); 4506 aSign = extractFloat32Sign( a ); 4507 if ( aExp == 0xFF ) { 4508 if (aSig) { 4509 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4510 } 4511 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4512 } 4513 if ( aExp == 0 ) { 4514 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4515 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4516 --aExp; 4517 } 4518 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4519 4520 } 4521 4522 /*---------------------------------------------------------------------------- 4523 | Returns the remainder of the single-precision floating-point value `a' 4524 | with respect to the corresponding value `b'. The operation is performed 4525 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* a is NaN or infinity. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor: invalid operation. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* b is NaN or infinity; for an infinite divisor a is returned as is. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Zero divisor: invalid operation. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* A zero dividend is returned unchanged. */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit of both significands explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: 32-bit significands and one 64/32 division. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* a is returned unchanged when its exponent is more than one
             * below b's. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: reduce in 64-bit arithmetic, 62 exponent steps
         * per loop iteration. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Back the estimate off by 2, clamped at 0 -- presumably so it
             * never exceeds the true quotient (confirm against
             * estimateDiv128To64). */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig until the remainder turns negative, remembering
     * the last non-negative candidate in alternateASig. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Choose the candidate closer to zero; on an exact tie keep the one that
     * goes with an even quotient. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the sign of the result relative to a. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
4642 *----------------------------------------------------------------------------*/ 4643 4644 static const float64 float32_exp2_coefficients[15] = 4645 { 4646 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4647 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4648 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4649 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4650 const_float64( 0x3f81111111111111ll ), /* 5 */ 4651 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4652 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4653 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4654 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4655 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4656 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4657 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4658 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4659 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4660 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4661 }; 4662 4663 float32 float32_exp2(float32 a, float_status *status) 4664 { 4665 flag aSign; 4666 int aExp; 4667 uint32_t aSig; 4668 float64 r, x, xn; 4669 int i; 4670 a = float32_squash_input_denormal(a, status); 4671 4672 aSig = extractFloat32Frac( a ); 4673 aExp = extractFloat32Exp( a ); 4674 aSign = extractFloat32Sign( a ); 4675 4676 if ( aExp == 0xFF) { 4677 if (aSig) { 4678 return propagateFloat32NaN(a, float32_zero, status); 4679 } 4680 return (aSign) ? 
float32_zero : a; 4681 } 4682 if (aExp == 0) { 4683 if (aSig == 0) return float32_one; 4684 } 4685 4686 float_raise(float_flag_inexact, status); 4687 4688 /* ******************************* */ 4689 /* using float64 for approximation */ 4690 /* ******************************* */ 4691 x = float32_to_float64(a, status); 4692 x = float64_mul(x, float64_ln2, status); 4693 4694 xn = x; 4695 r = float64_one; 4696 for (i = 0 ; i < 15 ; i++) { 4697 float64 f; 4698 4699 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4700 r = float64_add(r, f, status); 4701 4702 xn = float64_mul(xn, x, status); 4703 } 4704 4705 return float64_to_float32(r, status); 4706 } 4707 4708 /*---------------------------------------------------------------------------- 4709 | Returns the binary log of the single-precision floating-point value `a'. 4710 | The operation is performed according to the IEC/IEEE Standard for Binary 4711 | Floating-Point Arithmetic. 4712 *----------------------------------------------------------------------------*/ 4713 float32 float32_log2(float32 a, float_status *status) 4714 { 4715 flag aSign, zSign; 4716 int aExp; 4717 uint32_t aSig, zSig, i; 4718 4719 a = float32_squash_input_denormal(a, status); 4720 aSig = extractFloat32Frac( a ); 4721 aExp = extractFloat32Exp( a ); 4722 aSign = extractFloat32Sign( a ); 4723 4724 if ( aExp == 0 ) { 4725 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4726 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4727 } 4728 if ( aSign ) { 4729 float_raise(float_flag_invalid, status); 4730 return float32_default_nan(status); 4731 } 4732 if ( aExp == 0xFF ) { 4733 if (aSig) { 4734 return propagateFloat32NaN(a, float32_zero, status); 4735 } 4736 return a; 4737 } 4738 4739 aExp -= 0x7F; 4740 aSig |= 0x00800000; 4741 zSign = aExp < 0; 4742 zSig = aExp << 23; 4743 4744 for (i = 1 << 22; i > 0; i >>= 1) { 4745 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4746 if ( aSig & 0x01000000 ) { 4747 aSig >>= 1; 4748 zSig |= i; 4749 } 4750 } 4751 
4752 if ( zSign ) 4753 zSig = -zSig; 4754 4755 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4756 } 4757 4758 /*---------------------------------------------------------------------------- 4759 | Returns 1 if the single-precision floating-point value `a' is equal to 4760 | the corresponding value `b', and 0 otherwise. The invalid exception is 4761 | raised if either operand is a NaN. Otherwise, the comparison is performed 4762 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4763 *----------------------------------------------------------------------------*/ 4764 4765 int float32_eq(float32 a, float32 b, float_status *status) 4766 { 4767 uint32_t av, bv; 4768 a = float32_squash_input_denormal(a, status); 4769 b = float32_squash_input_denormal(b, status); 4770 4771 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4772 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4773 ) { 4774 float_raise(float_flag_invalid, status); 4775 return 0; 4776 } 4777 av = float32_val(a); 4778 bv = float32_val(b); 4779 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4780 } 4781 4782 /*---------------------------------------------------------------------------- 4783 | Returns 1 if the single-precision floating-point value `a' is less than 4784 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4785 | exception is raised if either operand is a NaN. The comparison is performed 4786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4787 *----------------------------------------------------------------------------*/ 4788 4789 int float32_le(float32 a, float32 b, float_status *status) 4790 { 4791 flag aSign, bSign; 4792 uint32_t av, bv; 4793 a = float32_squash_input_denormal(a, status); 4794 b = float32_squash_input_denormal(b, status); 4795 4796 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4797 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4798 ) { 4799 float_raise(float_flag_invalid, status); 4800 return 0; 4801 } 4802 aSign = extractFloat32Sign( a ); 4803 bSign = extractFloat32Sign( b ); 4804 av = float32_val(a); 4805 bv = float32_val(b); 4806 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4807 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4808 4809 } 4810 4811 /*---------------------------------------------------------------------------- 4812 | Returns 1 if the single-precision floating-point value `a' is less than 4813 | the corresponding value `b', and 0 otherwise. The invalid exception is 4814 | raised if either operand is a NaN. The comparison is performed according 4815 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4816 *----------------------------------------------------------------------------*/ 4817 4818 int float32_lt(float32 a, float32 b, float_status *status) 4819 { 4820 flag aSign, bSign; 4821 uint32_t av, bv; 4822 a = float32_squash_input_denormal(a, status); 4823 b = float32_squash_input_denormal(b, status); 4824 4825 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4826 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4827 ) { 4828 float_raise(float_flag_invalid, status); 4829 return 0; 4830 } 4831 aSign = extractFloat32Sign( a ); 4832 bSign = extractFloat32Sign( b ); 4833 av = float32_val(a); 4834 bv = float32_val(b); 4835 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4836 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4837 4838 } 4839 4840 /*---------------------------------------------------------------------------- 4841 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4842 | be compared, and 0 otherwise. The invalid exception is raised if either 4843 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4844 | Standard for Binary Floating-Point Arithmetic. 4845 *----------------------------------------------------------------------------*/ 4846 4847 int float32_unordered(float32 a, float32 b, float_status *status) 4848 { 4849 a = float32_squash_input_denormal(a, status); 4850 b = float32_squash_input_denormal(b, status); 4851 4852 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4853 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4854 ) { 4855 float_raise(float_flag_invalid, status); 4856 return 1; 4857 } 4858 return 0; 4859 } 4860 4861 /*---------------------------------------------------------------------------- 4862 | Returns 1 if the single-precision floating-point value `a' is equal to 4863 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4864 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4865 | for Binary Floating-Point Arithmetic. 4866 *----------------------------------------------------------------------------*/ 4867 4868 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4869 { 4870 a = float32_squash_input_denormal(a, status); 4871 b = float32_squash_input_denormal(b, status); 4872 4873 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4874 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4875 ) { 4876 if (float32_is_signaling_nan(a, status) 4877 || float32_is_signaling_nan(b, status)) { 4878 float_raise(float_flag_invalid, status); 4879 } 4880 return 0; 4881 } 4882 return ( float32_val(a) == float32_val(b) ) || 4883 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4884 } 4885 4886 /*---------------------------------------------------------------------------- 4887 | Returns 1 if the single-precision floating-point value `a' is less than or 4888 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4889 | cause an exception. Otherwise, the comparison is performed according to the 4890 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4891 *----------------------------------------------------------------------------*/ 4892 4893 int float32_le_quiet(float32 a, float32 b, float_status *status) 4894 { 4895 flag aSign, bSign; 4896 uint32_t av, bv; 4897 a = float32_squash_input_denormal(a, status); 4898 b = float32_squash_input_denormal(b, status); 4899 4900 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4901 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4902 ) { 4903 if (float32_is_signaling_nan(a, status) 4904 || float32_is_signaling_nan(b, status)) { 4905 float_raise(float_flag_invalid, status); 4906 } 4907 return 0; 4908 } 4909 aSign = extractFloat32Sign( a ); 4910 bSign = extractFloat32Sign( b ); 4911 av = float32_val(a); 4912 bv = float32_val(b); 4913 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4914 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4915 4916 } 4917 4918 /*---------------------------------------------------------------------------- 4919 | Returns 1 if the single-precision floating-point value `a' is less than 4920 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4921 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4922 | Standard for Binary Floating-Point Arithmetic. 
4923 *----------------------------------------------------------------------------*/ 4924 4925 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4926 { 4927 flag aSign, bSign; 4928 uint32_t av, bv; 4929 a = float32_squash_input_denormal(a, status); 4930 b = float32_squash_input_denormal(b, status); 4931 4932 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4933 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4934 ) { 4935 if (float32_is_signaling_nan(a, status) 4936 || float32_is_signaling_nan(b, status)) { 4937 float_raise(float_flag_invalid, status); 4938 } 4939 return 0; 4940 } 4941 aSign = extractFloat32Sign( a ); 4942 bSign = extractFloat32Sign( b ); 4943 av = float32_val(a); 4944 bv = float32_val(b); 4945 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4946 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4947 4948 } 4949 4950 /*---------------------------------------------------------------------------- 4951 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4952 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4953 | comparison is performed according to the IEC/IEEE Standard for Binary 4954 | Floating-Point Arithmetic. 
4955 *----------------------------------------------------------------------------*/ 4956 4957 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4958 { 4959 a = float32_squash_input_denormal(a, status); 4960 b = float32_squash_input_denormal(b, status); 4961 4962 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4963 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4964 ) { 4965 if (float32_is_signaling_nan(a, status) 4966 || float32_is_signaling_nan(b, status)) { 4967 float_raise(float_flag_invalid, status); 4968 } 4969 return 1; 4970 } 4971 return 0; 4972 } 4973 4974 /*---------------------------------------------------------------------------- 4975 | If `a' is denormal and we are in flush-to-zero mode then set the 4976 | input-denormal exception and return zero. Otherwise just return the value. 4977 *----------------------------------------------------------------------------*/ 4978 float16 float16_squash_input_denormal(float16 a, float_status *status) 4979 { 4980 if (status->flush_inputs_to_zero) { 4981 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4982 float_raise(float_flag_input_denormal, status); 4983 return make_float16(float16_val(a) & 0x8000); 4984 } 4985 } 4986 return a; 4987 } 4988 4989 /*---------------------------------------------------------------------------- 4990 | Returns the result of converting the double-precision floating-point value 4991 | `a' to the extended double-precision floating-point format. The conversion 4992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4993 | Arithmetic. 
4994 *----------------------------------------------------------------------------*/ 4995 4996 floatx80 float64_to_floatx80(float64 a, float_status *status) 4997 { 4998 flag aSign; 4999 int aExp; 5000 uint64_t aSig; 5001 5002 a = float64_squash_input_denormal(a, status); 5003 aSig = extractFloat64Frac( a ); 5004 aExp = extractFloat64Exp( a ); 5005 aSign = extractFloat64Sign( a ); 5006 if ( aExp == 0x7FF ) { 5007 if (aSig) { 5008 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 5009 } 5010 return packFloatx80(aSign, 5011 floatx80_infinity_high, 5012 floatx80_infinity_low); 5013 } 5014 if ( aExp == 0 ) { 5015 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5016 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5017 } 5018 return 5019 packFloatx80( 5020 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 5021 5022 } 5023 5024 /*---------------------------------------------------------------------------- 5025 | Returns the result of converting the double-precision floating-point value 5026 | `a' to the quadruple-precision floating-point format. The conversion is 5027 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5028 | Arithmetic. 
5029 *----------------------------------------------------------------------------*/ 5030 5031 float128 float64_to_float128(float64 a, float_status *status) 5032 { 5033 flag aSign; 5034 int aExp; 5035 uint64_t aSig, zSig0, zSig1; 5036 5037 a = float64_squash_input_denormal(a, status); 5038 aSig = extractFloat64Frac( a ); 5039 aExp = extractFloat64Exp( a ); 5040 aSign = extractFloat64Sign( a ); 5041 if ( aExp == 0x7FF ) { 5042 if (aSig) { 5043 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5044 } 5045 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5046 } 5047 if ( aExp == 0 ) { 5048 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5049 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5050 --aExp; 5051 } 5052 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5053 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5054 5055 } 5056 5057 5058 /*---------------------------------------------------------------------------- 5059 | Returns the remainder of the double-precision floating-point value `a' 5060 | with respect to the corresponding value `b'. The operation is performed 5061 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5062 *----------------------------------------------------------------------------*/ 5063 5064 float64 float64_rem(float64 a, float64 b, float_status *status) 5065 { 5066 flag aSign, zSign; 5067 int aExp, bExp, expDiff; 5068 uint64_t aSig, bSig; 5069 uint64_t q, alternateASig; 5070 int64_t sigMean; 5071 5072 a = float64_squash_input_denormal(a, status); 5073 b = float64_squash_input_denormal(b, status); 5074 aSig = extractFloat64Frac( a ); 5075 aExp = extractFloat64Exp( a ); 5076 aSign = extractFloat64Sign( a ); 5077 bSig = extractFloat64Frac( b ); 5078 bExp = extractFloat64Exp( b ); 5079 if ( aExp == 0x7FF ) { 5080 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5081 return propagateFloat64NaN(a, b, status); 5082 } 5083 float_raise(float_flag_invalid, status); 5084 return float64_default_nan(status); 5085 } 5086 if ( bExp == 0x7FF ) { 5087 if (bSig) { 5088 return propagateFloat64NaN(a, b, status); 5089 } 5090 return a; 5091 } 5092 if ( bExp == 0 ) { 5093 if ( bSig == 0 ) { 5094 float_raise(float_flag_invalid, status); 5095 return float64_default_nan(status); 5096 } 5097 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5098 } 5099 if ( aExp == 0 ) { 5100 if ( aSig == 0 ) return a; 5101 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5102 } 5103 expDiff = aExp - bExp; 5104 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 5105 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 5106 if ( expDiff < 0 ) { 5107 if ( expDiff < -1 ) return a; 5108 aSig >>= 1; 5109 } 5110 q = ( bSig <= aSig ); 5111 if ( q ) aSig -= bSig; 5112 expDiff -= 64; 5113 while ( 0 < expDiff ) { 5114 q = estimateDiv128To64( aSig, 0, bSig ); 5115 q = ( 2 < q ) ? q - 2 : 0; 5116 aSig = - ( ( bSig>>2 ) * q ); 5117 expDiff -= 62; 5118 } 5119 expDiff += 64; 5120 if ( 0 < expDiff ) { 5121 q = estimateDiv128To64( aSig, 0, bSig ); 5122 q = ( 2 < q ) ? 
q - 2 : 0; 5123 q >>= 64 - expDiff; 5124 bSig >>= 2; 5125 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5126 } 5127 else { 5128 aSig >>= 2; 5129 bSig >>= 2; 5130 } 5131 do { 5132 alternateASig = aSig; 5133 ++q; 5134 aSig -= bSig; 5135 } while ( 0 <= (int64_t) aSig ); 5136 sigMean = aSig + alternateASig; 5137 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5138 aSig = alternateASig; 5139 } 5140 zSign = ( (int64_t) aSig < 0 ); 5141 if ( zSign ) aSig = - aSig; 5142 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5143 5144 } 5145 5146 /*---------------------------------------------------------------------------- 5147 | Returns the binary log of the double-precision floating-point value `a'. 5148 | The operation is performed according to the IEC/IEEE Standard for Binary 5149 | Floating-Point Arithmetic. 5150 *----------------------------------------------------------------------------*/ 5151 float64 float64_log2(float64 a, float_status *status) 5152 { 5153 flag aSign, zSign; 5154 int aExp; 5155 uint64_t aSig, aSig0, aSig1, zSig, i; 5156 a = float64_squash_input_denormal(a, status); 5157 5158 aSig = extractFloat64Frac( a ); 5159 aExp = extractFloat64Exp( a ); 5160 aSign = extractFloat64Sign( a ); 5161 5162 if ( aExp == 0 ) { 5163 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5164 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5165 } 5166 if ( aSign ) { 5167 float_raise(float_flag_invalid, status); 5168 return float64_default_nan(status); 5169 } 5170 if ( aExp == 0x7FF ) { 5171 if (aSig) { 5172 return propagateFloat64NaN(a, float64_zero, status); 5173 } 5174 return a; 5175 } 5176 5177 aExp -= 0x3FF; 5178 aSig |= LIT64( 0x0010000000000000 ); 5179 zSign = aExp < 0; 5180 zSig = (uint64_t)aExp << 52; 5181 for (i = 1LL << 51; i > 0; i >>= 1) { 5182 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5183 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5184 if ( aSig & LIT64( 0x0020000000000000 ) ) { 5185 aSig >>= 1; 5186 zSig |= i; 5187 } 
5188 } 5189 5190 if ( zSign ) 5191 zSig = -zSig; 5192 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5193 } 5194 5195 /*---------------------------------------------------------------------------- 5196 | Returns 1 if the double-precision floating-point value `a' is equal to the 5197 | corresponding value `b', and 0 otherwise. The invalid exception is raised 5198 | if either operand is a NaN. Otherwise, the comparison is performed 5199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5200 *----------------------------------------------------------------------------*/ 5201 5202 int float64_eq(float64 a, float64 b, float_status *status) 5203 { 5204 uint64_t av, bv; 5205 a = float64_squash_input_denormal(a, status); 5206 b = float64_squash_input_denormal(b, status); 5207 5208 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5209 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5210 ) { 5211 float_raise(float_flag_invalid, status); 5212 return 0; 5213 } 5214 av = float64_val(a); 5215 bv = float64_val(b); 5216 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5217 5218 } 5219 5220 /*---------------------------------------------------------------------------- 5221 | Returns 1 if the double-precision floating-point value `a' is less than or 5222 | equal to the corresponding value `b', and 0 otherwise. The invalid 5223 | exception is raised if either operand is a NaN. The comparison is performed 5224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5225 *----------------------------------------------------------------------------*/ 5226 5227 int float64_le(float64 a, float64 b, float_status *status) 5228 { 5229 flag aSign, bSign; 5230 uint64_t av, bv; 5231 a = float64_squash_input_denormal(a, status); 5232 b = float64_squash_input_denormal(b, status); 5233 5234 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5235 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5236 ) { 5237 float_raise(float_flag_invalid, status); 5238 return 0; 5239 } 5240 aSign = extractFloat64Sign( a ); 5241 bSign = extractFloat64Sign( b ); 5242 av = float64_val(a); 5243 bv = float64_val(b); 5244 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5245 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5246 5247 } 5248 5249 /*---------------------------------------------------------------------------- 5250 | Returns 1 if the double-precision floating-point value `a' is less than 5251 | the corresponding value `b', and 0 otherwise. The invalid exception is 5252 | raised if either operand is a NaN. The comparison is performed according 5253 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5254 *----------------------------------------------------------------------------*/ 5255 5256 int float64_lt(float64 a, float64 b, float_status *status) 5257 { 5258 flag aSign, bSign; 5259 uint64_t av, bv; 5260 5261 a = float64_squash_input_denormal(a, status); 5262 b = float64_squash_input_denormal(b, status); 5263 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5264 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5265 ) { 5266 float_raise(float_flag_invalid, status); 5267 return 0; 5268 } 5269 aSign = extractFloat64Sign( a ); 5270 bSign = extractFloat64Sign( b ); 5271 av = float64_val(a); 5272 bv = float64_val(b); 5273 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5274 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5275 5276 } 5277 5278 /*---------------------------------------------------------------------------- 5279 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5280 | be compared, and 0 otherwise. The invalid exception is raised if either 5281 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5282 | Standard for Binary Floating-Point Arithmetic. 5283 *----------------------------------------------------------------------------*/ 5284 5285 int float64_unordered(float64 a, float64 b, float_status *status) 5286 { 5287 a = float64_squash_input_denormal(a, status); 5288 b = float64_squash_input_denormal(b, status); 5289 5290 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5291 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5292 ) { 5293 float_raise(float_flag_invalid, status); 5294 return 1; 5295 } 5296 return 0; 5297 } 5298 5299 /*---------------------------------------------------------------------------- 5300 | Returns 1 if the double-precision floating-point value `a' is equal to the 5301 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 5302 | exception.The comparison is performed according to the IEC/IEEE Standard 5303 | for Binary Floating-Point Arithmetic. 5304 *----------------------------------------------------------------------------*/ 5305 5306 int float64_eq_quiet(float64 a, float64 b, float_status *status) 5307 { 5308 uint64_t av, bv; 5309 a = float64_squash_input_denormal(a, status); 5310 b = float64_squash_input_denormal(b, status); 5311 5312 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5313 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5314 ) { 5315 if (float64_is_signaling_nan(a, status) 5316 || float64_is_signaling_nan(b, status)) { 5317 float_raise(float_flag_invalid, status); 5318 } 5319 return 0; 5320 } 5321 av = float64_val(a); 5322 bv = float64_val(b); 5323 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5324 5325 } 5326 5327 /*---------------------------------------------------------------------------- 5328 | Returns 1 if the double-precision floating-point value `a' is less than or 5329 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5330 | cause an exception. Otherwise, the comparison is performed according to the 5331 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5332 *----------------------------------------------------------------------------*/ 5333 5334 int float64_le_quiet(float64 a, float64 b, float_status *status) 5335 { 5336 flag aSign, bSign; 5337 uint64_t av, bv; 5338 a = float64_squash_input_denormal(a, status); 5339 b = float64_squash_input_denormal(b, status); 5340 5341 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5342 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5343 ) { 5344 if (float64_is_signaling_nan(a, status) 5345 || float64_is_signaling_nan(b, status)) { 5346 float_raise(float_flag_invalid, status); 5347 } 5348 return 0; 5349 } 5350 aSign = extractFloat64Sign( a ); 5351 bSign = extractFloat64Sign( b ); 5352 av = float64_val(a); 5353 bv = float64_val(b); 5354 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5355 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5356 5357 } 5358 5359 /*---------------------------------------------------------------------------- 5360 | Returns 1 if the double-precision floating-point value `a' is less than 5361 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5362 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5363 | Standard for Binary Floating-Point Arithmetic. 
5364 *----------------------------------------------------------------------------*/ 5365 5366 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5367 { 5368 flag aSign, bSign; 5369 uint64_t av, bv; 5370 a = float64_squash_input_denormal(a, status); 5371 b = float64_squash_input_denormal(b, status); 5372 5373 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5374 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5375 ) { 5376 if (float64_is_signaling_nan(a, status) 5377 || float64_is_signaling_nan(b, status)) { 5378 float_raise(float_flag_invalid, status); 5379 } 5380 return 0; 5381 } 5382 aSign = extractFloat64Sign( a ); 5383 bSign = extractFloat64Sign( b ); 5384 av = float64_val(a); 5385 bv = float64_val(b); 5386 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5387 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5388 5389 } 5390 5391 /*---------------------------------------------------------------------------- 5392 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5393 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5394 | comparison is performed according to the IEC/IEEE Standard for Binary 5395 | Floating-Point Arithmetic. 
5396 *----------------------------------------------------------------------------*/ 5397 5398 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5399 { 5400 a = float64_squash_input_denormal(a, status); 5401 b = float64_squash_input_denormal(b, status); 5402 5403 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5404 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5405 ) { 5406 if (float64_is_signaling_nan(a, status) 5407 || float64_is_signaling_nan(b, status)) { 5408 float_raise(float_flag_invalid, status); 5409 } 5410 return 1; 5411 } 5412 return 0; 5413 } 5414 5415 /*---------------------------------------------------------------------------- 5416 | Returns the result of converting the extended double-precision floating- 5417 | point value `a' to the 32-bit two's complement integer format. The 5418 | conversion is performed according to the IEC/IEEE Standard for Binary 5419 | Floating-Point Arithmetic---which means in particular that the conversion 5420 | is rounded according to the current rounding mode. If `a' is a NaN, the 5421 | largest positive integer is returned. Otherwise, if the conversion 5422 | overflows, the largest integer with the same sign as `a' is returned. 
5423 *----------------------------------------------------------------------------*/ 5424 5425 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5426 { 5427 flag aSign; 5428 int32_t aExp, shiftCount; 5429 uint64_t aSig; 5430 5431 if (floatx80_invalid_encoding(a)) { 5432 float_raise(float_flag_invalid, status); 5433 return 1 << 31; 5434 } 5435 aSig = extractFloatx80Frac( a ); 5436 aExp = extractFloatx80Exp( a ); 5437 aSign = extractFloatx80Sign( a ); 5438 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5439 shiftCount = 0x4037 - aExp; 5440 if ( shiftCount <= 0 ) shiftCount = 1; 5441 shift64RightJamming( aSig, shiftCount, &aSig ); 5442 return roundAndPackInt32(aSign, aSig, status); 5443 5444 } 5445 5446 /*---------------------------------------------------------------------------- 5447 | Returns the result of converting the extended double-precision floating- 5448 | point value `a' to the 32-bit two's complement integer format. The 5449 | conversion is performed according to the IEC/IEEE Standard for Binary 5450 | Floating-Point Arithmetic, except that the conversion is always rounded 5451 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5452 | Otherwise, if the conversion overflows, the largest integer with the same 5453 | sign as `a' is returned. 
5454 *----------------------------------------------------------------------------*/ 5455 5456 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5457 { 5458 flag aSign; 5459 int32_t aExp, shiftCount; 5460 uint64_t aSig, savedASig; 5461 int32_t z; 5462 5463 if (floatx80_invalid_encoding(a)) { 5464 float_raise(float_flag_invalid, status); 5465 return 1 << 31; 5466 } 5467 aSig = extractFloatx80Frac( a ); 5468 aExp = extractFloatx80Exp( a ); 5469 aSign = extractFloatx80Sign( a ); 5470 if ( 0x401E < aExp ) { 5471 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5472 goto invalid; 5473 } 5474 else if ( aExp < 0x3FFF ) { 5475 if (aExp || aSig) { 5476 status->float_exception_flags |= float_flag_inexact; 5477 } 5478 return 0; 5479 } 5480 shiftCount = 0x403E - aExp; 5481 savedASig = aSig; 5482 aSig >>= shiftCount; 5483 z = aSig; 5484 if ( aSign ) z = - z; 5485 if ( ( z < 0 ) ^ aSign ) { 5486 invalid: 5487 float_raise(float_flag_invalid, status); 5488 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5489 } 5490 if ( ( aSig<<shiftCount ) != savedASig ) { 5491 status->float_exception_flags |= float_flag_inexact; 5492 } 5493 return z; 5494 5495 } 5496 5497 /*---------------------------------------------------------------------------- 5498 | Returns the result of converting the extended double-precision floating- 5499 | point value `a' to the 64-bit two's complement integer format. The 5500 | conversion is performed according to the IEC/IEEE Standard for Binary 5501 | Floating-Point Arithmetic---which means in particular that the conversion 5502 | is rounded according to the current rounding mode. If `a' is a NaN, 5503 | the largest positive integer is returned. Otherwise, if the conversion 5504 | overflows, the largest integer with the same sign as `a' is returned. 
5505 *----------------------------------------------------------------------------*/ 5506 5507 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5508 { 5509 flag aSign; 5510 int32_t aExp, shiftCount; 5511 uint64_t aSig, aSigExtra; 5512 5513 if (floatx80_invalid_encoding(a)) { 5514 float_raise(float_flag_invalid, status); 5515 return 1ULL << 63; 5516 } 5517 aSig = extractFloatx80Frac( a ); 5518 aExp = extractFloatx80Exp( a ); 5519 aSign = extractFloatx80Sign( a ); 5520 shiftCount = 0x403E - aExp; 5521 if ( shiftCount <= 0 ) { 5522 if ( shiftCount ) { 5523 float_raise(float_flag_invalid, status); 5524 if (!aSign || floatx80_is_any_nan(a)) { 5525 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5526 } 5527 return (int64_t) LIT64( 0x8000000000000000 ); 5528 } 5529 aSigExtra = 0; 5530 } 5531 else { 5532 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5533 } 5534 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5535 5536 } 5537 5538 /*---------------------------------------------------------------------------- 5539 | Returns the result of converting the extended double-precision floating- 5540 | point value `a' to the 64-bit two's complement integer format. The 5541 | conversion is performed according to the IEC/IEEE Standard for Binary 5542 | Floating-Point Arithmetic, except that the conversion is always rounded 5543 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5544 | Otherwise, if the conversion overflows, the largest integer with the same 5545 | sign as `a' is returned. 
5546 *----------------------------------------------------------------------------*/ 5547 5548 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5549 { 5550 flag aSign; 5551 int32_t aExp, shiftCount; 5552 uint64_t aSig; 5553 int64_t z; 5554 5555 if (floatx80_invalid_encoding(a)) { 5556 float_raise(float_flag_invalid, status); 5557 return 1ULL << 63; 5558 } 5559 aSig = extractFloatx80Frac( a ); 5560 aExp = extractFloatx80Exp( a ); 5561 aSign = extractFloatx80Sign( a ); 5562 shiftCount = aExp - 0x403E; 5563 if ( 0 <= shiftCount ) { 5564 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5565 if ( ( a.high != 0xC03E ) || aSig ) { 5566 float_raise(float_flag_invalid, status); 5567 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5568 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5569 } 5570 } 5571 return (int64_t) LIT64( 0x8000000000000000 ); 5572 } 5573 else if ( aExp < 0x3FFF ) { 5574 if (aExp | aSig) { 5575 status->float_exception_flags |= float_flag_inexact; 5576 } 5577 return 0; 5578 } 5579 z = aSig>>( - shiftCount ); 5580 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5581 status->float_exception_flags |= float_flag_inexact; 5582 } 5583 if ( aSign ) z = - z; 5584 return z; 5585 5586 } 5587 5588 /*---------------------------------------------------------------------------- 5589 | Returns the result of converting the extended double-precision floating- 5590 | point value `a' to the single-precision floating-point format. The 5591 | conversion is performed according to the IEC/IEEE Standard for Binary 5592 | Floating-Point Arithmetic. 
5593 *----------------------------------------------------------------------------*/ 5594 5595 float32 floatx80_to_float32(floatx80 a, float_status *status) 5596 { 5597 flag aSign; 5598 int32_t aExp; 5599 uint64_t aSig; 5600 5601 if (floatx80_invalid_encoding(a)) { 5602 float_raise(float_flag_invalid, status); 5603 return float32_default_nan(status); 5604 } 5605 aSig = extractFloatx80Frac( a ); 5606 aExp = extractFloatx80Exp( a ); 5607 aSign = extractFloatx80Sign( a ); 5608 if ( aExp == 0x7FFF ) { 5609 if ( (uint64_t) ( aSig<<1 ) ) { 5610 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5611 } 5612 return packFloat32( aSign, 0xFF, 0 ); 5613 } 5614 shift64RightJamming( aSig, 33, &aSig ); 5615 if ( aExp || aSig ) aExp -= 0x3F81; 5616 return roundAndPackFloat32(aSign, aExp, aSig, status); 5617 5618 } 5619 5620 /*---------------------------------------------------------------------------- 5621 | Returns the result of converting the extended double-precision floating- 5622 | point value `a' to the double-precision floating-point format. The 5623 | conversion is performed according to the IEC/IEEE Standard for Binary 5624 | Floating-Point Arithmetic. 
5625 *----------------------------------------------------------------------------*/ 5626 5627 float64 floatx80_to_float64(floatx80 a, float_status *status) 5628 { 5629 flag aSign; 5630 int32_t aExp; 5631 uint64_t aSig, zSig; 5632 5633 if (floatx80_invalid_encoding(a)) { 5634 float_raise(float_flag_invalid, status); 5635 return float64_default_nan(status); 5636 } 5637 aSig = extractFloatx80Frac( a ); 5638 aExp = extractFloatx80Exp( a ); 5639 aSign = extractFloatx80Sign( a ); 5640 if ( aExp == 0x7FFF ) { 5641 if ( (uint64_t) ( aSig<<1 ) ) { 5642 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5643 } 5644 return packFloat64( aSign, 0x7FF, 0 ); 5645 } 5646 shift64RightJamming( aSig, 1, &zSig ); 5647 if ( aExp || aSig ) aExp -= 0x3C01; 5648 return roundAndPackFloat64(aSign, aExp, zSig, status); 5649 5650 } 5651 5652 /*---------------------------------------------------------------------------- 5653 | Returns the result of converting the extended double-precision floating- 5654 | point value `a' to the quadruple-precision floating-point format. The 5655 | conversion is performed according to the IEC/IEEE Standard for Binary 5656 | Floating-Point Arithmetic. 
5657 *----------------------------------------------------------------------------*/ 5658 5659 float128 floatx80_to_float128(floatx80 a, float_status *status) 5660 { 5661 flag aSign; 5662 int aExp; 5663 uint64_t aSig, zSig0, zSig1; 5664 5665 if (floatx80_invalid_encoding(a)) { 5666 float_raise(float_flag_invalid, status); 5667 return float128_default_nan(status); 5668 } 5669 aSig = extractFloatx80Frac( a ); 5670 aExp = extractFloatx80Exp( a ); 5671 aSign = extractFloatx80Sign( a ); 5672 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5673 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5674 } 5675 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5676 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5677 5678 } 5679 5680 /*---------------------------------------------------------------------------- 5681 | Rounds the extended double-precision floating-point value `a' 5682 | to the precision provided by floatx80_rounding_precision and returns the 5683 | result as an extended double-precision floating-point value. 5684 | The operation is performed according to the IEC/IEEE Standard for Binary 5685 | Floating-Point Arithmetic. 5686 *----------------------------------------------------------------------------*/ 5687 5688 floatx80 floatx80_round(floatx80 a, float_status *status) 5689 { 5690 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5691 extractFloatx80Sign(a), 5692 extractFloatx80Exp(a), 5693 extractFloatx80Frac(a), 0, status); 5694 } 5695 5696 /*---------------------------------------------------------------------------- 5697 | Rounds the extended double-precision floating-point value `a' to an integer, 5698 | and returns the result as an extended quadruple-precision floating-point 5699 | value. The operation is performed according to the IEC/IEEE Standard for 5700 | Binary Floating-Point Arithmetic. 
5701 *----------------------------------------------------------------------------*/ 5702 5703 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5704 { 5705 flag aSign; 5706 int32_t aExp; 5707 uint64_t lastBitMask, roundBitsMask; 5708 floatx80 z; 5709 5710 if (floatx80_invalid_encoding(a)) { 5711 float_raise(float_flag_invalid, status); 5712 return floatx80_default_nan(status); 5713 } 5714 aExp = extractFloatx80Exp( a ); 5715 if ( 0x403E <= aExp ) { 5716 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5717 return propagateFloatx80NaN(a, a, status); 5718 } 5719 return a; 5720 } 5721 if ( aExp < 0x3FFF ) { 5722 if ( ( aExp == 0 ) 5723 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5724 return a; 5725 } 5726 status->float_exception_flags |= float_flag_inexact; 5727 aSign = extractFloatx80Sign( a ); 5728 switch (status->float_rounding_mode) { 5729 case float_round_nearest_even: 5730 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5731 ) { 5732 return 5733 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5734 } 5735 break; 5736 case float_round_ties_away: 5737 if (aExp == 0x3FFE) { 5738 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5739 } 5740 break; 5741 case float_round_down: 5742 return 5743 aSign ? 5744 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5745 : packFloatx80( 0, 0, 0 ); 5746 case float_round_up: 5747 return 5748 aSign ? 
packFloatx80( 1, 0, 0 ) 5749 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5750 } 5751 return packFloatx80( aSign, 0, 0 ); 5752 } 5753 lastBitMask = 1; 5754 lastBitMask <<= 0x403E - aExp; 5755 roundBitsMask = lastBitMask - 1; 5756 z = a; 5757 switch (status->float_rounding_mode) { 5758 case float_round_nearest_even: 5759 z.low += lastBitMask>>1; 5760 if ((z.low & roundBitsMask) == 0) { 5761 z.low &= ~lastBitMask; 5762 } 5763 break; 5764 case float_round_ties_away: 5765 z.low += lastBitMask >> 1; 5766 break; 5767 case float_round_to_zero: 5768 break; 5769 case float_round_up: 5770 if (!extractFloatx80Sign(z)) { 5771 z.low += roundBitsMask; 5772 } 5773 break; 5774 case float_round_down: 5775 if (extractFloatx80Sign(z)) { 5776 z.low += roundBitsMask; 5777 } 5778 break; 5779 default: 5780 abort(); 5781 } 5782 z.low &= ~ roundBitsMask; 5783 if ( z.low == 0 ) { 5784 ++z.high; 5785 z.low = LIT64( 0x8000000000000000 ); 5786 } 5787 if (z.low != a.low) { 5788 status->float_exception_flags |= float_flag_inexact; 5789 } 5790 return z; 5791 5792 } 5793 5794 /*---------------------------------------------------------------------------- 5795 | Returns the result of adding the absolute values of the extended double- 5796 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5797 | negated before being returned. `zSign' is ignored if the result is a NaN. 5798 | The addition is performed according to the IEC/IEEE Standard for Binary 5799 | Floating-Point Arithmetic. 
5800 *----------------------------------------------------------------------------*/ 5801 5802 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5803 float_status *status) 5804 { 5805 int32_t aExp, bExp, zExp; 5806 uint64_t aSig, bSig, zSig0, zSig1; 5807 int32_t expDiff; 5808 5809 aSig = extractFloatx80Frac( a ); 5810 aExp = extractFloatx80Exp( a ); 5811 bSig = extractFloatx80Frac( b ); 5812 bExp = extractFloatx80Exp( b ); 5813 expDiff = aExp - bExp; 5814 if ( 0 < expDiff ) { 5815 if ( aExp == 0x7FFF ) { 5816 if ((uint64_t)(aSig << 1)) { 5817 return propagateFloatx80NaN(a, b, status); 5818 } 5819 return a; 5820 } 5821 if ( bExp == 0 ) --expDiff; 5822 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5823 zExp = aExp; 5824 } 5825 else if ( expDiff < 0 ) { 5826 if ( bExp == 0x7FFF ) { 5827 if ((uint64_t)(bSig << 1)) { 5828 return propagateFloatx80NaN(a, b, status); 5829 } 5830 return packFloatx80(zSign, 5831 floatx80_infinity_high, 5832 floatx80_infinity_low); 5833 } 5834 if ( aExp == 0 ) ++expDiff; 5835 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5836 zExp = bExp; 5837 } 5838 else { 5839 if ( aExp == 0x7FFF ) { 5840 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5841 return propagateFloatx80NaN(a, b, status); 5842 } 5843 return a; 5844 } 5845 zSig1 = 0; 5846 zSig0 = aSig + bSig; 5847 if ( aExp == 0 ) { 5848 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5849 goto roundAndPack; 5850 } 5851 zExp = aExp; 5852 goto shiftRight1; 5853 } 5854 zSig0 = aSig + bSig; 5855 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5856 shiftRight1: 5857 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5858 zSig0 |= LIT64( 0x8000000000000000 ); 5859 ++zExp; 5860 roundAndPack: 5861 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5862 zSign, zExp, zSig0, zSig1, status); 5863 } 5864 5865 /*---------------------------------------------------------------------------- 5866 | Returns the result of subtracting the absolute 
values of the extended 5867 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5868 | difference is negated before being returned. `zSign' is ignored if the 5869 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5870 | Standard for Binary Floating-Point Arithmetic. 5871 *----------------------------------------------------------------------------*/ 5872 5873 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5874 float_status *status) 5875 { 5876 int32_t aExp, bExp, zExp; 5877 uint64_t aSig, bSig, zSig0, zSig1; 5878 int32_t expDiff; 5879 5880 aSig = extractFloatx80Frac( a ); 5881 aExp = extractFloatx80Exp( a ); 5882 bSig = extractFloatx80Frac( b ); 5883 bExp = extractFloatx80Exp( b ); 5884 expDiff = aExp - bExp; 5885 if ( 0 < expDiff ) goto aExpBigger; 5886 if ( expDiff < 0 ) goto bExpBigger; 5887 if ( aExp == 0x7FFF ) { 5888 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5889 return propagateFloatx80NaN(a, b, status); 5890 } 5891 float_raise(float_flag_invalid, status); 5892 return floatx80_default_nan(status); 5893 } 5894 if ( aExp == 0 ) { 5895 aExp = 1; 5896 bExp = 1; 5897 } 5898 zSig1 = 0; 5899 if ( bSig < aSig ) goto aBigger; 5900 if ( aSig < bSig ) goto bBigger; 5901 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5902 bExpBigger: 5903 if ( bExp == 0x7FFF ) { 5904 if ((uint64_t)(bSig << 1)) { 5905 return propagateFloatx80NaN(a, b, status); 5906 } 5907 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5908 floatx80_infinity_low); 5909 } 5910 if ( aExp == 0 ) ++expDiff; 5911 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5912 bBigger: 5913 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5914 zExp = bExp; 5915 zSign ^= 1; 5916 goto normalizeRoundAndPack; 5917 aExpBigger: 5918 if ( aExp == 0x7FFF ) { 5919 if ((uint64_t)(aSig << 1)) { 5920 return propagateFloatx80NaN(a, b, status); 5921 } 5922 return a; 5923 } 5924 if ( bExp == 0 ) --expDiff; 5925 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5926 aBigger: 5927 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5928 zExp = aExp; 5929 normalizeRoundAndPack: 5930 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5931 zSign, zExp, zSig0, zSig1, status); 5932 } 5933 5934 /*---------------------------------------------------------------------------- 5935 | Returns the result of adding the extended double-precision floating-point 5936 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5937 | Standard for Binary Floating-Point Arithmetic. 5938 *----------------------------------------------------------------------------*/ 5939 5940 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5941 { 5942 flag aSign, bSign; 5943 5944 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5945 float_raise(float_flag_invalid, status); 5946 return floatx80_default_nan(status); 5947 } 5948 aSign = extractFloatx80Sign( a ); 5949 bSign = extractFloatx80Sign( b ); 5950 if ( aSign == bSign ) { 5951 return addFloatx80Sigs(a, b, aSign, status); 5952 } 5953 else { 5954 return subFloatx80Sigs(a, b, aSign, status); 5955 } 5956 5957 } 5958 5959 /*---------------------------------------------------------------------------- 5960 | Returns the result of subtracting the extended double-precision floating- 5961 | point values `a' and `b'. The operation is performed according to the 5962 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5963 *----------------------------------------------------------------------------*/ 5964 5965 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5966 { 5967 flag aSign, bSign; 5968 5969 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5970 float_raise(float_flag_invalid, status); 5971 return floatx80_default_nan(status); 5972 } 5973 aSign = extractFloatx80Sign( a ); 5974 bSign = extractFloatx80Sign( b ); 5975 if ( aSign == bSign ) { 5976 return subFloatx80Sigs(a, b, aSign, status); 5977 } 5978 else { 5979 return addFloatx80Sigs(a, b, aSign, status); 5980 } 5981 5982 } 5983 5984 /*---------------------------------------------------------------------------- 5985 | Returns the result of multiplying the extended double-precision floating- 5986 | point values `a' and `b'. The operation is performed according to the 5987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5988 *----------------------------------------------------------------------------*/ 5989 5990 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5991 { 5992 flag aSign, bSign, zSign; 5993 int32_t aExp, bExp, zExp; 5994 uint64_t aSig, bSig, zSig0, zSig1; 5995 5996 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5997 float_raise(float_flag_invalid, status); 5998 return floatx80_default_nan(status); 5999 } 6000 aSig = extractFloatx80Frac( a ); 6001 aExp = extractFloatx80Exp( a ); 6002 aSign = extractFloatx80Sign( a ); 6003 bSig = extractFloatx80Frac( b ); 6004 bExp = extractFloatx80Exp( b ); 6005 bSign = extractFloatx80Sign( b ); 6006 zSign = aSign ^ bSign; 6007 if ( aExp == 0x7FFF ) { 6008 if ( (uint64_t) ( aSig<<1 ) 6009 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6010 return propagateFloatx80NaN(a, b, status); 6011 } 6012 if ( ( bExp | bSig ) == 0 ) goto invalid; 6013 return packFloatx80(zSign, floatx80_infinity_high, 6014 floatx80_infinity_low); 6015 } 6016 if ( bExp == 0x7FFF ) { 6017 if ((uint64_t)(bSig << 
1)) { 6018 return propagateFloatx80NaN(a, b, status); 6019 } 6020 if ( ( aExp | aSig ) == 0 ) { 6021 invalid: 6022 float_raise(float_flag_invalid, status); 6023 return floatx80_default_nan(status); 6024 } 6025 return packFloatx80(zSign, floatx80_infinity_high, 6026 floatx80_infinity_low); 6027 } 6028 if ( aExp == 0 ) { 6029 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6030 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6031 } 6032 if ( bExp == 0 ) { 6033 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6034 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6035 } 6036 zExp = aExp + bExp - 0x3FFE; 6037 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6038 if ( 0 < (int64_t) zSig0 ) { 6039 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6040 --zExp; 6041 } 6042 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6043 zSign, zExp, zSig0, zSig1, status); 6044 } 6045 6046 /*---------------------------------------------------------------------------- 6047 | Returns the result of dividing the extended double-precision floating-point 6048 | value `a' by the corresponding value `b'. The operation is performed 6049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6050 *----------------------------------------------------------------------------*/ 6051 6052 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6053 { 6054 flag aSign, bSign, zSign; 6055 int32_t aExp, bExp, zExp; 6056 uint64_t aSig, bSig, zSig0, zSig1; 6057 uint64_t rem0, rem1, rem2, term0, term1, term2; 6058 6059 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6060 float_raise(float_flag_invalid, status); 6061 return floatx80_default_nan(status); 6062 } 6063 aSig = extractFloatx80Frac( a ); 6064 aExp = extractFloatx80Exp( a ); 6065 aSign = extractFloatx80Sign( a ); 6066 bSig = extractFloatx80Frac( b ); 6067 bExp = extractFloatx80Exp( b ); 6068 bSign = extractFloatx80Sign( b ); 6069 zSign = aSign ^ bSign; 6070 if ( aExp == 0x7FFF ) { 6071 if ((uint64_t)(aSig << 1)) { 6072 return propagateFloatx80NaN(a, b, status); 6073 } 6074 if ( bExp == 0x7FFF ) { 6075 if ((uint64_t)(bSig << 1)) { 6076 return propagateFloatx80NaN(a, b, status); 6077 } 6078 goto invalid; 6079 } 6080 return packFloatx80(zSign, floatx80_infinity_high, 6081 floatx80_infinity_low); 6082 } 6083 if ( bExp == 0x7FFF ) { 6084 if ((uint64_t)(bSig << 1)) { 6085 return propagateFloatx80NaN(a, b, status); 6086 } 6087 return packFloatx80( zSign, 0, 0 ); 6088 } 6089 if ( bExp == 0 ) { 6090 if ( bSig == 0 ) { 6091 if ( ( aExp | aSig ) == 0 ) { 6092 invalid: 6093 float_raise(float_flag_invalid, status); 6094 return floatx80_default_nan(status); 6095 } 6096 float_raise(float_flag_divbyzero, status); 6097 return packFloatx80(zSign, floatx80_infinity_high, 6098 floatx80_infinity_low); 6099 } 6100 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6101 } 6102 if ( aExp == 0 ) { 6103 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6104 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6105 } 6106 zExp = aExp - bExp + 0x3FFE; 6107 rem1 = 0; 6108 if ( bSig <= aSig ) { 6109 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6110 ++zExp; 6111 } 6112 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 6113 mul64To128( bSig, zSig0, &term0, &term1 ); 6114 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6115 while ( (int64_t) rem0 < 0 ) { 6116 --zSig0; 6117 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6118 } 6119 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6120 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6121 mul64To128( bSig, zSig1, &term1, &term2 ); 6122 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6123 while ( (int64_t) rem1 < 0 ) { 6124 --zSig1; 6125 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6126 } 6127 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6128 } 6129 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6130 zSign, zExp, zSig0, zSig1, status); 6131 } 6132 6133 /*---------------------------------------------------------------------------- 6134 | Returns the remainder of the extended double-precision floating-point value 6135 | `a' with respect to the corresponding value `b'. The operation is performed 6136 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6137 *----------------------------------------------------------------------------*/ 6138 6139 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6140 { 6141 flag aSign, zSign; 6142 int32_t aExp, bExp, expDiff; 6143 uint64_t aSig0, aSig1, bSig; 6144 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6145 6146 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6147 float_raise(float_flag_invalid, status); 6148 return floatx80_default_nan(status); 6149 } 6150 aSig0 = extractFloatx80Frac( a ); 6151 aExp = extractFloatx80Exp( a ); 6152 aSign = extractFloatx80Sign( a ); 6153 bSig = extractFloatx80Frac( b ); 6154 bExp = extractFloatx80Exp( b ); 6155 if ( aExp == 0x7FFF ) { 6156 if ( (uint64_t) ( aSig0<<1 ) 6157 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6158 return propagateFloatx80NaN(a, b, status); 6159 } 6160 goto invalid; 6161 } 6162 if ( bExp == 0x7FFF ) { 6163 if ((uint64_t)(bSig << 1)) { 6164 return propagateFloatx80NaN(a, b, status); 6165 } 6166 return a; 6167 } 6168 if ( bExp == 0 ) { 6169 if ( bSig == 0 ) { 6170 invalid: 6171 float_raise(float_flag_invalid, status); 6172 return floatx80_default_nan(status); 6173 } 6174 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6175 } 6176 if ( aExp == 0 ) { 6177 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 6178 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6179 } 6180 bSig |= LIT64( 0x8000000000000000 ); 6181 zSign = aSign; 6182 expDiff = aExp - bExp; 6183 aSig1 = 0; 6184 if ( expDiff < 0 ) { 6185 if ( expDiff < -1 ) return a; 6186 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6187 expDiff = 0; 6188 } 6189 q = ( bSig <= aSig0 ); 6190 if ( q ) aSig0 -= bSig; 6191 expDiff -= 64; 6192 while ( 0 < expDiff ) { 6193 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6194 q = ( 2 < q ) ? 
q - 2 : 0; 6195 mul64To128( bSig, q, &term0, &term1 ); 6196 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6197 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6198 expDiff -= 62; 6199 } 6200 expDiff += 64; 6201 if ( 0 < expDiff ) { 6202 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6203 q = ( 2 < q ) ? q - 2 : 0; 6204 q >>= 64 - expDiff; 6205 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6206 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6207 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6208 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6209 ++q; 6210 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6211 } 6212 } 6213 else { 6214 term1 = 0; 6215 term0 = bSig; 6216 } 6217 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6218 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6219 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6220 && ( q & 1 ) ) 6221 ) { 6222 aSig0 = alternateASig0; 6223 aSig1 = alternateASig1; 6224 zSign = ! zSign; 6225 } 6226 return 6227 normalizeRoundAndPackFloatx80( 6228 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6229 6230 } 6231 6232 /*---------------------------------------------------------------------------- 6233 | Returns the square root of the extended double-precision floating-point 6234 | value `a'. The operation is performed according to the IEC/IEEE Standard 6235 | for Binary Floating-Point Arithmetic. 
6236 *----------------------------------------------------------------------------*/ 6237 6238 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6239 { 6240 flag aSign; 6241 int32_t aExp, zExp; 6242 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6243 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6244 6245 if (floatx80_invalid_encoding(a)) { 6246 float_raise(float_flag_invalid, status); 6247 return floatx80_default_nan(status); 6248 } 6249 aSig0 = extractFloatx80Frac( a ); 6250 aExp = extractFloatx80Exp( a ); 6251 aSign = extractFloatx80Sign( a ); 6252 if ( aExp == 0x7FFF ) { 6253 if ((uint64_t)(aSig0 << 1)) { 6254 return propagateFloatx80NaN(a, a, status); 6255 } 6256 if ( ! aSign ) return a; 6257 goto invalid; 6258 } 6259 if ( aSign ) { 6260 if ( ( aExp | aSig0 ) == 0 ) return a; 6261 invalid: 6262 float_raise(float_flag_invalid, status); 6263 return floatx80_default_nan(status); 6264 } 6265 if ( aExp == 0 ) { 6266 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6267 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6268 } 6269 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6270 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6271 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6272 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6273 doubleZSig0 = zSig0<<1; 6274 mul64To128( zSig0, zSig0, &term0, &term1 ); 6275 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6276 while ( (int64_t) rem0 < 0 ) { 6277 --zSig0; 6278 doubleZSig0 -= 2; 6279 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6280 } 6281 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6282 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 6283 if ( zSig1 == 0 ) zSig1 = 1; 6284 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6285 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6286 mul64To128( zSig1, zSig1, &term2, &term3 ); 6287 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6288 while ( (int64_t) rem1 < 0 ) { 
6289 --zSig1; 6290 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6291 term3 |= 1; 6292 term2 |= doubleZSig0; 6293 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6294 } 6295 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6296 } 6297 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6298 zSig0 |= doubleZSig0; 6299 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6300 0, zExp, zSig0, zSig1, status); 6301 } 6302 6303 /*---------------------------------------------------------------------------- 6304 | Returns 1 if the extended double-precision floating-point value `a' is equal 6305 | to the corresponding value `b', and 0 otherwise. The invalid exception is 6306 | raised if either operand is a NaN. Otherwise, the comparison is performed 6307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6308 *----------------------------------------------------------------------------*/ 6309 6310 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6311 { 6312 6313 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6314 || (extractFloatx80Exp(a) == 0x7FFF 6315 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6316 || (extractFloatx80Exp(b) == 0x7FFF 6317 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6318 ) { 6319 float_raise(float_flag_invalid, status); 6320 return 0; 6321 } 6322 return 6323 ( a.low == b.low ) 6324 && ( ( a.high == b.high ) 6325 || ( ( a.low == 0 ) 6326 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6327 ); 6328 6329 } 6330 6331 /*---------------------------------------------------------------------------- 6332 | Returns 1 if the extended double-precision floating-point value `a' is 6333 | less than or equal to the corresponding value `b', and 0 otherwise. The 6334 | invalid exception is raised if either operand is a NaN. The comparison is 6335 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6336 | Arithmetic. 
6337 *----------------------------------------------------------------------------*/ 6338 6339 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6340 { 6341 flag aSign, bSign; 6342 6343 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6344 || (extractFloatx80Exp(a) == 0x7FFF 6345 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6346 || (extractFloatx80Exp(b) == 0x7FFF 6347 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6348 ) { 6349 float_raise(float_flag_invalid, status); 6350 return 0; 6351 } 6352 aSign = extractFloatx80Sign( a ); 6353 bSign = extractFloatx80Sign( b ); 6354 if ( aSign != bSign ) { 6355 return 6356 aSign 6357 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6358 == 0 ); 6359 } 6360 return 6361 aSign ? le128( b.high, b.low, a.high, a.low ) 6362 : le128( a.high, a.low, b.high, b.low ); 6363 6364 } 6365 6366 /*---------------------------------------------------------------------------- 6367 | Returns 1 if the extended double-precision floating-point value `a' is 6368 | less than the corresponding value `b', and 0 otherwise. The invalid 6369 | exception is raised if either operand is a NaN. The comparison is performed 6370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6371 *----------------------------------------------------------------------------*/ 6372 6373 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6374 { 6375 flag aSign, bSign; 6376 6377 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6378 || (extractFloatx80Exp(a) == 0x7FFF 6379 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6380 || (extractFloatx80Exp(b) == 0x7FFF 6381 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6382 ) { 6383 float_raise(float_flag_invalid, status); 6384 return 0; 6385 } 6386 aSign = extractFloatx80Sign( a ); 6387 bSign = extractFloatx80Sign( b ); 6388 if ( aSign != bSign ) { 6389 return 6390 aSign 6391 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6392 != 0 ); 6393 } 6394 return 6395 aSign ? lt128( b.high, b.low, a.high, a.low ) 6396 : lt128( a.high, a.low, b.high, b.low ); 6397 6398 } 6399 6400 /*---------------------------------------------------------------------------- 6401 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6402 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6403 | either operand is a NaN. The comparison is performed according to the 6404 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6405 *----------------------------------------------------------------------------*/ 6406 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6407 { 6408 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6409 || (extractFloatx80Exp(a) == 0x7FFF 6410 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6411 || (extractFloatx80Exp(b) == 0x7FFF 6412 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6413 ) { 6414 float_raise(float_flag_invalid, status); 6415 return 1; 6416 } 6417 return 0; 6418 } 6419 6420 /*---------------------------------------------------------------------------- 6421 | Returns 1 if the extended double-precision floating-point value `a' is 6422 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6423 | cause an exception. The comparison is performed according to the IEC/IEEE 6424 | Standard for Binary Floating-Point Arithmetic. 6425 *----------------------------------------------------------------------------*/ 6426 6427 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6428 { 6429 6430 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6431 float_raise(float_flag_invalid, status); 6432 return 0; 6433 } 6434 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6435 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6436 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6437 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6438 ) { 6439 if (floatx80_is_signaling_nan(a, status) 6440 || floatx80_is_signaling_nan(b, status)) { 6441 float_raise(float_flag_invalid, status); 6442 } 6443 return 0; 6444 } 6445 return 6446 ( a.low == b.low ) 6447 && ( ( a.high == b.high ) 6448 || ( ( a.low == 0 ) 6449 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6450 ); 6451 6452 } 6453 6454 /*---------------------------------------------------------------------------- 6455 | Returns 1 if the extended double-precision floating-point value `a' is less 6456 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (((extractFloatx80Exp(a) == 0x7FFF)
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || ((extractFloatx80Exp(b) == 0x7FFF)
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* At least one NaN: only a signaling NaN raises invalid. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }
    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        if (aSign) {
            /* negative <= non-negative */
            return 1;
        }
        /* non-negative <= negative only when both are zeros */
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }
    /* Same sign: compare the encodings, reversed for negative values. */
    if (aSign) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6499 *----------------------------------------------------------------------------*/ 6500 6501 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6502 { 6503 flag aSign, bSign; 6504 6505 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6506 float_raise(float_flag_invalid, status); 6507 return 0; 6508 } 6509 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6510 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6511 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6512 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6513 ) { 6514 if (floatx80_is_signaling_nan(a, status) 6515 || floatx80_is_signaling_nan(b, status)) { 6516 float_raise(float_flag_invalid, status); 6517 } 6518 return 0; 6519 } 6520 aSign = extractFloatx80Sign( a ); 6521 bSign = extractFloatx80Sign( b ); 6522 if ( aSign != bSign ) { 6523 return 6524 aSign 6525 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6526 != 0 ); 6527 } 6528 return 6529 aSign ? lt128( b.high, b.low, a.high, a.low ) 6530 : lt128( a.high, a.low, b.high, b.low ); 6531 6532 } 6533 6534 /*---------------------------------------------------------------------------- 6535 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6536 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6537 | The comparison is performed according to the IEC/IEEE Standard for Binary 6538 | Floating-Point Arithmetic. 
6539 *----------------------------------------------------------------------------*/ 6540 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6541 { 6542 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6543 float_raise(float_flag_invalid, status); 6544 return 1; 6545 } 6546 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6547 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6548 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6549 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6550 ) { 6551 if (floatx80_is_signaling_nan(a, status) 6552 || floatx80_is_signaling_nan(b, status)) { 6553 float_raise(float_flag_invalid, status); 6554 } 6555 return 1; 6556 } 6557 return 0; 6558 } 6559 6560 /*---------------------------------------------------------------------------- 6561 | Returns the result of converting the quadruple-precision floating-point 6562 | value `a' to the 32-bit two's complement integer format. The conversion 6563 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6564 | Arithmetic---which means in particular that the conversion is rounded 6565 | according to the current rounding mode. If `a' is a NaN, the largest 6566 | positive integer is returned. Otherwise, if the conversion overflows, the 6567 | largest integer with the same sign as `a' is returned. 
6568 *----------------------------------------------------------------------------*/ 6569 6570 int32_t float128_to_int32(float128 a, float_status *status) 6571 { 6572 flag aSign; 6573 int32_t aExp, shiftCount; 6574 uint64_t aSig0, aSig1; 6575 6576 aSig1 = extractFloat128Frac1( a ); 6577 aSig0 = extractFloat128Frac0( a ); 6578 aExp = extractFloat128Exp( a ); 6579 aSign = extractFloat128Sign( a ); 6580 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6581 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6582 aSig0 |= ( aSig1 != 0 ); 6583 shiftCount = 0x4028 - aExp; 6584 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6585 return roundAndPackInt32(aSign, aSig0, status); 6586 6587 } 6588 6589 /*---------------------------------------------------------------------------- 6590 | Returns the result of converting the quadruple-precision floating-point 6591 | value `a' to the 32-bit two's complement integer format. The conversion 6592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6593 | Arithmetic, except that the conversion is always rounded toward zero. If 6594 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6595 | conversion overflows, the largest integer with the same sign as `a' is 6596 | returned. 
6597 *----------------------------------------------------------------------------*/ 6598 6599 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6600 { 6601 flag aSign; 6602 int32_t aExp, shiftCount; 6603 uint64_t aSig0, aSig1, savedASig; 6604 int32_t z; 6605 6606 aSig1 = extractFloat128Frac1( a ); 6607 aSig0 = extractFloat128Frac0( a ); 6608 aExp = extractFloat128Exp( a ); 6609 aSign = extractFloat128Sign( a ); 6610 aSig0 |= ( aSig1 != 0 ); 6611 if ( 0x401E < aExp ) { 6612 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6613 goto invalid; 6614 } 6615 else if ( aExp < 0x3FFF ) { 6616 if (aExp || aSig0) { 6617 status->float_exception_flags |= float_flag_inexact; 6618 } 6619 return 0; 6620 } 6621 aSig0 |= LIT64( 0x0001000000000000 ); 6622 shiftCount = 0x402F - aExp; 6623 savedASig = aSig0; 6624 aSig0 >>= shiftCount; 6625 z = aSig0; 6626 if ( aSign ) z = - z; 6627 if ( ( z < 0 ) ^ aSign ) { 6628 invalid: 6629 float_raise(float_flag_invalid, status); 6630 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6631 } 6632 if ( ( aSig0<<shiftCount ) != savedASig ) { 6633 status->float_exception_flags |= float_flag_inexact; 6634 } 6635 return z; 6636 6637 } 6638 6639 /*---------------------------------------------------------------------------- 6640 | Returns the result of converting the quadruple-precision floating-point 6641 | value `a' to the 64-bit two's complement integer format. The conversion 6642 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6643 | Arithmetic---which means in particular that the conversion is rounded 6644 | according to the current rounding mode. If `a' is a NaN, the largest 6645 | positive integer is returned. Otherwise, if the conversion overflows, the 6646 | largest integer with the same sign as `a' is returned. 
6647 *----------------------------------------------------------------------------*/ 6648 6649 int64_t float128_to_int64(float128 a, float_status *status) 6650 { 6651 flag aSign; 6652 int32_t aExp, shiftCount; 6653 uint64_t aSig0, aSig1; 6654 6655 aSig1 = extractFloat128Frac1( a ); 6656 aSig0 = extractFloat128Frac0( a ); 6657 aExp = extractFloat128Exp( a ); 6658 aSign = extractFloat128Sign( a ); 6659 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6660 shiftCount = 0x402F - aExp; 6661 if ( shiftCount <= 0 ) { 6662 if ( 0x403E < aExp ) { 6663 float_raise(float_flag_invalid, status); 6664 if ( ! aSign 6665 || ( ( aExp == 0x7FFF ) 6666 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6667 ) 6668 ) { 6669 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6670 } 6671 return (int64_t) LIT64( 0x8000000000000000 ); 6672 } 6673 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6674 } 6675 else { 6676 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6677 } 6678 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6679 6680 } 6681 6682 /*---------------------------------------------------------------------------- 6683 | Returns the result of converting the quadruple-precision floating-point 6684 | value `a' to the 64-bit two's complement integer format. The conversion 6685 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6686 | Arithmetic, except that the conversion is always rounded toward zero. 6687 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6688 | the conversion overflows, the largest integer with the same sign as `a' is 6689 | returned. 
6690 *----------------------------------------------------------------------------*/ 6691 6692 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6693 { 6694 flag aSign; 6695 int32_t aExp, shiftCount; 6696 uint64_t aSig0, aSig1; 6697 int64_t z; 6698 6699 aSig1 = extractFloat128Frac1( a ); 6700 aSig0 = extractFloat128Frac0( a ); 6701 aExp = extractFloat128Exp( a ); 6702 aSign = extractFloat128Sign( a ); 6703 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6704 shiftCount = aExp - 0x402F; 6705 if ( 0 < shiftCount ) { 6706 if ( 0x403E <= aExp ) { 6707 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6708 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6709 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6710 if (aSig1) { 6711 status->float_exception_flags |= float_flag_inexact; 6712 } 6713 } 6714 else { 6715 float_raise(float_flag_invalid, status); 6716 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6717 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6718 } 6719 } 6720 return (int64_t) LIT64( 0x8000000000000000 ); 6721 } 6722 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6723 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6724 status->float_exception_flags |= float_flag_inexact; 6725 } 6726 } 6727 else { 6728 if ( aExp < 0x3FFF ) { 6729 if ( aExp | aSig0 | aSig1 ) { 6730 status->float_exception_flags |= float_flag_inexact; 6731 } 6732 return 0; 6733 } 6734 z = aSig0>>( - shiftCount ); 6735 if ( aSig1 6736 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6737 status->float_exception_flags |= float_flag_inexact; 6738 } 6739 } 6740 if ( aSign ) z = - z; 6741 return z; 6742 6743 } 6744 6745 /*---------------------------------------------------------------------------- 6746 | Returns the result of converting the quadruple-precision floating-point value 6747 | `a' to the 64-bit unsigned integer format. 
The conversion is 6748 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6749 | Arithmetic---which means in particular that the conversion is rounded 6750 | according to the current rounding mode. If `a' is a NaN, the largest 6751 | positive integer is returned. If the conversion overflows, the 6752 | largest unsigned integer is returned. If 'a' is negative, the value is 6753 | rounded and zero is returned; negative values that do not round to zero 6754 | will raise the inexact exception. 6755 *----------------------------------------------------------------------------*/ 6756 6757 uint64_t float128_to_uint64(float128 a, float_status *status) 6758 { 6759 flag aSign; 6760 int aExp; 6761 int shiftCount; 6762 uint64_t aSig0, aSig1; 6763 6764 aSig0 = extractFloat128Frac0(a); 6765 aSig1 = extractFloat128Frac1(a); 6766 aExp = extractFloat128Exp(a); 6767 aSign = extractFloat128Sign(a); 6768 if (aSign && (aExp > 0x3FFE)) { 6769 float_raise(float_flag_invalid, status); 6770 if (float128_is_any_nan(a)) { 6771 return LIT64(0xFFFFFFFFFFFFFFFF); 6772 } else { 6773 return 0; 6774 } 6775 } 6776 if (aExp) { 6777 aSig0 |= LIT64(0x0001000000000000); 6778 } 6779 shiftCount = 0x402F - aExp; 6780 if (shiftCount <= 0) { 6781 if (0x403E < aExp) { 6782 float_raise(float_flag_invalid, status); 6783 return LIT64(0xFFFFFFFFFFFFFFFF); 6784 } 6785 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6786 } else { 6787 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6788 } 6789 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6790 } 6791 6792 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6793 { 6794 uint64_t v; 6795 signed char current_rounding_mode = status->float_rounding_mode; 6796 6797 set_float_rounding_mode(float_round_to_zero, status); 6798 v = float128_to_uint64(a, status); 6799 set_float_rounding_mode(current_rounding_mode, status); 6800 6801 return v; 6802 } 6803 6804 
/*---------------------------------------------------------------------------- 6805 | Returns the result of converting the quadruple-precision floating-point 6806 | value `a' to the 32-bit unsigned integer format. The conversion 6807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6808 | Arithmetic except that the conversion is always rounded toward zero. 6809 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6810 | if the conversion overflows, the largest unsigned integer is returned. 6811 | If 'a' is negative, the value is rounded and zero is returned; negative 6812 | values that do not round to zero will raise the inexact exception. 6813 *----------------------------------------------------------------------------*/ 6814 6815 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6816 { 6817 uint64_t v; 6818 uint32_t res; 6819 int old_exc_flags = get_float_exception_flags(status); 6820 6821 v = float128_to_uint64_round_to_zero(a, status); 6822 if (v > 0xffffffff) { 6823 res = 0xffffffff; 6824 } else { 6825 return v; 6826 } 6827 set_float_exception_flags(old_exc_flags, status); 6828 float_raise(float_flag_invalid, status); 6829 return res; 6830 } 6831 6832 /*---------------------------------------------------------------------------- 6833 | Returns the result of converting the quadruple-precision floating-point value 6834 | `a' to the 32-bit unsigned integer format. The conversion is 6835 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6836 | Arithmetic---which means in particular that the conversion is rounded 6837 | according to the current rounding mode. If `a' is a NaN, the largest 6838 | positive integer is returned. If the conversion overflows, the 6839 | largest unsigned integer is returned. If 'a' is negative, the value is 6840 | rounded and zero is returned; negative values that do not round to zero 6841 | will raise the inexact exception. 
6842 *----------------------------------------------------------------------------*/ 6843 6844 uint32_t float128_to_uint32(float128 a, float_status *status) 6845 { 6846 uint64_t v; 6847 uint32_t res; 6848 int old_exc_flags = get_float_exception_flags(status); 6849 6850 v = float128_to_uint64(a, status); 6851 if (v > 0xffffffff) { 6852 res = 0xffffffff; 6853 } else { 6854 return v; 6855 } 6856 set_float_exception_flags(old_exc_flags, status); 6857 float_raise(float_flag_invalid, status); 6858 return res; 6859 } 6860 6861 /*---------------------------------------------------------------------------- 6862 | Returns the result of converting the quadruple-precision floating-point 6863 | value `a' to the single-precision floating-point format. The conversion 6864 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6865 | Arithmetic. 6866 *----------------------------------------------------------------------------*/ 6867 6868 float32 float128_to_float32(float128 a, float_status *status) 6869 { 6870 flag aSign; 6871 int32_t aExp; 6872 uint64_t aSig0, aSig1; 6873 uint32_t zSig; 6874 6875 aSig1 = extractFloat128Frac1( a ); 6876 aSig0 = extractFloat128Frac0( a ); 6877 aExp = extractFloat128Exp( a ); 6878 aSign = extractFloat128Sign( a ); 6879 if ( aExp == 0x7FFF ) { 6880 if ( aSig0 | aSig1 ) { 6881 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6882 } 6883 return packFloat32( aSign, 0xFF, 0 ); 6884 } 6885 aSig0 |= ( aSig1 != 0 ); 6886 shift64RightJamming( aSig0, 18, &aSig0 ); 6887 zSig = aSig0; 6888 if ( aExp || zSig ) { 6889 zSig |= 0x40000000; 6890 aExp -= 0x3F81; 6891 } 6892 return roundAndPackFloat32(aSign, aExp, zSig, status); 6893 6894 } 6895 6896 /*---------------------------------------------------------------------------- 6897 | Returns the result of converting the quadruple-precision floating-point 6898 | value `a' to the double-precision floating-point format. 
The conversion 6899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6900 | Arithmetic. 6901 *----------------------------------------------------------------------------*/ 6902 6903 float64 float128_to_float64(float128 a, float_status *status) 6904 { 6905 flag aSign; 6906 int32_t aExp; 6907 uint64_t aSig0, aSig1; 6908 6909 aSig1 = extractFloat128Frac1( a ); 6910 aSig0 = extractFloat128Frac0( a ); 6911 aExp = extractFloat128Exp( a ); 6912 aSign = extractFloat128Sign( a ); 6913 if ( aExp == 0x7FFF ) { 6914 if ( aSig0 | aSig1 ) { 6915 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6916 } 6917 return packFloat64( aSign, 0x7FF, 0 ); 6918 } 6919 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6920 aSig0 |= ( aSig1 != 0 ); 6921 if ( aExp || aSig0 ) { 6922 aSig0 |= LIT64( 0x4000000000000000 ); 6923 aExp -= 0x3C01; 6924 } 6925 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6926 6927 } 6928 6929 /*---------------------------------------------------------------------------- 6930 | Returns the result of converting the quadruple-precision floating-point 6931 | value `a' to the extended double-precision floating-point format. The 6932 | conversion is performed according to the IEC/IEEE Standard for Binary 6933 | Floating-Point Arithmetic. 
6934 *----------------------------------------------------------------------------*/ 6935 6936 floatx80 float128_to_floatx80(float128 a, float_status *status) 6937 { 6938 flag aSign; 6939 int32_t aExp; 6940 uint64_t aSig0, aSig1; 6941 6942 aSig1 = extractFloat128Frac1( a ); 6943 aSig0 = extractFloat128Frac0( a ); 6944 aExp = extractFloat128Exp( a ); 6945 aSign = extractFloat128Sign( a ); 6946 if ( aExp == 0x7FFF ) { 6947 if ( aSig0 | aSig1 ) { 6948 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6949 } 6950 return packFloatx80(aSign, floatx80_infinity_high, 6951 floatx80_infinity_low); 6952 } 6953 if ( aExp == 0 ) { 6954 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6955 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6956 } 6957 else { 6958 aSig0 |= LIT64( 0x0001000000000000 ); 6959 } 6960 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6961 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6962 6963 } 6964 6965 /*---------------------------------------------------------------------------- 6966 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6967 | returns the result as a quadruple-precision floating-point value. The 6968 | operation is performed according to the IEC/IEEE Standard for Binary 6969 | Floating-Point Arithmetic. 
6970 *----------------------------------------------------------------------------*/ 6971 6972 float128 float128_round_to_int(float128 a, float_status *status) 6973 { 6974 flag aSign; 6975 int32_t aExp; 6976 uint64_t lastBitMask, roundBitsMask; 6977 float128 z; 6978 6979 aExp = extractFloat128Exp( a ); 6980 if ( 0x402F <= aExp ) { 6981 if ( 0x406F <= aExp ) { 6982 if ( ( aExp == 0x7FFF ) 6983 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6984 ) { 6985 return propagateFloat128NaN(a, a, status); 6986 } 6987 return a; 6988 } 6989 lastBitMask = 1; 6990 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6991 roundBitsMask = lastBitMask - 1; 6992 z = a; 6993 switch (status->float_rounding_mode) { 6994 case float_round_nearest_even: 6995 if ( lastBitMask ) { 6996 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6997 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6998 } 6999 else { 7000 if ( (int64_t) z.low < 0 ) { 7001 ++z.high; 7002 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 7003 } 7004 } 7005 break; 7006 case float_round_ties_away: 7007 if (lastBitMask) { 7008 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 7009 } else { 7010 if ((int64_t) z.low < 0) { 7011 ++z.high; 7012 } 7013 } 7014 break; 7015 case float_round_to_zero: 7016 break; 7017 case float_round_up: 7018 if (!extractFloat128Sign(z)) { 7019 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7020 } 7021 break; 7022 case float_round_down: 7023 if (extractFloat128Sign(z)) { 7024 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7025 } 7026 break; 7027 case float_round_to_odd: 7028 /* 7029 * Note that if lastBitMask == 0, the last bit is the lsb 7030 * of high, and roundBitsMask == -1. 7031 */ 7032 if ((lastBitMask ? 
z.low & lastBitMask : z.high & 1) == 0) { 7033 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7034 } 7035 break; 7036 default: 7037 abort(); 7038 } 7039 z.low &= ~ roundBitsMask; 7040 } 7041 else { 7042 if ( aExp < 0x3FFF ) { 7043 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 7044 status->float_exception_flags |= float_flag_inexact; 7045 aSign = extractFloat128Sign( a ); 7046 switch (status->float_rounding_mode) { 7047 case float_round_nearest_even: 7048 if ( ( aExp == 0x3FFE ) 7049 && ( extractFloat128Frac0( a ) 7050 | extractFloat128Frac1( a ) ) 7051 ) { 7052 return packFloat128( aSign, 0x3FFF, 0, 0 ); 7053 } 7054 break; 7055 case float_round_ties_away: 7056 if (aExp == 0x3FFE) { 7057 return packFloat128(aSign, 0x3FFF, 0, 0); 7058 } 7059 break; 7060 case float_round_down: 7061 return 7062 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 7063 : packFloat128( 0, 0, 0, 0 ); 7064 case float_round_up: 7065 return 7066 aSign ? packFloat128( 1, 0, 0, 0 ) 7067 : packFloat128( 0, 0x3FFF, 0, 0 ); 7068 7069 case float_round_to_odd: 7070 return packFloat128(aSign, 0x3FFF, 0, 0); 7071 } 7072 return packFloat128( aSign, 0, 0, 0 ); 7073 } 7074 lastBitMask = 1; 7075 lastBitMask <<= 0x402F - aExp; 7076 roundBitsMask = lastBitMask - 1; 7077 z.low = 0; 7078 z.high = a.high; 7079 switch (status->float_rounding_mode) { 7080 case float_round_nearest_even: 7081 z.high += lastBitMask>>1; 7082 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 7083 z.high &= ~ lastBitMask; 7084 } 7085 break; 7086 case float_round_ties_away: 7087 z.high += lastBitMask>>1; 7088 break; 7089 case float_round_to_zero: 7090 break; 7091 case float_round_up: 7092 if (!extractFloat128Sign(z)) { 7093 z.high |= ( a.low != 0 ); 7094 z.high += roundBitsMask; 7095 } 7096 break; 7097 case float_round_down: 7098 if (extractFloat128Sign(z)) { 7099 z.high |= (a.low != 0); 7100 z.high += roundBitsMask; 7101 } 7102 break; 7103 case float_round_to_odd: 7104 if ((z.high & lastBitMask) == 0) { 7105 z.high 
|= (a.low != 0); 7106 z.high += roundBitsMask; 7107 } 7108 break; 7109 default: 7110 abort(); 7111 } 7112 z.high &= ~ roundBitsMask; 7113 } 7114 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 7115 status->float_exception_flags |= float_flag_inexact; 7116 } 7117 return z; 7118 7119 } 7120 7121 /*---------------------------------------------------------------------------- 7122 | Returns the result of adding the absolute values of the quadruple-precision 7123 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 7124 | before being returned. `zSign' is ignored if the result is a NaN. 7125 | The addition is performed according to the IEC/IEEE Standard for Binary 7126 | Floating-Point Arithmetic. 7127 *----------------------------------------------------------------------------*/ 7128 7129 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 7130 float_status *status) 7131 { 7132 int32_t aExp, bExp, zExp; 7133 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7134 int32_t expDiff; 7135 7136 aSig1 = extractFloat128Frac1( a ); 7137 aSig0 = extractFloat128Frac0( a ); 7138 aExp = extractFloat128Exp( a ); 7139 bSig1 = extractFloat128Frac1( b ); 7140 bSig0 = extractFloat128Frac0( b ); 7141 bExp = extractFloat128Exp( b ); 7142 expDiff = aExp - bExp; 7143 if ( 0 < expDiff ) { 7144 if ( aExp == 0x7FFF ) { 7145 if (aSig0 | aSig1) { 7146 return propagateFloat128NaN(a, b, status); 7147 } 7148 return a; 7149 } 7150 if ( bExp == 0 ) { 7151 --expDiff; 7152 } 7153 else { 7154 bSig0 |= LIT64( 0x0001000000000000 ); 7155 } 7156 shift128ExtraRightJamming( 7157 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 7158 zExp = aExp; 7159 } 7160 else if ( expDiff < 0 ) { 7161 if ( bExp == 0x7FFF ) { 7162 if (bSig0 | bSig1) { 7163 return propagateFloat128NaN(a, b, status); 7164 } 7165 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7166 } 7167 if ( aExp == 0 ) { 7168 ++expDiff; 7169 } 7170 else { 7171 aSig0 |= LIT64( 0x0001000000000000 ); 7172 } 7173 
shift128ExtraRightJamming( 7174 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 7175 zExp = bExp; 7176 } 7177 else { 7178 if ( aExp == 0x7FFF ) { 7179 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7180 return propagateFloat128NaN(a, b, status); 7181 } 7182 return a; 7183 } 7184 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7185 if ( aExp == 0 ) { 7186 if (status->flush_to_zero) { 7187 if (zSig0 | zSig1) { 7188 float_raise(float_flag_output_denormal, status); 7189 } 7190 return packFloat128(zSign, 0, 0, 0); 7191 } 7192 return packFloat128( zSign, 0, zSig0, zSig1 ); 7193 } 7194 zSig2 = 0; 7195 zSig0 |= LIT64( 0x0002000000000000 ); 7196 zExp = aExp; 7197 goto shiftRight1; 7198 } 7199 aSig0 |= LIT64( 0x0001000000000000 ); 7200 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7201 --zExp; 7202 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 7203 ++zExp; 7204 shiftRight1: 7205 shift128ExtraRightJamming( 7206 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7207 roundAndPack: 7208 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7209 7210 } 7211 7212 /*---------------------------------------------------------------------------- 7213 | Returns the result of subtracting the absolute values of the quadruple- 7214 | precision floating-point values `a' and `b'. If `zSign' is 1, the 7215 | difference is negated before being returned. `zSign' is ignored if the 7216 | result is a NaN. The subtraction is performed according to the IEC/IEEE 7217 | Standard for Binary Floating-Point Arithmetic. 
7218 *----------------------------------------------------------------------------*/ 7219 7220 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 7221 float_status *status) 7222 { 7223 int32_t aExp, bExp, zExp; 7224 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 7225 int32_t expDiff; 7226 7227 aSig1 = extractFloat128Frac1( a ); 7228 aSig0 = extractFloat128Frac0( a ); 7229 aExp = extractFloat128Exp( a ); 7230 bSig1 = extractFloat128Frac1( b ); 7231 bSig0 = extractFloat128Frac0( b ); 7232 bExp = extractFloat128Exp( b ); 7233 expDiff = aExp - bExp; 7234 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 7235 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 7236 if ( 0 < expDiff ) goto aExpBigger; 7237 if ( expDiff < 0 ) goto bExpBigger; 7238 if ( aExp == 0x7FFF ) { 7239 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7240 return propagateFloat128NaN(a, b, status); 7241 } 7242 float_raise(float_flag_invalid, status); 7243 return float128_default_nan(status); 7244 } 7245 if ( aExp == 0 ) { 7246 aExp = 1; 7247 bExp = 1; 7248 } 7249 if ( bSig0 < aSig0 ) goto aBigger; 7250 if ( aSig0 < bSig0 ) goto bBigger; 7251 if ( bSig1 < aSig1 ) goto aBigger; 7252 if ( aSig1 < bSig1 ) goto bBigger; 7253 return packFloat128(status->float_rounding_mode == float_round_down, 7254 0, 0, 0); 7255 bExpBigger: 7256 if ( bExp == 0x7FFF ) { 7257 if (bSig0 | bSig1) { 7258 return propagateFloat128NaN(a, b, status); 7259 } 7260 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 7261 } 7262 if ( aExp == 0 ) { 7263 ++expDiff; 7264 } 7265 else { 7266 aSig0 |= LIT64( 0x4000000000000000 ); 7267 } 7268 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7269 bSig0 |= LIT64( 0x4000000000000000 ); 7270 bBigger: 7271 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7272 zExp = bExp; 7273 zSign ^= 1; 7274 goto normalizeRoundAndPack; 7275 aExpBigger: 7276 if ( aExp == 0x7FFF ) { 7277 if (aSig0 | aSig1) { 7278 return propagateFloat128NaN(a, b, status); 7279 } 7280 return a; 
7281 } 7282 if ( bExp == 0 ) { 7283 --expDiff; 7284 } 7285 else { 7286 bSig0 |= LIT64( 0x4000000000000000 ); 7287 } 7288 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 7289 aSig0 |= LIT64( 0x4000000000000000 ); 7290 aBigger: 7291 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7292 zExp = aExp; 7293 normalizeRoundAndPack: 7294 --zExp; 7295 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 7296 status); 7297 7298 } 7299 7300 /*---------------------------------------------------------------------------- 7301 | Returns the result of adding the quadruple-precision floating-point values 7302 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 7303 | for Binary Floating-Point Arithmetic. 7304 *----------------------------------------------------------------------------*/ 7305 7306 float128 float128_add(float128 a, float128 b, float_status *status) 7307 { 7308 flag aSign, bSign; 7309 7310 aSign = extractFloat128Sign( a ); 7311 bSign = extractFloat128Sign( b ); 7312 if ( aSign == bSign ) { 7313 return addFloat128Sigs(a, b, aSign, status); 7314 } 7315 else { 7316 return subFloat128Sigs(a, b, aSign, status); 7317 } 7318 7319 } 7320 7321 /*---------------------------------------------------------------------------- 7322 | Returns the result of subtracting the quadruple-precision floating-point 7323 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7324 | Standard for Binary Floating-Point Arithmetic. 
7325 *----------------------------------------------------------------------------*/ 7326 7327 float128 float128_sub(float128 a, float128 b, float_status *status) 7328 { 7329 flag aSign, bSign; 7330 7331 aSign = extractFloat128Sign( a ); 7332 bSign = extractFloat128Sign( b ); 7333 if ( aSign == bSign ) { 7334 return subFloat128Sigs(a, b, aSign, status); 7335 } 7336 else { 7337 return addFloat128Sigs(a, b, aSign, status); 7338 } 7339 7340 } 7341 7342 /*---------------------------------------------------------------------------- 7343 | Returns the result of multiplying the quadruple-precision floating-point 7344 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7345 | Standard for Binary Floating-Point Arithmetic. 7346 *----------------------------------------------------------------------------*/ 7347 7348 float128 float128_mul(float128 a, float128 b, float_status *status) 7349 { 7350 flag aSign, bSign, zSign; 7351 int32_t aExp, bExp, zExp; 7352 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 7353 7354 aSig1 = extractFloat128Frac1( a ); 7355 aSig0 = extractFloat128Frac0( a ); 7356 aExp = extractFloat128Exp( a ); 7357 aSign = extractFloat128Sign( a ); 7358 bSig1 = extractFloat128Frac1( b ); 7359 bSig0 = extractFloat128Frac0( b ); 7360 bExp = extractFloat128Exp( b ); 7361 bSign = extractFloat128Sign( b ); 7362 zSign = aSign ^ bSign; 7363 if ( aExp == 0x7FFF ) { 7364 if ( ( aSig0 | aSig1 ) 7365 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7366 return propagateFloat128NaN(a, b, status); 7367 } 7368 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 7369 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7370 } 7371 if ( bExp == 0x7FFF ) { 7372 if (bSig0 | bSig1) { 7373 return propagateFloat128NaN(a, b, status); 7374 } 7375 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7376 invalid: 7377 float_raise(float_flag_invalid, status); 7378 return float128_default_nan(status); 7379 } 7380 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7381 } 7382 
if ( aExp == 0 ) { 7383 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7384 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7385 } 7386 if ( bExp == 0 ) { 7387 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7388 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7389 } 7390 zExp = aExp + bExp - 0x4000; 7391 aSig0 |= LIT64( 0x0001000000000000 ); 7392 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 7393 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 7394 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7395 zSig2 |= ( zSig3 != 0 ); 7396 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 7397 shift128ExtraRightJamming( 7398 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7399 ++zExp; 7400 } 7401 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7402 7403 } 7404 7405 /*---------------------------------------------------------------------------- 7406 | Returns the result of dividing the quadruple-precision floating-point value 7407 | `a' by the corresponding value `b'. The operation is performed according to 7408 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7409 *----------------------------------------------------------------------------*/ 7410 7411 float128 float128_div(float128 a, float128 b, float_status *status) 7412 { 7413 flag aSign, bSign, zSign; 7414 int32_t aExp, bExp, zExp; 7415 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7416 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7417 7418 aSig1 = extractFloat128Frac1( a ); 7419 aSig0 = extractFloat128Frac0( a ); 7420 aExp = extractFloat128Exp( a ); 7421 aSign = extractFloat128Sign( a ); 7422 bSig1 = extractFloat128Frac1( b ); 7423 bSig0 = extractFloat128Frac0( b ); 7424 bExp = extractFloat128Exp( b ); 7425 bSign = extractFloat128Sign( b ); 7426 zSign = aSign ^ bSign; 7427 if ( aExp == 0x7FFF ) { 7428 if (aSig0 | aSig1) { 7429 return propagateFloat128NaN(a, b, status); 7430 } 7431 if ( bExp == 0x7FFF ) { 7432 if (bSig0 | bSig1) { 7433 return propagateFloat128NaN(a, b, status); 7434 } 7435 goto invalid; 7436 } 7437 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7438 } 7439 if ( bExp == 0x7FFF ) { 7440 if (bSig0 | bSig1) { 7441 return propagateFloat128NaN(a, b, status); 7442 } 7443 return packFloat128( zSign, 0, 0, 0 ); 7444 } 7445 if ( bExp == 0 ) { 7446 if ( ( bSig0 | bSig1 ) == 0 ) { 7447 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7448 invalid: 7449 float_raise(float_flag_invalid, status); 7450 return float128_default_nan(status); 7451 } 7452 float_raise(float_flag_divbyzero, status); 7453 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7454 } 7455 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7456 } 7457 if ( aExp == 0 ) { 7458 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7459 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7460 } 7461 zExp = aExp - bExp + 0x3FFD; 7462 shortShift128Left( 7463 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 7464 shortShift128Left( 7465 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7466 if ( le128( bSig0, bSig1, 
aSig0, aSig1 ) ) { 7467 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7468 ++zExp; 7469 } 7470 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7471 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7472 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7473 while ( (int64_t) rem0 < 0 ) { 7474 --zSig0; 7475 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7476 } 7477 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7478 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7479 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7480 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7481 while ( (int64_t) rem1 < 0 ) { 7482 --zSig1; 7483 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7484 } 7485 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7486 } 7487 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7488 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7489 7490 } 7491 7492 /*---------------------------------------------------------------------------- 7493 | Returns the remainder of the quadruple-precision floating-point value `a' 7494 | with respect to the corresponding value `b'. The operation is performed 7495 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7496 *----------------------------------------------------------------------------*/ 7497 7498 float128 float128_rem(float128 a, float128 b, float_status *status) 7499 { 7500 flag aSign, zSign; 7501 int32_t aExp, bExp, expDiff; 7502 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7503 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7504 int64_t sigMean0; 7505 7506 aSig1 = extractFloat128Frac1( a ); 7507 aSig0 = extractFloat128Frac0( a ); 7508 aExp = extractFloat128Exp( a ); 7509 aSign = extractFloat128Sign( a ); 7510 bSig1 = extractFloat128Frac1( b ); 7511 bSig0 = extractFloat128Frac0( b ); 7512 bExp = extractFloat128Exp( b ); 7513 if ( aExp == 0x7FFF ) { 7514 if ( ( aSig0 | aSig1 ) 7515 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7516 return propagateFloat128NaN(a, b, status); 7517 } 7518 goto invalid; 7519 } 7520 if ( bExp == 0x7FFF ) { 7521 if (bSig0 | bSig1) { 7522 return propagateFloat128NaN(a, b, status); 7523 } 7524 return a; 7525 } 7526 if ( bExp == 0 ) { 7527 if ( ( bSig0 | bSig1 ) == 0 ) { 7528 invalid: 7529 float_raise(float_flag_invalid, status); 7530 return float128_default_nan(status); 7531 } 7532 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7533 } 7534 if ( aExp == 0 ) { 7535 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7536 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7537 } 7538 expDiff = aExp - bExp; 7539 if ( expDiff < -1 ) return a; 7540 shortShift128Left( 7541 aSig0 | LIT64( 0x0001000000000000 ), 7542 aSig1, 7543 15 - ( expDiff < 0 ), 7544 &aSig0, 7545 &aSig1 7546 ); 7547 shortShift128Left( 7548 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 7549 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7550 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7551 expDiff -= 64; 7552 while ( 0 < expDiff ) { 7553 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7554 q = ( 4 < q ) ? 
q - 4 : 0; 7555 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7556 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7557 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7558 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7559 expDiff -= 61; 7560 } 7561 if ( -64 < expDiff ) { 7562 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7563 q = ( 4 < q ) ? q - 4 : 0; 7564 q >>= - expDiff; 7565 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7566 expDiff += 52; 7567 if ( expDiff < 0 ) { 7568 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7569 } 7570 else { 7571 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7572 } 7573 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7574 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7575 } 7576 else { 7577 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7578 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7579 } 7580 do { 7581 alternateASig0 = aSig0; 7582 alternateASig1 = aSig1; 7583 ++q; 7584 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7585 } while ( 0 <= (int64_t) aSig0 ); 7586 add128( 7587 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7588 if ( ( sigMean0 < 0 ) 7589 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7590 aSig0 = alternateASig0; 7591 aSig1 = alternateASig1; 7592 } 7593 zSign = ( (int64_t) aSig0 < 0 ); 7594 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7595 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7596 status); 7597 } 7598 7599 /*---------------------------------------------------------------------------- 7600 | Returns the square root of the quadruple-precision floating-point value `a'. 7601 | The operation is performed according to the IEC/IEEE Standard for Binary 7602 | Floating-Point Arithmetic. 
7603 *----------------------------------------------------------------------------*/ 7604 7605 float128 float128_sqrt(float128 a, float_status *status) 7606 { 7607 flag aSign; 7608 int32_t aExp, zExp; 7609 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7610 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7611 7612 aSig1 = extractFloat128Frac1( a ); 7613 aSig0 = extractFloat128Frac0( a ); 7614 aExp = extractFloat128Exp( a ); 7615 aSign = extractFloat128Sign( a ); 7616 if ( aExp == 0x7FFF ) { 7617 if (aSig0 | aSig1) { 7618 return propagateFloat128NaN(a, a, status); 7619 } 7620 if ( ! aSign ) return a; 7621 goto invalid; 7622 } 7623 if ( aSign ) { 7624 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7625 invalid: 7626 float_raise(float_flag_invalid, status); 7627 return float128_default_nan(status); 7628 } 7629 if ( aExp == 0 ) { 7630 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7631 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7632 } 7633 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7634 aSig0 |= LIT64( 0x0001000000000000 ); 7635 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7636 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7637 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7638 doubleZSig0 = zSig0<<1; 7639 mul64To128( zSig0, zSig0, &term0, &term1 ); 7640 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7641 while ( (int64_t) rem0 < 0 ) { 7642 --zSig0; 7643 doubleZSig0 -= 2; 7644 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7645 } 7646 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7647 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7648 if ( zSig1 == 0 ) zSig1 = 1; 7649 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7650 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7651 mul64To128( zSig1, zSig1, &term2, &term3 ); 7652 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7653 while ( (int64_t) rem1 < 0 ) { 7654 --zSig1; 7655 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7656 term3 |= 1; 7657 term2 |= doubleZSig0; 7658 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7659 } 7660 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7661 } 7662 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7663 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7664 7665 } 7666 7667 /*---------------------------------------------------------------------------- 7668 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7669 | the corresponding value `b', and 0 otherwise. The invalid exception is 7670 | raised if either operand is a NaN. Otherwise, the comparison is performed 7671 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7672 *----------------------------------------------------------------------------*/ 7673 7674 int float128_eq(float128 a, float128 b, float_status *status) 7675 { 7676 7677 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7678 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7679 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7680 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7681 ) { 7682 float_raise(float_flag_invalid, status); 7683 return 0; 7684 } 7685 return 7686 ( a.low == b.low ) 7687 && ( ( a.high == b.high ) 7688 || ( ( a.low == 0 ) 7689 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7690 ); 7691 7692 } 7693 7694 /*---------------------------------------------------------------------------- 7695 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7696 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7697 | exception is raised if either operand is a NaN. The comparison is performed 7698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7699 *----------------------------------------------------------------------------*/ 7700 7701 int float128_le(float128 a, float128 b, float_status *status) 7702 { 7703 flag aSign, bSign; 7704 7705 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7706 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7707 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7708 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7709 ) { 7710 float_raise(float_flag_invalid, status); 7711 return 0; 7712 } 7713 aSign = extractFloat128Sign( a ); 7714 bSign = extractFloat128Sign( b ); 7715 if ( aSign != bSign ) { 7716 return 7717 aSign 7718 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7719 == 0 ); 7720 } 7721 return 7722 aSign ? le128( b.high, b.low, a.high, a.low ) 7723 : le128( a.high, a.low, b.high, b.low ); 7724 7725 } 7726 7727 /*---------------------------------------------------------------------------- 7728 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7729 | the corresponding value `b', and 0 otherwise. The invalid exception is 7730 | raised if either operand is a NaN. The comparison is performed according 7731 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7732 *----------------------------------------------------------------------------*/ 7733 7734 int float128_lt(float128 a, float128 b, float_status *status) 7735 { 7736 flag aSign, bSign; 7737 7738 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7739 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7740 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7741 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7742 ) { 7743 float_raise(float_flag_invalid, status); 7744 return 0; 7745 } 7746 aSign = extractFloat128Sign( a ); 7747 bSign = extractFloat128Sign( b ); 7748 if ( aSign != bSign ) { 7749 return 7750 aSign 7751 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7752 != 0 ); 7753 } 7754 return 7755 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7756 : lt128( a.high, a.low, b.high, b.low ); 7757 7758 } 7759 7760 /*---------------------------------------------------------------------------- 7761 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7762 | be compared, and 0 otherwise. The invalid exception is raised if either 7763 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7764 | Standard for Binary Floating-Point Arithmetic. 7765 *----------------------------------------------------------------------------*/ 7766 7767 int float128_unordered(float128 a, float128 b, float_status *status) 7768 { 7769 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7770 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7771 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7772 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7773 ) { 7774 float_raise(float_flag_invalid, status); 7775 return 1; 7776 } 7777 return 0; 7778 } 7779 7780 /*---------------------------------------------------------------------------- 7781 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7782 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7783 | exception. The comparison is performed according to the IEC/IEEE Standard 7784 | for Binary Floating-Point Arithmetic. 
7785 *----------------------------------------------------------------------------*/ 7786 7787 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7788 { 7789 7790 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7791 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7792 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7793 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7794 ) { 7795 if (float128_is_signaling_nan(a, status) 7796 || float128_is_signaling_nan(b, status)) { 7797 float_raise(float_flag_invalid, status); 7798 } 7799 return 0; 7800 } 7801 return 7802 ( a.low == b.low ) 7803 && ( ( a.high == b.high ) 7804 || ( ( a.low == 0 ) 7805 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7806 ); 7807 7808 } 7809 7810 /*---------------------------------------------------------------------------- 7811 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7812 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7813 | cause an exception. Otherwise, the comparison is performed according to the 7814 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7815 *----------------------------------------------------------------------------*/ 7816 7817 int float128_le_quiet(float128 a, float128 b, float_status *status) 7818 { 7819 flag aSign, bSign; 7820 7821 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7822 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7823 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7824 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7825 ) { 7826 if (float128_is_signaling_nan(a, status) 7827 || float128_is_signaling_nan(b, status)) { 7828 float_raise(float_flag_invalid, status); 7829 } 7830 return 0; 7831 } 7832 aSign = extractFloat128Sign( a ); 7833 bSign = extractFloat128Sign( b ); 7834 if ( aSign != bSign ) { 7835 return 7836 aSign 7837 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7838 == 0 ); 7839 } 7840 return 7841 aSign ? le128( b.high, b.low, a.high, a.low ) 7842 : le128( a.high, a.low, b.high, b.low ); 7843 7844 } 7845 7846 /*---------------------------------------------------------------------------- 7847 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7848 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7849 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7850 | Standard for Binary Floating-Point Arithmetic. 
7851 *----------------------------------------------------------------------------*/ 7852 7853 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7854 { 7855 flag aSign, bSign; 7856 7857 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7858 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7859 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7860 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7861 ) { 7862 if (float128_is_signaling_nan(a, status) 7863 || float128_is_signaling_nan(b, status)) { 7864 float_raise(float_flag_invalid, status); 7865 } 7866 return 0; 7867 } 7868 aSign = extractFloat128Sign( a ); 7869 bSign = extractFloat128Sign( b ); 7870 if ( aSign != bSign ) { 7871 return 7872 aSign 7873 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7874 != 0 ); 7875 } 7876 return 7877 aSign ? lt128( b.high, b.low, a.high, a.low ) 7878 : lt128( a.high, a.low, b.high, b.low ); 7879 7880 } 7881 7882 /*---------------------------------------------------------------------------- 7883 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7884 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7885 | comparison is performed according to the IEC/IEEE Standard for Binary 7886 | Floating-Point Arithmetic. 
7887 *----------------------------------------------------------------------------*/ 7888 7889 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7890 { 7891 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7892 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7893 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7894 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7895 ) { 7896 if (float128_is_signaling_nan(a, status) 7897 || float128_is_signaling_nan(b, status)) { 7898 float_raise(float_flag_invalid, status); 7899 } 7900 return 1; 7901 } 7902 return 0; 7903 } 7904 7905 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7906 int is_quiet, float_status *status) 7907 { 7908 flag aSign, bSign; 7909 7910 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7911 float_raise(float_flag_invalid, status); 7912 return float_relation_unordered; 7913 } 7914 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7915 ( extractFloatx80Frac( a )<<1 ) ) || 7916 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7917 ( extractFloatx80Frac( b )<<1 ) )) { 7918 if (!is_quiet || 7919 floatx80_is_signaling_nan(a, status) || 7920 floatx80_is_signaling_nan(b, status)) { 7921 float_raise(float_flag_invalid, status); 7922 } 7923 return float_relation_unordered; 7924 } 7925 aSign = extractFloatx80Sign( a ); 7926 bSign = extractFloatx80Sign( b ); 7927 if ( aSign != bSign ) { 7928 7929 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7930 ( ( a.low | b.low ) == 0 ) ) { 7931 /* zero case */ 7932 return float_relation_equal; 7933 } else { 7934 return 1 - (2 * aSign); 7935 } 7936 } else { 7937 if (a.low == b.low && a.high == b.high) { 7938 return float_relation_equal; 7939 } else { 7940 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7941 } 7942 } 7943 } 7944 7945 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7946 { 7947 return floatx80_compare_internal(a, b, 0, status); 7948 } 7949 
7950 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7951 { 7952 return floatx80_compare_internal(a, b, 1, status); 7953 } 7954 7955 static inline int float128_compare_internal(float128 a, float128 b, 7956 int is_quiet, float_status *status) 7957 { 7958 flag aSign, bSign; 7959 7960 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7961 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7962 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7963 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7964 if (!is_quiet || 7965 float128_is_signaling_nan(a, status) || 7966 float128_is_signaling_nan(b, status)) { 7967 float_raise(float_flag_invalid, status); 7968 } 7969 return float_relation_unordered; 7970 } 7971 aSign = extractFloat128Sign( a ); 7972 bSign = extractFloat128Sign( b ); 7973 if ( aSign != bSign ) { 7974 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7975 /* zero case */ 7976 return float_relation_equal; 7977 } else { 7978 return 1 - (2 * aSign); 7979 } 7980 } else { 7981 if (a.low == b.low && a.high == b.high) { 7982 return float_relation_equal; 7983 } else { 7984 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7985 } 7986 } 7987 } 7988 7989 int float128_compare(float128 a, float128 b, float_status *status) 7990 { 7991 return float128_compare_internal(a, b, 0, status); 7992 } 7993 7994 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7995 { 7996 return float128_compare_internal(a, b, 1, status); 7997 } 7998 7999 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 8000 { 8001 flag aSign; 8002 int32_t aExp; 8003 uint64_t aSig; 8004 8005 if (floatx80_invalid_encoding(a)) { 8006 float_raise(float_flag_invalid, status); 8007 return floatx80_default_nan(status); 8008 } 8009 aSig = extractFloatx80Frac( a ); 8010 aExp = extractFloatx80Exp( a ); 8011 aSign = extractFloatx80Sign( a ); 8012 8013 if ( aExp == 0x7FFF ) { 8014 if ( aSig<<1 ) { 8015 return 
propagateFloatx80NaN(a, a, status); 8016 } 8017 return a; 8018 } 8019 8020 if (aExp == 0) { 8021 if (aSig == 0) { 8022 return a; 8023 } 8024 aExp++; 8025 } 8026 8027 if (n > 0x10000) { 8028 n = 0x10000; 8029 } else if (n < -0x10000) { 8030 n = -0x10000; 8031 } 8032 8033 aExp += n; 8034 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 8035 aSign, aExp, aSig, 0, status); 8036 } 8037 8038 float128 float128_scalbn(float128 a, int n, float_status *status) 8039 { 8040 flag aSign; 8041 int32_t aExp; 8042 uint64_t aSig0, aSig1; 8043 8044 aSig1 = extractFloat128Frac1( a ); 8045 aSig0 = extractFloat128Frac0( a ); 8046 aExp = extractFloat128Exp( a ); 8047 aSign = extractFloat128Sign( a ); 8048 if ( aExp == 0x7FFF ) { 8049 if ( aSig0 | aSig1 ) { 8050 return propagateFloat128NaN(a, a, status); 8051 } 8052 return a; 8053 } 8054 if (aExp != 0) { 8055 aSig0 |= LIT64( 0x0001000000000000 ); 8056 } else if (aSig0 == 0 && aSig1 == 0) { 8057 return a; 8058 } else { 8059 aExp++; 8060 } 8061 8062 if (n > 0x10000) { 8063 n = 0x10000; 8064 } else if (n < -0x10000) { 8065 n = -0x10000; 8066 } 8067 8068 aExp += n - 1; 8069 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 8070 , status); 8071 8072 } 8073 8074 static void __attribute__((constructor)) softfloat_init(void) 8075 { 8076 union_float64 ua, ub, uc, ur; 8077 8078 if (QEMU_NO_HARDFLOAT) { 8079 return; 8080 } 8081 /* 8082 * Test that the host's FMA is not obviously broken. For example, 8083 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 8084 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 8085 */ 8086 ua.s = 0x0020000000000001ULL; 8087 ub.s = 0x3ca0000000000000ULL; 8088 uc.s = 0x0020000000000000ULL; 8089 ur.h = fma(ua.h, ub.h, uc.h); 8090 if (ur.s != 0x0020000000000001ULL) { 8091 force_soft_fma = true; 8092 } 8093 } 8094