1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 s->float_exception_flags |= float_flag_input_denormal; \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 /* Note: @fast_test and @post can be NULL */ 343 static inline 
float32 344 float32_gen2(float32 xa, float32 xb, float_status *s, 345 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 346 f32_check_fn pre, f32_check_fn post, 347 f32_check_fn fast_test, soft_f32_op2_fn fast_op) 348 { 349 union_float32 ua, ub, ur; 350 351 ua.s = xa; 352 ub.s = xb; 353 354 if (unlikely(!can_use_fpu(s))) { 355 goto soft; 356 } 357 358 float32_input_flush2(&ua.s, &ub.s, s); 359 if (unlikely(!pre(ua, ub))) { 360 goto soft; 361 } 362 if (fast_test && fast_test(ua, ub)) { 363 return fast_op(ua.s, ub.s, s); 364 } 365 366 ur.h = hard(ua.h, ub.h); 367 if (unlikely(f32_is_inf(ur))) { 368 s->float_exception_flags |= float_flag_overflow; 369 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 370 if (post == NULL || post(ua, ub)) { 371 goto soft; 372 } 373 } 374 return ur.s; 375 376 soft: 377 return soft(ua.s, ub.s, s); 378 } 379 380 static inline float64 381 float64_gen2(float64 xa, float64 xb, float_status *s, 382 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 383 f64_check_fn pre, f64_check_fn post, 384 f64_check_fn fast_test, soft_f64_op2_fn fast_op) 385 { 386 union_float64 ua, ub, ur; 387 388 ua.s = xa; 389 ub.s = xb; 390 391 if (unlikely(!can_use_fpu(s))) { 392 goto soft; 393 } 394 395 float64_input_flush2(&ua.s, &ub.s, s); 396 if (unlikely(!pre(ua, ub))) { 397 goto soft; 398 } 399 if (fast_test && fast_test(ua, ub)) { 400 return fast_op(ua.s, ub.s, s); 401 } 402 403 ur.h = hard(ua.h, ub.h); 404 if (unlikely(f64_is_inf(ur))) { 405 s->float_exception_flags |= float_flag_overflow; 406 } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { 407 if (post == NULL || post(ua, ub)) { 408 goto soft; 409 } 410 } 411 return ur.s; 412 413 soft: 414 return soft(ua.s, ub.s, s); 415 } 416 417 /*---------------------------------------------------------------------------- 418 | Returns the fraction bits of the half-precision floating-point value `a'. 
419 *----------------------------------------------------------------------------*/ 420 421 static inline uint32_t extractFloat16Frac(float16 a) 422 { 423 return float16_val(a) & 0x3ff; 424 } 425 426 /*---------------------------------------------------------------------------- 427 | Returns the exponent bits of the half-precision floating-point value `a'. 428 *----------------------------------------------------------------------------*/ 429 430 static inline int extractFloat16Exp(float16 a) 431 { 432 return (float16_val(a) >> 10) & 0x1f; 433 } 434 435 /*---------------------------------------------------------------------------- 436 | Returns the fraction bits of the single-precision floating-point value `a'. 437 *----------------------------------------------------------------------------*/ 438 439 static inline uint32_t extractFloat32Frac(float32 a) 440 { 441 return float32_val(a) & 0x007FFFFF; 442 } 443 444 /*---------------------------------------------------------------------------- 445 | Returns the exponent bits of the single-precision floating-point value `a'. 446 *----------------------------------------------------------------------------*/ 447 448 static inline int extractFloat32Exp(float32 a) 449 { 450 return (float32_val(a) >> 23) & 0xFF; 451 } 452 453 /*---------------------------------------------------------------------------- 454 | Returns the sign bit of the single-precision floating-point value `a'. 455 *----------------------------------------------------------------------------*/ 456 457 static inline flag extractFloat32Sign(float32 a) 458 { 459 return float32_val(a) >> 31; 460 } 461 462 /*---------------------------------------------------------------------------- 463 | Returns the fraction bits of the double-precision floating-point value `a'. 
464 *----------------------------------------------------------------------------*/ 465 466 static inline uint64_t extractFloat64Frac(float64 a) 467 { 468 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 469 } 470 471 /*---------------------------------------------------------------------------- 472 | Returns the exponent bits of the double-precision floating-point value `a'. 473 *----------------------------------------------------------------------------*/ 474 475 static inline int extractFloat64Exp(float64 a) 476 { 477 return (float64_val(a) >> 52) & 0x7FF; 478 } 479 480 /*---------------------------------------------------------------------------- 481 | Returns the sign bit of the double-precision floating-point value `a'. 482 *----------------------------------------------------------------------------*/ 483 484 static inline flag extractFloat64Sign(float64 a) 485 { 486 return float64_val(a) >> 63; 487 } 488 489 /* 490 * Classify a floating point number. Everything above float_class_qnan 491 * is a NaN so cls >= float_class_qnan is any NaN. 492 */ 493 494 typedef enum __attribute__ ((__packed__)) { 495 float_class_unclassified, 496 float_class_zero, 497 float_class_normal, 498 float_class_inf, 499 float_class_qnan, /* all NaNs from here */ 500 float_class_snan, 501 } FloatClass; 502 503 /* Simple helpers for checking if, or what kind of, NaN we have */ 504 static inline __attribute__((unused)) bool is_nan(FloatClass c) 505 { 506 return unlikely(c >= float_class_qnan); 507 } 508 509 static inline __attribute__((unused)) bool is_snan(FloatClass c) 510 { 511 return c == float_class_snan; 512 } 513 514 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 515 { 516 return c == float_class_qnan; 517 } 518 519 /* 520 * Structure holding all of the decomposed parts of a float. The 521 * exponent is unbiased and the fraction is normalized. All 522 * calculations are done with a 64 bit fraction and then rounded as 523 * appropriate for the final format. 
524 * 525 * Thanks to the packed FloatClass a decent compiler should be able to 526 * fit the whole structure into registers and avoid using the stack 527 * for parameter passing. 528 */ 529 530 typedef struct { 531 uint64_t frac; 532 int32_t exp; 533 FloatClass cls; 534 bool sign; 535 } FloatParts; 536 537 #define DECOMPOSED_BINARY_POINT (64 - 2) 538 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 539 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 540 541 /* Structure holding all of the relevant parameters for a format. 542 * exp_size: the size of the exponent field 543 * exp_bias: the offset applied to the exponent field 544 * exp_max: the maximum normalised exponent 545 * frac_size: the size of the fraction field 546 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 547 * The following are computed based the size of fraction 548 * frac_lsb: least significant bit of fraction 549 * frac_lsbm1: the bit below the least significant bit (for rounding) 550 * round_mask/roundeven_mask: masks used for rounding 551 * The following optional modifiers are available: 552 * arm_althp: handle ARM Alternative Half Precision 553 */ 554 typedef struct { 555 int exp_size; 556 int exp_bias; 557 int exp_max; 558 int frac_size; 559 int frac_shift; 560 uint64_t frac_lsb; 561 uint64_t frac_lsbm1; 562 uint64_t round_mask; 563 uint64_t roundeven_mask; 564 bool arm_althp; 565 } FloatFmt; 566 567 /* Expand fields based on the size of exponent and fraction */ 568 #define FLOAT_PARAMS(E, F) \ 569 .exp_size = E, \ 570 .exp_bias = ((1 << E) - 1) >> 1, \ 571 .exp_max = (1 << E) - 1, \ 572 .frac_size = F, \ 573 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 574 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 575 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 576 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 577 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 578 579 static const FloatFmt 
float16_params = { 580 FLOAT_PARAMS(5, 10) 581 }; 582 583 static const FloatFmt float16_params_ahp = { 584 FLOAT_PARAMS(5, 10), 585 .arm_althp = true 586 }; 587 588 static const FloatFmt float32_params = { 589 FLOAT_PARAMS(8, 23) 590 }; 591 592 static const FloatFmt float64_params = { 593 FLOAT_PARAMS(11, 52) 594 }; 595 596 /* Unpack a float to parts, but do not canonicalize. */ 597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 598 { 599 const int sign_pos = fmt.frac_size + fmt.exp_size; 600 601 return (FloatParts) { 602 .cls = float_class_unclassified, 603 .sign = extract64(raw, sign_pos, 1), 604 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 605 .frac = extract64(raw, 0, fmt.frac_size), 606 }; 607 } 608 609 static inline FloatParts float16_unpack_raw(float16 f) 610 { 611 return unpack_raw(float16_params, f); 612 } 613 614 static inline FloatParts float32_unpack_raw(float32 f) 615 { 616 return unpack_raw(float32_params, f); 617 } 618 619 static inline FloatParts float64_unpack_raw(float64 f) 620 { 621 return unpack_raw(float64_params, f); 622 } 623 624 /* Pack a float from parts, but do not canonicalize. 
*/ 625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 626 { 627 const int sign_pos = fmt.frac_size + fmt.exp_size; 628 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 629 return deposit64(ret, sign_pos, 1, p.sign); 630 } 631 632 static inline float16 float16_pack_raw(FloatParts p) 633 { 634 return make_float16(pack_raw(float16_params, p)); 635 } 636 637 static inline float32 float32_pack_raw(FloatParts p) 638 { 639 return make_float32(pack_raw(float32_params, p)); 640 } 641 642 static inline float64 float64_pack_raw(FloatParts p) 643 { 644 return make_float64(pack_raw(float64_params, p)); 645 } 646 647 /*---------------------------------------------------------------------------- 648 | Functions and definitions to determine: (1) whether tininess for underflow 649 | is detected before or after rounding by default, (2) what (if anything) 650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 652 | are propagated from function inputs to output. These details are target- 653 | specific. 654 *----------------------------------------------------------------------------*/ 655 #include "softfloat-specialize.h" 656 657 /* Canonicalize EXP and FRAC, setting CLS. */ 658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, 659 float_status *status) 660 { 661 if (part.exp == parm->exp_max && !parm->arm_althp) { 662 if (part.frac == 0) { 663 part.cls = float_class_inf; 664 } else { 665 part.frac <<= parm->frac_shift; 666 part.cls = (parts_is_snan_frac(part.frac, status) 667 ? 
float_class_snan : float_class_qnan); 668 } 669 } else if (part.exp == 0) { 670 if (likely(part.frac == 0)) { 671 part.cls = float_class_zero; 672 } else if (status->flush_inputs_to_zero) { 673 float_raise(float_flag_input_denormal, status); 674 part.cls = float_class_zero; 675 part.frac = 0; 676 } else { 677 int shift = clz64(part.frac) - 1; 678 part.cls = float_class_normal; 679 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 680 part.frac <<= shift; 681 } 682 } else { 683 part.cls = float_class_normal; 684 part.exp -= parm->exp_bias; 685 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 686 } 687 return part; 688 } 689 690 /* Round and uncanonicalize a floating-point number by parts. There 691 * are FRAC_SHIFT bits that may require rounding at the bottom of the 692 * fraction; these bits will be removed. The exponent will be biased 693 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 694 */ 695 696 static FloatParts round_canonical(FloatParts p, float_status *s, 697 const FloatFmt *parm) 698 { 699 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 700 const uint64_t round_mask = parm->round_mask; 701 const uint64_t roundeven_mask = parm->roundeven_mask; 702 const int exp_max = parm->exp_max; 703 const int frac_shift = parm->frac_shift; 704 uint64_t frac, inc; 705 int exp, flags = 0; 706 bool overflow_norm; 707 708 frac = p.frac; 709 exp = p.exp; 710 711 switch (p.cls) { 712 case float_class_normal: 713 switch (s->float_rounding_mode) { 714 case float_round_nearest_even: 715 overflow_norm = false; 716 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 717 break; 718 case float_round_ties_away: 719 overflow_norm = false; 720 inc = frac_lsbm1; 721 break; 722 case float_round_to_zero: 723 overflow_norm = true; 724 inc = 0; 725 break; 726 case float_round_up: 727 inc = p.sign ? 0 : round_mask; 728 overflow_norm = p.sign; 729 break; 730 case float_round_down: 731 inc = p.sign ? 
round_mask : 0; 732 overflow_norm = !p.sign; 733 break; 734 default: 735 g_assert_not_reached(); 736 } 737 738 exp += parm->exp_bias; 739 if (likely(exp > 0)) { 740 if (frac & round_mask) { 741 flags |= float_flag_inexact; 742 frac += inc; 743 if (frac & DECOMPOSED_OVERFLOW_BIT) { 744 frac >>= 1; 745 exp++; 746 } 747 } 748 frac >>= frac_shift; 749 750 if (parm->arm_althp) { 751 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 752 if (unlikely(exp > exp_max)) { 753 /* Overflow. Return the maximum normal. */ 754 flags = float_flag_invalid; 755 exp = exp_max; 756 frac = -1; 757 } 758 } else if (unlikely(exp >= exp_max)) { 759 flags |= float_flag_overflow | float_flag_inexact; 760 if (overflow_norm) { 761 exp = exp_max - 1; 762 frac = -1; 763 } else { 764 p.cls = float_class_inf; 765 goto do_inf; 766 } 767 } 768 } else if (s->flush_to_zero) { 769 flags |= float_flag_output_denormal; 770 p.cls = float_class_zero; 771 goto do_zero; 772 } else { 773 bool is_tiny = (s->float_detect_tininess 774 == float_tininess_before_rounding) 775 || (exp < 0) 776 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 777 778 shift64RightJamming(frac, 1 - exp, &frac); 779 if (frac & round_mask) { 780 /* Need to recompute round-to-even. */ 781 if (s->float_rounding_mode == float_round_nearest_even) { 782 inc = ((frac & roundeven_mask) != frac_lsbm1 783 ? frac_lsbm1 : 0); 784 } 785 flags |= float_flag_inexact; 786 frac += inc; 787 } 788 789 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 
1 : 0); 790 frac >>= frac_shift; 791 792 if (is_tiny && (flags & float_flag_inexact)) { 793 flags |= float_flag_underflow; 794 } 795 if (exp == 0 && frac == 0) { 796 p.cls = float_class_zero; 797 } 798 } 799 break; 800 801 case float_class_zero: 802 do_zero: 803 exp = 0; 804 frac = 0; 805 break; 806 807 case float_class_inf: 808 do_inf: 809 assert(!parm->arm_althp); 810 exp = exp_max; 811 frac = 0; 812 break; 813 814 case float_class_qnan: 815 case float_class_snan: 816 assert(!parm->arm_althp); 817 exp = exp_max; 818 frac >>= parm->frac_shift; 819 break; 820 821 default: 822 g_assert_not_reached(); 823 } 824 825 float_raise(flags, s); 826 p.exp = exp; 827 p.frac = frac; 828 return p; 829 } 830 831 /* Explicit FloatFmt version */ 832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 833 const FloatFmt *params) 834 { 835 return sf_canonicalize(float16_unpack_raw(f), params, s); 836 } 837 838 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 839 { 840 return float16a_unpack_canonical(f, s, &float16_params); 841 } 842 843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 844 const FloatFmt *params) 845 { 846 return float16_pack_raw(round_canonical(p, s, params)); 847 } 848 849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 850 { 851 return float16a_round_pack_canonical(p, s, &float16_params); 852 } 853 854 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 855 { 856 return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); 857 } 858 859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 860 { 861 return float32_pack_raw(round_canonical(p, s, &float32_params)); 862 } 863 864 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 865 { 866 return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); 867 } 868 869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 870 { 871 
return float64_pack_raw(round_canonical(p, s, &float64_params)); 872 } 873 874 static FloatParts return_nan(FloatParts a, float_status *s) 875 { 876 switch (a.cls) { 877 case float_class_snan: 878 s->float_exception_flags |= float_flag_invalid; 879 a = parts_silence_nan(a, s); 880 /* fall through */ 881 case float_class_qnan: 882 if (s->default_nan_mode) { 883 return parts_default_nan(s); 884 } 885 break; 886 887 default: 888 g_assert_not_reached(); 889 } 890 return a; 891 } 892 893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 894 { 895 if (is_snan(a.cls) || is_snan(b.cls)) { 896 s->float_exception_flags |= float_flag_invalid; 897 } 898 899 if (s->default_nan_mode) { 900 return parts_default_nan(s); 901 } else { 902 if (pickNaN(a.cls, b.cls, 903 a.frac > b.frac || 904 (a.frac == b.frac && a.sign < b.sign))) { 905 a = b; 906 } 907 if (is_snan(a.cls)) { 908 return parts_silence_nan(a, s); 909 } 910 } 911 return a; 912 } 913 914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 915 bool inf_zero, float_status *s) 916 { 917 int which; 918 919 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 920 s->float_exception_flags |= float_flag_invalid; 921 } 922 923 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 924 925 if (s->default_nan_mode) { 926 /* Note that this check is after pickNaNMulAdd so that function 927 * has an opportunity to set the Invalid flag. 928 */ 929 which = 3; 930 } 931 932 switch (which) { 933 case 0: 934 break; 935 case 1: 936 a = b; 937 break; 938 case 2: 939 a = c; 940 break; 941 case 3: 942 return parts_default_nan(s); 943 default: 944 g_assert_not_reached(); 945 } 946 947 if (is_snan(a.cls)) { 948 return parts_silence_nan(a, s); 949 } 950 return a; 951 } 952 953 /* 954 * Returns the result of adding or subtracting the values of the 955 * floating-point values `a' and `b'. 
The operation is performed 956 * according to the IEC/IEEE Standard for Binary Floating-Point 957 * Arithmetic. 958 */ 959 960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 961 float_status *s) 962 { 963 bool a_sign = a.sign; 964 bool b_sign = b.sign ^ subtract; 965 966 if (a_sign != b_sign) { 967 /* Subtraction */ 968 969 if (a.cls == float_class_normal && b.cls == float_class_normal) { 970 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 971 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 972 a.frac = a.frac - b.frac; 973 } else { 974 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 975 a.frac = b.frac - a.frac; 976 a.exp = b.exp; 977 a_sign ^= 1; 978 } 979 980 if (a.frac == 0) { 981 a.cls = float_class_zero; 982 a.sign = s->float_rounding_mode == float_round_down; 983 } else { 984 int shift = clz64(a.frac) - 1; 985 a.frac = a.frac << shift; 986 a.exp = a.exp - shift; 987 a.sign = a_sign; 988 } 989 return a; 990 } 991 if (is_nan(a.cls) || is_nan(b.cls)) { 992 return pick_nan(a, b, s); 993 } 994 if (a.cls == float_class_inf) { 995 if (b.cls == float_class_inf) { 996 float_raise(float_flag_invalid, s); 997 return parts_default_nan(s); 998 } 999 return a; 1000 } 1001 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1002 a.sign = s->float_rounding_mode == float_round_down; 1003 return a; 1004 } 1005 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1006 b.sign = a_sign ^ 1; 1007 return b; 1008 } 1009 if (b.cls == float_class_zero) { 1010 return a; 1011 } 1012 } else { 1013 /* Addition */ 1014 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1015 if (a.exp > b.exp) { 1016 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1017 } else if (a.exp < b.exp) { 1018 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1019 a.exp = b.exp; 1020 } 1021 a.frac += b.frac; 1022 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1023 shift64RightJamming(a.frac, 1, &a.frac); 1024 a.exp += 1; 1025 } 
1026 return a; 1027 } 1028 if (is_nan(a.cls) || is_nan(b.cls)) { 1029 return pick_nan(a, b, s); 1030 } 1031 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1032 return a; 1033 } 1034 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1035 b.sign = b_sign; 1036 return b; 1037 } 1038 } 1039 g_assert_not_reached(); 1040 } 1041 1042 /* 1043 * Returns the result of adding or subtracting the floating-point 1044 * values `a' and `b'. The operation is performed according to the 1045 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1046 */ 1047 1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1049 { 1050 FloatParts pa = float16_unpack_canonical(a, status); 1051 FloatParts pb = float16_unpack_canonical(b, status); 1052 FloatParts pr = addsub_floats(pa, pb, false, status); 1053 1054 return float16_round_pack_canonical(pr, status); 1055 } 1056 1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1058 { 1059 FloatParts pa = float16_unpack_canonical(a, status); 1060 FloatParts pb = float16_unpack_canonical(b, status); 1061 FloatParts pr = addsub_floats(pa, pb, true, status); 1062 1063 return float16_round_pack_canonical(pr, status); 1064 } 1065 1066 static float32 QEMU_SOFTFLOAT_ATTR 1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1068 { 1069 FloatParts pa = float32_unpack_canonical(a, status); 1070 FloatParts pb = float32_unpack_canonical(b, status); 1071 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1072 1073 return float32_round_pack_canonical(pr, status); 1074 } 1075 1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1077 { 1078 return soft_f32_addsub(a, b, false, status); 1079 } 1080 1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1082 { 1083 return soft_f32_addsub(a, b, true, status); 1084 } 1085 1086 static float64 QEMU_SOFTFLOAT_ATTR 1087 soft_f64_addsub(float64 a, 
float64 b, bool subtract, float_status *status) 1088 { 1089 FloatParts pa = float64_unpack_canonical(a, status); 1090 FloatParts pb = float64_unpack_canonical(b, status); 1091 FloatParts pr = addsub_floats(pa, pb, subtract, status); 1092 1093 return float64_round_pack_canonical(pr, status); 1094 } 1095 1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1097 { 1098 return soft_f64_addsub(a, b, false, status); 1099 } 1100 1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1102 { 1103 return soft_f64_addsub(a, b, true, status); 1104 } 1105 1106 static float hard_f32_add(float a, float b) 1107 { 1108 return a + b; 1109 } 1110 1111 static float hard_f32_sub(float a, float b) 1112 { 1113 return a - b; 1114 } 1115 1116 static double hard_f64_add(double a, double b) 1117 { 1118 return a + b; 1119 } 1120 1121 static double hard_f64_sub(double a, double b) 1122 { 1123 return a - b; 1124 } 1125 1126 static bool f32_addsub_post(union_float32 a, union_float32 b) 1127 { 1128 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1129 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1130 } 1131 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1132 } 1133 1134 static bool f64_addsub_post(union_float64 a, union_float64 b) 1135 { 1136 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1137 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1138 } else { 1139 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1140 } 1141 } 1142 1143 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1144 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1145 { 1146 return float32_gen2(a, b, s, hard, soft, 1147 f32_is_zon2, f32_addsub_post, NULL, NULL); 1148 } 1149 1150 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1151 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1152 { 1153 return float64_gen2(a, b, s, hard, soft, 1154 f64_is_zon2, f64_addsub_post, NULL, NULL); 1155 } 1156 1157 float32 
QEMU_FLATTEN 1158 float32_add(float32 a, float32 b, float_status *s) 1159 { 1160 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1161 } 1162 1163 float32 QEMU_FLATTEN 1164 float32_sub(float32 a, float32 b, float_status *s) 1165 { 1166 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1167 } 1168 1169 float64 QEMU_FLATTEN 1170 float64_add(float64 a, float64 b, float_status *s) 1171 { 1172 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1173 } 1174 1175 float64 QEMU_FLATTEN 1176 float64_sub(float64 a, float64 b, float_status *s) 1177 { 1178 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1179 } 1180 1181 /* 1182 * Returns the result of multiplying the floating-point values `a' and 1183 * `b'. The operation is performed according to the IEC/IEEE Standard 1184 * for Binary Floating-Point Arithmetic. 1185 */ 1186 1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 1188 { 1189 bool sign = a.sign ^ b.sign; 1190 1191 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1192 uint64_t hi, lo; 1193 int exp = a.exp + b.exp; 1194 1195 mul64To128(a.frac, b.frac, &hi, &lo); 1196 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1197 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1198 shift64RightJamming(lo, 1, &lo); 1199 exp += 1; 1200 } 1201 1202 /* Re-use a */ 1203 a.exp = exp; 1204 a.sign = sign; 1205 a.frac = lo; 1206 return a; 1207 } 1208 /* handle all the NaN cases */ 1209 if (is_nan(a.cls) || is_nan(b.cls)) { 1210 return pick_nan(a, b, s); 1211 } 1212 /* Inf * Zero == NaN */ 1213 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 1214 (a.cls == float_class_zero && b.cls == float_class_inf)) { 1215 s->float_exception_flags |= float_flag_invalid; 1216 return parts_default_nan(s); 1217 } 1218 /* Multiply by 0 or Inf */ 1219 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1220 a.sign = sign; 1221 return a; 1222 } 1223 if (b.cls == float_class_inf || b.cls == 
float_class_zero) { 1224 b.sign = sign; 1225 return b; 1226 } 1227 g_assert_not_reached(); 1228 } 1229 1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1231 { 1232 FloatParts pa = float16_unpack_canonical(a, status); 1233 FloatParts pb = float16_unpack_canonical(b, status); 1234 FloatParts pr = mul_floats(pa, pb, status); 1235 1236 return float16_round_pack_canonical(pr, status); 1237 } 1238 1239 static float32 QEMU_SOFTFLOAT_ATTR 1240 soft_f32_mul(float32 a, float32 b, float_status *status) 1241 { 1242 FloatParts pa = float32_unpack_canonical(a, status); 1243 FloatParts pb = float32_unpack_canonical(b, status); 1244 FloatParts pr = mul_floats(pa, pb, status); 1245 1246 return float32_round_pack_canonical(pr, status); 1247 } 1248 1249 static float64 QEMU_SOFTFLOAT_ATTR 1250 soft_f64_mul(float64 a, float64 b, float_status *status) 1251 { 1252 FloatParts pa = float64_unpack_canonical(a, status); 1253 FloatParts pb = float64_unpack_canonical(b, status); 1254 FloatParts pr = mul_floats(pa, pb, status); 1255 1256 return float64_round_pack_canonical(pr, status); 1257 } 1258 1259 static float hard_f32_mul(float a, float b) 1260 { 1261 return a * b; 1262 } 1263 1264 static double hard_f64_mul(double a, double b) 1265 { 1266 return a * b; 1267 } 1268 1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b) 1270 { 1271 return float32_is_zero(a.s) || float32_is_zero(b.s); 1272 } 1273 1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b) 1275 { 1276 return float64_is_zero(a.s) || float64_is_zero(b.s); 1277 } 1278 1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) 1280 { 1281 bool signbit = float32_is_neg(a) ^ float32_is_neg(b); 1282 1283 return float32_set_sign(float32_zero, signbit); 1284 } 1285 1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) 1287 { 1288 bool signbit = float64_is_neg(a) ^ float64_is_neg(b); 1289 1290 return float64_set_sign(float64_zero, signbit); 
1291 } 1292 1293 float32 QEMU_FLATTEN 1294 float32_mul(float32 a, float32 b, float_status *s) 1295 { 1296 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1297 f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); 1298 } 1299 1300 float64 QEMU_FLATTEN 1301 float64_mul(float64 a, float64 b, float_status *s) 1302 { 1303 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1304 f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); 1305 } 1306 1307 /* 1308 * Returns the result of multiplying the floating-point values `a' and 1309 * `b' then adding 'c', with no intermediate rounding step after the 1310 * multiplication. The operation is performed according to the 1311 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 1312 * The flags argument allows the caller to select negation of the 1313 * addend, the intermediate product, or the final result. (The 1314 * difference between this and having the caller do a separate 1315 * negation is that negating externally will flip the sign bit on 1316 * NaNs.) 1317 */ 1318 1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 1320 int flags, float_status *s) 1321 { 1322 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 1323 ((1 << float_class_inf) | (1 << float_class_zero)); 1324 bool p_sign; 1325 bool sign_flip = flags & float_muladd_negate_result; 1326 FloatClass p_class; 1327 uint64_t hi, lo; 1328 int p_exp; 1329 1330 /* It is implementation-defined whether the cases of (0,inf,qnan) 1331 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1332 * they return if they do), so we have to hand this information 1333 * off to the target-specific pick-a-NaN routine. 
1334 */ 1335 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 1336 return pick_nan_muladd(a, b, c, inf_zero, s); 1337 } 1338 1339 if (inf_zero) { 1340 s->float_exception_flags |= float_flag_invalid; 1341 return parts_default_nan(s); 1342 } 1343 1344 if (flags & float_muladd_negate_c) { 1345 c.sign ^= 1; 1346 } 1347 1348 p_sign = a.sign ^ b.sign; 1349 1350 if (flags & float_muladd_negate_product) { 1351 p_sign ^= 1; 1352 } 1353 1354 if (a.cls == float_class_inf || b.cls == float_class_inf) { 1355 p_class = float_class_inf; 1356 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 1357 p_class = float_class_zero; 1358 } else { 1359 p_class = float_class_normal; 1360 } 1361 1362 if (c.cls == float_class_inf) { 1363 if (p_class == float_class_inf && p_sign != c.sign) { 1364 s->float_exception_flags |= float_flag_invalid; 1365 return parts_default_nan(s); 1366 } else { 1367 a.cls = float_class_inf; 1368 a.sign = c.sign ^ sign_flip; 1369 return a; 1370 } 1371 } 1372 1373 if (p_class == float_class_inf) { 1374 a.cls = float_class_inf; 1375 a.sign = p_sign ^ sign_flip; 1376 return a; 1377 } 1378 1379 if (p_class == float_class_zero) { 1380 if (c.cls == float_class_zero) { 1381 if (p_sign != c.sign) { 1382 p_sign = s->float_rounding_mode == float_round_down; 1383 } 1384 c.sign = p_sign; 1385 } else if (flags & float_muladd_halve_result) { 1386 c.exp -= 1; 1387 } 1388 c.sign ^= sign_flip; 1389 return c; 1390 } 1391 1392 /* a & b should be normals now... */ 1393 assert(a.cls == float_class_normal && 1394 b.cls == float_class_normal); 1395 1396 p_exp = a.exp + b.exp; 1397 1398 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 1399 * result. 
1400 */ 1401 mul64To128(a.frac, b.frac, &hi, &lo); 1402 /* binary point now at bit 124 */ 1403 1404 /* check for overflow */ 1405 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 1406 shift128RightJamming(hi, lo, 1, &hi, &lo); 1407 p_exp += 1; 1408 } 1409 1410 /* + add/sub */ 1411 if (c.cls == float_class_zero) { 1412 /* move binary point back to 62 */ 1413 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1414 } else { 1415 int exp_diff = p_exp - c.exp; 1416 if (p_sign == c.sign) { 1417 /* Addition */ 1418 if (exp_diff <= 0) { 1419 shift128RightJamming(hi, lo, 1420 DECOMPOSED_BINARY_POINT - exp_diff, 1421 &hi, &lo); 1422 lo += c.frac; 1423 p_exp = c.exp; 1424 } else { 1425 uint64_t c_hi, c_lo; 1426 /* shift c to the same binary point as the product (124) */ 1427 c_hi = c.frac >> 2; 1428 c_lo = 0; 1429 shift128RightJamming(c_hi, c_lo, 1430 exp_diff, 1431 &c_hi, &c_lo); 1432 add128(hi, lo, c_hi, c_lo, &hi, &lo); 1433 /* move binary point back to 62 */ 1434 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1435 } 1436 1437 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1438 shift64RightJamming(lo, 1, &lo); 1439 p_exp += 1; 1440 } 1441 1442 } else { 1443 /* Subtraction */ 1444 uint64_t c_hi, c_lo; 1445 /* make C binary point match product at bit 124 */ 1446 c_hi = c.frac >> 2; 1447 c_lo = 0; 1448 1449 if (exp_diff <= 0) { 1450 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1451 if (exp_diff == 0 1452 && 1453 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1454 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1455 } else { 1456 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1457 p_sign ^= 1; 1458 p_exp = c.exp; 1459 } 1460 } else { 1461 shift128RightJamming(c_hi, c_lo, 1462 exp_diff, 1463 &c_hi, &c_lo); 1464 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1465 } 1466 1467 if (hi == 0 && lo == 0) { 1468 a.cls = float_class_zero; 1469 a.sign = s->float_rounding_mode == float_round_down; 1470 a.sign ^= sign_flip; 1471 return a; 1472 } else { 1473 int shift; 1474 if (hi 
!= 0) { 1475 shift = clz64(hi); 1476 } else { 1477 shift = clz64(lo) + 64; 1478 } 1479 /* Normalizing to a binary point of 124 is the 1480 correct adjust for the exponent. However since we're 1481 shifting, we might as well put the binary point back 1482 at 62 where we really want it. Therefore shift as 1483 if we're leaving 1 bit at the top of the word, but 1484 adjust the exponent as if we're leaving 3 bits. */ 1485 shift -= 1; 1486 if (shift >= 64) { 1487 lo = lo << (shift - 64); 1488 } else { 1489 hi = (hi << shift) | (lo >> (64 - shift)); 1490 lo = hi | ((lo << shift) != 0); 1491 } 1492 p_exp -= shift - 2; 1493 } 1494 } 1495 } 1496 1497 if (flags & float_muladd_halve_result) { 1498 p_exp -= 1; 1499 } 1500 1501 /* finally prepare our result */ 1502 a.cls = float_class_normal; 1503 a.sign = p_sign ^ sign_flip; 1504 a.exp = p_exp; 1505 a.frac = lo; 1506 1507 return a; 1508 } 1509 1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1511 int flags, float_status *status) 1512 { 1513 FloatParts pa = float16_unpack_canonical(a, status); 1514 FloatParts pb = float16_unpack_canonical(b, status); 1515 FloatParts pc = float16_unpack_canonical(c, status); 1516 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1517 1518 return float16_round_pack_canonical(pr, status); 1519 } 1520 1521 static float32 QEMU_SOFTFLOAT_ATTR 1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1523 float_status *status) 1524 { 1525 FloatParts pa = float32_unpack_canonical(a, status); 1526 FloatParts pb = float32_unpack_canonical(b, status); 1527 FloatParts pc = float32_unpack_canonical(c, status); 1528 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1529 1530 return float32_round_pack_canonical(pr, status); 1531 } 1532 1533 static float64 QEMU_SOFTFLOAT_ATTR 1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1535 float_status *status) 1536 { 1537 FloatParts pa = float64_unpack_canonical(a, status); 1538 FloatParts pb = 
float64_unpack_canonical(b, status); 1539 FloatParts pc = float64_unpack_canonical(c, status); 1540 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1541 1542 return float64_round_pack_canonical(pr, status); 1543 } 1544 1545 static bool force_soft_fma; 1546 1547 float32 QEMU_FLATTEN 1548 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1549 { 1550 union_float32 ua, ub, uc, ur; 1551 1552 ua.s = xa; 1553 ub.s = xb; 1554 uc.s = xc; 1555 1556 if (unlikely(!can_use_fpu(s))) { 1557 goto soft; 1558 } 1559 if (unlikely(flags & float_muladd_halve_result)) { 1560 goto soft; 1561 } 1562 1563 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1564 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1565 goto soft; 1566 } 1567 1568 if (unlikely(force_soft_fma)) { 1569 goto soft; 1570 } 1571 1572 /* 1573 * When (a || b) == 0, there's no need to check for under/over flow, 1574 * since we know the addend is (normal || 0) and the product is 0. 1575 */ 1576 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1577 union_float32 up; 1578 bool prod_sign; 1579 1580 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1581 prod_sign ^= !!(flags & float_muladd_negate_product); 1582 up.s = float32_set_sign(float32_zero, prod_sign); 1583 1584 if (flags & float_muladd_negate_c) { 1585 uc.h = -uc.h; 1586 } 1587 ur.h = up.h + uc.h; 1588 } else { 1589 if (flags & float_muladd_negate_product) { 1590 ua.h = -ua.h; 1591 } 1592 if (flags & float_muladd_negate_c) { 1593 uc.h = -uc.h; 1594 } 1595 1596 ur.h = fmaf(ua.h, ub.h, uc.h); 1597 1598 if (unlikely(f32_is_inf(ur))) { 1599 s->float_exception_flags |= float_flag_overflow; 1600 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1601 goto soft; 1602 } 1603 } 1604 if (flags & float_muladd_negate_result) { 1605 return float32_chs(ur.s); 1606 } 1607 return ur.s; 1608 1609 soft: 1610 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1611 } 1612 1613 float64 QEMU_FLATTEN 1614 float64_muladd(float64 xa, float64 xb, float64 xc, 
int flags, float_status *s) 1615 { 1616 union_float64 ua, ub, uc, ur; 1617 1618 ua.s = xa; 1619 ub.s = xb; 1620 uc.s = xc; 1621 1622 if (unlikely(!can_use_fpu(s))) { 1623 goto soft; 1624 } 1625 if (unlikely(flags & float_muladd_halve_result)) { 1626 goto soft; 1627 } 1628 1629 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1630 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1631 goto soft; 1632 } 1633 1634 if (unlikely(force_soft_fma)) { 1635 goto soft; 1636 } 1637 1638 /* 1639 * When (a || b) == 0, there's no need to check for under/over flow, 1640 * since we know the addend is (normal || 0) and the product is 0. 1641 */ 1642 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1643 union_float64 up; 1644 bool prod_sign; 1645 1646 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1647 prod_sign ^= !!(flags & float_muladd_negate_product); 1648 up.s = float64_set_sign(float64_zero, prod_sign); 1649 1650 if (flags & float_muladd_negate_c) { 1651 uc.h = -uc.h; 1652 } 1653 ur.h = up.h + uc.h; 1654 } else { 1655 if (flags & float_muladd_negate_product) { 1656 ua.h = -ua.h; 1657 } 1658 if (flags & float_muladd_negate_c) { 1659 uc.h = -uc.h; 1660 } 1661 1662 ur.h = fma(ua.h, ub.h, uc.h); 1663 1664 if (unlikely(f64_is_inf(ur))) { 1665 s->float_exception_flags |= float_flag_overflow; 1666 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1667 goto soft; 1668 } 1669 } 1670 if (flags & float_muladd_negate_result) { 1671 return float64_chs(ur.s); 1672 } 1673 return ur.s; 1674 1675 soft: 1676 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1677 } 1678 1679 /* 1680 * Returns the result of dividing the floating-point value `a' by the 1681 * corresponding value `b'. The operation is performed according to 1682 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1683 */ 1684 1685 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1686 { 1687 bool sign = a.sign ^ b.sign; 1688 1689 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1690 uint64_t n0, n1, q, r; 1691 int exp = a.exp - b.exp; 1692 1693 /* 1694 * We want a 2*N / N-bit division to produce exactly an N-bit 1695 * result, so that we do not lose any precision and so that we 1696 * do not have to renormalize afterward. If A.frac < B.frac, 1697 * then division would produce an (N-1)-bit result; shift A left 1698 * by one to produce the an N-bit result, and decrement the 1699 * exponent to match. 1700 * 1701 * The udiv_qrnnd algorithm that we're using requires normalization, 1702 * i.e. the msb of the denominator must be set. Since we know that 1703 * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left 1704 * by one (more), and the remainder must be shifted right by one. 1705 */ 1706 if (a.frac < b.frac) { 1707 exp -= 1; 1708 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0); 1709 } else { 1710 shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0); 1711 } 1712 q = udiv_qrnnd(&r, n1, n0, b.frac << 1); 1713 1714 /* 1715 * Set lsb if there is a remainder, to set inexact. 1716 * As mentioned above, to find the actual value of the remainder we 1717 * would need to shift right, but (1) we are only concerned about 1718 * non-zero-ness, and (2) the remainder will always be even because 1719 * both inputs to the division primitive are even. 
1720 */ 1721 a.frac = q | (r != 0); 1722 a.sign = sign; 1723 a.exp = exp; 1724 return a; 1725 } 1726 /* handle all the NaN cases */ 1727 if (is_nan(a.cls) || is_nan(b.cls)) { 1728 return pick_nan(a, b, s); 1729 } 1730 /* 0/0 or Inf/Inf */ 1731 if (a.cls == b.cls 1732 && 1733 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1734 s->float_exception_flags |= float_flag_invalid; 1735 return parts_default_nan(s); 1736 } 1737 /* Inf / x or 0 / x */ 1738 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1739 a.sign = sign; 1740 return a; 1741 } 1742 /* Div 0 => Inf */ 1743 if (b.cls == float_class_zero) { 1744 s->float_exception_flags |= float_flag_divbyzero; 1745 a.cls = float_class_inf; 1746 a.sign = sign; 1747 return a; 1748 } 1749 /* Div by Inf */ 1750 if (b.cls == float_class_inf) { 1751 a.cls = float_class_zero; 1752 a.sign = sign; 1753 return a; 1754 } 1755 g_assert_not_reached(); 1756 } 1757 1758 float16 float16_div(float16 a, float16 b, float_status *status) 1759 { 1760 FloatParts pa = float16_unpack_canonical(a, status); 1761 FloatParts pb = float16_unpack_canonical(b, status); 1762 FloatParts pr = div_floats(pa, pb, status); 1763 1764 return float16_round_pack_canonical(pr, status); 1765 } 1766 1767 static float32 QEMU_SOFTFLOAT_ATTR 1768 soft_f32_div(float32 a, float32 b, float_status *status) 1769 { 1770 FloatParts pa = float32_unpack_canonical(a, status); 1771 FloatParts pb = float32_unpack_canonical(b, status); 1772 FloatParts pr = div_floats(pa, pb, status); 1773 1774 return float32_round_pack_canonical(pr, status); 1775 } 1776 1777 static float64 QEMU_SOFTFLOAT_ATTR 1778 soft_f64_div(float64 a, float64 b, float_status *status) 1779 { 1780 FloatParts pa = float64_unpack_canonical(a, status); 1781 FloatParts pb = float64_unpack_canonical(b, status); 1782 FloatParts pr = div_floats(pa, pb, status); 1783 1784 return float64_round_pack_canonical(pr, status); 1785 } 1786 1787 static float hard_f32_div(float a, float b) 1788 { 1789 return a 
/ b; 1790 } 1791 1792 static double hard_f64_div(double a, double b) 1793 { 1794 return a / b; 1795 } 1796 1797 static bool f32_div_pre(union_float32 a, union_float32 b) 1798 { 1799 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1800 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1801 fpclassify(b.h) == FP_NORMAL; 1802 } 1803 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1804 } 1805 1806 static bool f64_div_pre(union_float64 a, union_float64 b) 1807 { 1808 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1809 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1810 fpclassify(b.h) == FP_NORMAL; 1811 } 1812 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1813 } 1814 1815 static bool f32_div_post(union_float32 a, union_float32 b) 1816 { 1817 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1818 return fpclassify(a.h) != FP_ZERO; 1819 } 1820 return !float32_is_zero(a.s); 1821 } 1822 1823 static bool f64_div_post(union_float64 a, union_float64 b) 1824 { 1825 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1826 return fpclassify(a.h) != FP_ZERO; 1827 } 1828 return !float64_is_zero(a.s); 1829 } 1830 1831 float32 QEMU_FLATTEN 1832 float32_div(float32 a, float32 b, float_status *s) 1833 { 1834 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1835 f32_div_pre, f32_div_post, NULL, NULL); 1836 } 1837 1838 float64 QEMU_FLATTEN 1839 float64_div(float64 a, float64 b, float_status *s) 1840 { 1841 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1842 f64_div_pre, f64_div_post, NULL, NULL); 1843 } 1844 1845 /* 1846 * Float to Float conversions 1847 * 1848 * Returns the result of converting one float format to another. The 1849 * conversion is performed according to the IEC/IEEE Standard for 1850 * Binary Floating-Point Arithmetic. 1851 * 1852 * The float_to_float helper only needs to take care of raising 1853 * invalid exceptions and handling the conversion on NaNs. 
1854 */ 1855 1856 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1857 float_status *s) 1858 { 1859 if (dstf->arm_althp) { 1860 switch (a.cls) { 1861 case float_class_qnan: 1862 case float_class_snan: 1863 /* There is no NaN in the destination format. Raise Invalid 1864 * and return a zero with the sign of the input NaN. 1865 */ 1866 s->float_exception_flags |= float_flag_invalid; 1867 a.cls = float_class_zero; 1868 a.frac = 0; 1869 a.exp = 0; 1870 break; 1871 1872 case float_class_inf: 1873 /* There is no Inf in the destination format. Raise Invalid 1874 * and return the maximum normal with the correct sign. 1875 */ 1876 s->float_exception_flags |= float_flag_invalid; 1877 a.cls = float_class_normal; 1878 a.exp = dstf->exp_max; 1879 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1880 break; 1881 1882 default: 1883 break; 1884 } 1885 } else if (is_nan(a.cls)) { 1886 if (is_snan(a.cls)) { 1887 s->float_exception_flags |= float_flag_invalid; 1888 a = parts_silence_nan(a, s); 1889 } 1890 if (s->default_nan_mode) { 1891 return parts_default_nan(s); 1892 } 1893 } 1894 return a; 1895 } 1896 1897 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1898 { 1899 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1900 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1901 FloatParts pr = float_to_float(p, &float32_params, s); 1902 return float32_round_pack_canonical(pr, s); 1903 } 1904 1905 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1906 { 1907 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1908 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1909 FloatParts pr = float_to_float(p, &float64_params, s); 1910 return float64_round_pack_canonical(pr, s); 1911 } 1912 1913 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1914 { 1915 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1916 FloatParts p = float32_unpack_canonical(a, s); 1917 FloatParts pr = float_to_float(p, fmt16, s); 1918 return float16a_round_pack_canonical(pr, s, fmt16); 1919 } 1920 1921 float64 float32_to_float64(float32 a, float_status *s) 1922 { 1923 FloatParts p = float32_unpack_canonical(a, s); 1924 FloatParts pr = float_to_float(p, &float64_params, s); 1925 return float64_round_pack_canonical(pr, s); 1926 } 1927 1928 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1929 { 1930 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1931 FloatParts p = float64_unpack_canonical(a, s); 1932 FloatParts pr = float_to_float(p, fmt16, s); 1933 return float16a_round_pack_canonical(pr, s, fmt16); 1934 } 1935 1936 float32 float64_to_float32(float64 a, float_status *s) 1937 { 1938 FloatParts p = float64_unpack_canonical(a, s); 1939 FloatParts pr = float_to_float(p, &float32_params, s); 1940 return float32_round_pack_canonical(pr, s); 1941 } 1942 1943 /* 1944 * Rounds the floating-point value `a' to an integer, and returns the 1945 * result as a floating-point value. The operation is performed 1946 * according to the IEC/IEEE Standard for Binary Floating-Point 1947 * Arithmetic. 
1948 */ 1949 1950 static FloatParts round_to_int(FloatParts a, int rmode, 1951 int scale, float_status *s) 1952 { 1953 switch (a.cls) { 1954 case float_class_qnan: 1955 case float_class_snan: 1956 return return_nan(a, s); 1957 1958 case float_class_zero: 1959 case float_class_inf: 1960 /* already "integral" */ 1961 break; 1962 1963 case float_class_normal: 1964 scale = MIN(MAX(scale, -0x10000), 0x10000); 1965 a.exp += scale; 1966 1967 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1968 /* already integral */ 1969 break; 1970 } 1971 if (a.exp < 0) { 1972 bool one; 1973 /* all fractional */ 1974 s->float_exception_flags |= float_flag_inexact; 1975 switch (rmode) { 1976 case float_round_nearest_even: 1977 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1978 break; 1979 case float_round_ties_away: 1980 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1981 break; 1982 case float_round_to_zero: 1983 one = false; 1984 break; 1985 case float_round_up: 1986 one = !a.sign; 1987 break; 1988 case float_round_down: 1989 one = a.sign; 1990 break; 1991 default: 1992 g_assert_not_reached(); 1993 } 1994 1995 if (one) { 1996 a.frac = DECOMPOSED_IMPLICIT_BIT; 1997 a.exp = 0; 1998 } else { 1999 a.cls = float_class_zero; 2000 } 2001 } else { 2002 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2003 uint64_t frac_lsbm1 = frac_lsb >> 1; 2004 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2005 uint64_t rnd_mask = rnd_even_mask >> 1; 2006 uint64_t inc; 2007 2008 switch (rmode) { 2009 case float_round_nearest_even: 2010 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2011 break; 2012 case float_round_ties_away: 2013 inc = frac_lsbm1; 2014 break; 2015 case float_round_to_zero: 2016 inc = 0; 2017 break; 2018 case float_round_up: 2019 inc = a.sign ? 0 : rnd_mask; 2020 break; 2021 case float_round_down: 2022 inc = a.sign ? 
rnd_mask : 0; 2023 break; 2024 default: 2025 g_assert_not_reached(); 2026 } 2027 2028 if (a.frac & rnd_mask) { 2029 s->float_exception_flags |= float_flag_inexact; 2030 a.frac += inc; 2031 a.frac &= ~rnd_mask; 2032 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 2033 a.frac >>= 1; 2034 a.exp++; 2035 } 2036 } 2037 } 2038 break; 2039 default: 2040 g_assert_not_reached(); 2041 } 2042 return a; 2043 } 2044 2045 float16 float16_round_to_int(float16 a, float_status *s) 2046 { 2047 FloatParts pa = float16_unpack_canonical(a, s); 2048 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2049 return float16_round_pack_canonical(pr, s); 2050 } 2051 2052 float32 float32_round_to_int(float32 a, float_status *s) 2053 { 2054 FloatParts pa = float32_unpack_canonical(a, s); 2055 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2056 return float32_round_pack_canonical(pr, s); 2057 } 2058 2059 float64 float64_round_to_int(float64 a, float_status *s) 2060 { 2061 FloatParts pa = float64_unpack_canonical(a, s); 2062 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2063 return float64_round_pack_canonical(pr, s); 2064 } 2065 2066 /* 2067 * Returns the result of converting the floating-point value `a' to 2068 * the two's complement integer format. The conversion is performed 2069 * according to the IEC/IEEE Standard for Binary Floating-Point 2070 * Arithmetic---which means in particular that the conversion is 2071 * rounded according to the current rounding mode. If `a' is a NaN, 2072 * the largest positive integer is returned. Otherwise, if the 2073 * conversion overflows, the largest integer with the same sign as `a' 2074 * is returned. 
2075 */ 2076 2077 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 2078 int64_t min, int64_t max, 2079 float_status *s) 2080 { 2081 uint64_t r; 2082 int orig_flags = get_float_exception_flags(s); 2083 FloatParts p = round_to_int(in, rmode, scale, s); 2084 2085 switch (p.cls) { 2086 case float_class_snan: 2087 case float_class_qnan: 2088 s->float_exception_flags = orig_flags | float_flag_invalid; 2089 return max; 2090 case float_class_inf: 2091 s->float_exception_flags = orig_flags | float_flag_invalid; 2092 return p.sign ? min : max; 2093 case float_class_zero: 2094 return 0; 2095 case float_class_normal: 2096 if (p.exp < DECOMPOSED_BINARY_POINT) { 2097 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2098 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2099 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2100 } else { 2101 r = UINT64_MAX; 2102 } 2103 if (p.sign) { 2104 if (r <= -(uint64_t) min) { 2105 return -r; 2106 } else { 2107 s->float_exception_flags = orig_flags | float_flag_invalid; 2108 return min; 2109 } 2110 } else { 2111 if (r <= max) { 2112 return r; 2113 } else { 2114 s->float_exception_flags = orig_flags | float_flag_invalid; 2115 return max; 2116 } 2117 } 2118 default: 2119 g_assert_not_reached(); 2120 } 2121 } 2122 2123 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 2124 float_status *s) 2125 { 2126 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2127 rmode, scale, INT16_MIN, INT16_MAX, s); 2128 } 2129 2130 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 2131 float_status *s) 2132 { 2133 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2134 rmode, scale, INT32_MIN, INT32_MAX, s); 2135 } 2136 2137 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 2138 float_status *s) 2139 { 2140 return round_to_int_and_pack(float16_unpack_canonical(a, s), 2141 rmode, scale, INT64_MIN, INT64_MAX, s); 2142 } 2143 2144 int16_t float32_to_int16_scalbn(float32 a, int 
rmode, int scale, 2145 float_status *s) 2146 { 2147 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2148 rmode, scale, INT16_MIN, INT16_MAX, s); 2149 } 2150 /* The remaining floatN_to_intM_scalbn variants differ only in the unpack routine and in the integer bounds handed to round_to_int_and_pack(). */ 2151 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 2152 float_status *s) 2153 { 2154 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2155 rmode, scale, INT32_MIN, INT32_MAX, s); 2156 } 2157 2158 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 2159 float_status *s) 2160 { 2161 return round_to_int_and_pack(float32_unpack_canonical(a, s), 2162 rmode, scale, INT64_MIN, INT64_MAX, s); 2163 } 2164 2165 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 2166 float_status *s) 2167 { 2168 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2169 rmode, scale, INT16_MIN, INT16_MAX, s); 2170 } 2171 2172 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 2173 float_status *s) 2174 { 2175 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2176 rmode, scale, INT32_MIN, INT32_MAX, s); 2177 } 2178 2179 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 2180 float_status *s) 2181 { 2182 return round_to_int_and_pack(float64_unpack_canonical(a, s), 2183 rmode, scale, INT64_MIN, INT64_MAX, s); 2184 } 2185 /* floatN_to_intM: convert using the rounding mode currently held in *s, with a scale of 0. */ 2186 int16_t float16_to_int16(float16 a, float_status *s) 2187 { 2188 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2189 } 2190 2191 int32_t float16_to_int32(float16 a, float_status *s) 2192 { 2193 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2194 } 2195 2196 int64_t float16_to_int64(float16 a, float_status *s) 2197 { 2198 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2199 } 2200 2201 int16_t float32_to_int16(float32 a, float_status *s) 2202 { 2203 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2204 } 2205 2206 int32_t float32_to_int32(float32 a, float_status *s) 2207 { 2208 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, 
s); 2209 } 2210 2211 int64_t float32_to_int64(float32 a, float_status *s) 2212 { 2213 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2214 } 2215 2216 int16_t float64_to_int16(float64 a, float_status *s) 2217 { 2218 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2219 } 2220 2221 int32_t float64_to_int32(float64 a, float_status *s) 2222 { 2223 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2224 } 2225 2226 int64_t float64_to_int64(float64 a, float_status *s) 2227 { 2228 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2229 } 2230 /* floatN_to_intM_round_to_zero: convert with float_round_to_zero and a scale of 0, whatever rounding mode is held in *s. */ 2231 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2232 { 2233 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2234 } 2235 2236 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2237 { 2238 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2239 } 2240 2241 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2242 { 2243 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2244 } 2245 2246 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2247 { 2248 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2249 } 2250 2251 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2252 { 2253 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2254 } 2255 2256 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2257 { 2258 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2259 } 2260 2261 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2262 { 2263 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2264 } 2265 2266 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2267 { 2268 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2269 } 2270 2271 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2272 { 2273 return float64_to_int64_scalbn(a, 
float_round_to_zero, 0, s); 2274 } 2275 2276 /* 2277 * Returns the result of converting the floating-point value `a' to 2278 * the unsigned integer format. The conversion is performed according 2279 * to the IEC/IEEE Standard for Binary Floating-Point 2280 * Arithmetic---which means in particular that the conversion is 2281 * rounded according to the current rounding mode. If `a' is a NaN, 2282 * the largest unsigned integer is returned. Otherwise, if the 2283 * conversion overflows, the largest unsigned integer is returned. If 2284 * `a' is negative, the result is rounded and zero is returned; 2285 * values that do not round to zero will raise the invalid exception 2286 * flag. 2287 */ 2288 /* Shared worker for the float-to-unsigned-integer conversions. Every saturating path stores the flags captured before rounding plus float_flag_invalid, so flags raised by round_to_int() are dropped on those paths. */ 2289 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 2290 uint64_t max, float_status *s) 2291 { 2292 int orig_flags = get_float_exception_flags(s); 2293 FloatParts p = round_to_int(in, rmode, scale, s); 2294 uint64_t r; 2295 2296 switch (p.cls) { 2297 case float_class_snan: 2298 case float_class_qnan: 2299 s->float_exception_flags = orig_flags | float_flag_invalid; 2300 return max; 2301 case float_class_inf: 2302 s->float_exception_flags = orig_flags | float_flag_invalid; 2303 return p.sign ? 0 : max; 2304 case float_class_zero: 2305 return 0; 2306 case float_class_normal: 2307 if (p.sign) { /* negative and still non-zero after rounding: not representable as unsigned */ 2308 s->float_exception_flags = orig_flags | float_flag_invalid; 2309 return 0; 2310 } 2311 2312 if (p.exp < DECOMPOSED_BINARY_POINT) { 2313 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2314 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 2315 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 2316 } else { 2317 s->float_exception_flags = orig_flags | float_flag_invalid; 2318 return max; 2319 } 2320 2321 /* For uint64 this will never trip, but if p.exp is too large 2322 * to shift a decomposed fraction we shall have exited via the 2323 * 3rd leg above. 
2324 */ 2325 if (r > max) { 2326 s->float_exception_flags = orig_flags | float_flag_invalid; 2327 return max; 2328 } 2329 return r; 2330 default: 2331 g_assert_not_reached(); 2332 } 2333 } 2334 /* floatN_to_uintM_scalbn: convert with an explicit rounding mode and scale, saturating at UINTM_MAX. */ 2335 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 2336 float_status *s) 2337 { 2338 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2339 rmode, scale, UINT16_MAX, s); 2340 } 2341 2342 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 2343 float_status *s) 2344 { 2345 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2346 rmode, scale, UINT32_MAX, s); 2347 } 2348 2349 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 2350 float_status *s) 2351 { 2352 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 2353 rmode, scale, UINT64_MAX, s); 2354 } 2355 2356 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 2357 float_status *s) 2358 { 2359 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2360 rmode, scale, UINT16_MAX, s); 2361 } 2362 2363 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 2364 float_status *s) 2365 { 2366 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2367 rmode, scale, UINT32_MAX, s); 2368 } 2369 2370 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale, 2371 float_status *s) 2372 { 2373 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 2374 rmode, scale, UINT64_MAX, s); 2375 } 2376 2377 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 2378 float_status *s) 2379 { 2380 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2381 rmode, scale, UINT16_MAX, s); 2382 } 2383 2384 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 2385 float_status *s) 2386 { 2387 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2388 rmode, scale, UINT32_MAX, s); 2389 } 2390 2391 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 2392 
float_status *s) 2393 { 2394 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 2395 rmode, scale, UINT64_MAX, s); 2396 } 2397 /* floatN_to_uintM: convert using the rounding mode currently held in *s, with a scale of 0. */ 2398 uint16_t float16_to_uint16(float16 a, float_status *s) 2399 { 2400 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2401 } 2402 2403 uint32_t float16_to_uint32(float16 a, float_status *s) 2404 { 2405 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2406 } 2407 2408 uint64_t float16_to_uint64(float16 a, float_status *s) 2409 { 2410 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2411 } 2412 2413 uint16_t float32_to_uint16(float32 a, float_status *s) 2414 { 2415 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2416 } 2417 2418 uint32_t float32_to_uint32(float32 a, float_status *s) 2419 { 2420 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2421 } 2422 2423 uint64_t float32_to_uint64(float32 a, float_status *s) 2424 { 2425 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2426 } 2427 2428 uint16_t float64_to_uint16(float64 a, float_status *s) 2429 { 2430 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2431 } 2432 2433 uint32_t float64_to_uint32(float64 a, float_status *s) 2434 { 2435 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2436 } 2437 2438 uint64_t float64_to_uint64(float64 a, float_status *s) 2439 { 2440 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2441 } 2442 /* floatN_to_uintM_round_to_zero: convert with float_round_to_zero and a scale of 0, whatever rounding mode is held in *s. */ 2443 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2444 { 2445 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2446 } 2447 2448 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2449 { 2450 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2451 } 2452 2453 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2454 { 2455 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2456 } 2457 2458 uint16_t 
float32_to_uint16_round_to_zero(float32 a, float_status *s) 2459 { 2460 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2461 } 2462 2463 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2464 { 2465 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2466 } 2467 2468 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2469 { 2470 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2471 } 2472 2473 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2474 { 2475 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2476 } 2477 2478 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2479 { 2480 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2481 } 2482 2483 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2484 { 2485 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2486 } 2487 2488 /* 2489 * Integer to float conversions 2490 * 2491 * Returns the result of converting the two's complement integer `a' 2492 * to the floating-point format. The conversion is performed according 2493 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2494 */ 2495 /* Builds the decomposed form of the integer scaled by 2**scale, with scale clamped to [-0x10000, 0x10000]. The status argument is not used in this function; rounding to the target format is left to the round_pack_canonical call made by each caller. */ 2496 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 2497 { 2498 FloatParts r = { .sign = false }; 2499 2500 if (a == 0) { 2501 r.cls = float_class_zero; 2502 } else { 2503 uint64_t f = a; 2504 int shift; 2505 2506 r.cls = float_class_normal; 2507 if (a < 0) { 2508 f = -f; /* negate in unsigned arithmetic so the most negative input does not overflow */ 2509 r.sign = true; 2510 } 2511 shift = clz64(f) - 1; /* NOTE(review): presumably negative only when bit 63 of f is set, assuming clz64 counts leading zero bits - confirm */ 2512 scale = MIN(MAX(scale, -0x10000), 0x10000); 2513 2514 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2515 r.frac = (shift < 0 ? 
DECOMPOSED_IMPLICIT_BIT : f << shift); 2516 } 2517 2518 return r; 2519 } 2520 /* intM_to_floatN and intM_to_floatN_scalbn: only the int64 scalbn variant calls int_to_float() and packs the result; the narrower variants forward to it, and the non-scalbn forms pass a scale of 0. */ 2521 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2522 { 2523 FloatParts pa = int_to_float(a, scale, status); 2524 return float16_round_pack_canonical(pa, status); 2525 } 2526 2527 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2528 { 2529 return int64_to_float16_scalbn(a, scale, status); 2530 } 2531 2532 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2533 { 2534 return int64_to_float16_scalbn(a, scale, status); 2535 } 2536 2537 float16 int64_to_float16(int64_t a, float_status *status) 2538 { 2539 return int64_to_float16_scalbn(a, 0, status); 2540 } 2541 2542 float16 int32_to_float16(int32_t a, float_status *status) 2543 { 2544 return int64_to_float16_scalbn(a, 0, status); 2545 } 2546 2547 float16 int16_to_float16(int16_t a, float_status *status) 2548 { 2549 return int64_to_float16_scalbn(a, 0, status); 2550 } 2551 2552 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2553 { 2554 FloatParts pa = int_to_float(a, scale, status); 2555 return float32_round_pack_canonical(pa, status); 2556 } 2557 2558 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2559 { 2560 return int64_to_float32_scalbn(a, scale, status); 2561 } 2562 2563 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2564 { 2565 return int64_to_float32_scalbn(a, scale, status); 2566 } 2567 2568 float32 int64_to_float32(int64_t a, float_status *status) 2569 { 2570 return int64_to_float32_scalbn(a, 0, status); 2571 } 2572 2573 float32 int32_to_float32(int32_t a, float_status *status) 2574 { 2575 return int64_to_float32_scalbn(a, 0, status); 2576 } 2577 2578 float32 int16_to_float32(int16_t a, float_status *status) 2579 { 2580 return int64_to_float32_scalbn(a, 0, status); 2581 } 2582 2583 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2584 { 
2585 FloatParts pa = int_to_float(a, scale, status); 2586 return float64_round_pack_canonical(pa, status); 2587 } 2588 /* The narrower and the unscaled float64 entry points forward to int64_to_float64_scalbn(); the unscaled ones pass a scale of 0. */ 2589 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2590 { 2591 return int64_to_float64_scalbn(a, scale, status); 2592 } 2593 2594 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2595 { 2596 return int64_to_float64_scalbn(a, scale, status); 2597 } 2598 2599 float64 int64_to_float64(int64_t a, float_status *status) 2600 { 2601 return int64_to_float64_scalbn(a, 0, status); 2602 } 2603 2604 float64 int32_to_float64(int32_t a, float_status *status) 2605 { 2606 return int64_to_float64_scalbn(a, 0, status); 2607 } 2608 2609 float64 int16_to_float64(int16_t a, float_status *status) 2610 { 2611 return int64_to_float64_scalbn(a, 0, status); 2612 } 2613 2614 2615 /* 2616 * Unsigned Integer to float conversions 2617 * 2618 * Returns the result of converting the unsigned integer `a' to the 2619 * floating-point format. The conversion is performed according to the 2620 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2621 */ 2622 /* Builds the decomposed form of the unsigned integer scaled by 2**scale, with scale clamped to [-0x10000, 0x10000]. The status argument is not used in this function; rounding to the target format is left to the round_pack_canonical call made by each caller. */ 2623 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 2624 { 2625 FloatParts r = { .sign = false }; 2626 2627 if (a == 0) { 2628 r.cls = float_class_zero; 2629 } else { 2630 scale = MIN(MAX(scale, -0x10000), 0x10000); 2631 r.cls = float_class_normal; 2632 if ((int64_t)a < 0) { /* bit 63 is set: the value has to move right by one bit instead of left */ 2633 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 2634 shift64RightJamming(a, 1, &a); /* NOTE(review): presumably keeps the shifted-out bit as a sticky bit - confirm against shift64RightJamming */ 2635 r.frac = a; 2636 } else { 2637 int shift = clz64(a) - 1; 2638 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2639 r.frac = a << shift; 2640 } 2641 } 2642 2643 return r; 2644 } 2645 /* uintM_to_floatN and uintM_to_floatN_scalbn: only the uint64 scalbn variant calls uint_to_float() and packs the result; the narrower variants forward to it, and the non-scalbn forms pass a scale of 0. */ 2646 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 2647 { 2648 FloatParts pa = uint_to_float(a, scale, status); 2649 return float16_round_pack_canonical(pa, status); 2650 } 2651 2652 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 2653 { 2654 return uint64_to_float16_scalbn(a, scale, status); 2655 } 2656 2657 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2658 { 2659 return uint64_to_float16_scalbn(a, scale, status); 2660 } 2661 2662 float16 uint64_to_float16(uint64_t a, float_status *status) 2663 { 2664 return uint64_to_float16_scalbn(a, 0, status); 2665 } 2666 2667 float16 uint32_to_float16(uint32_t a, float_status *status) 2668 { 2669 return uint64_to_float16_scalbn(a, 0, status); 2670 } 2671 2672 float16 uint16_to_float16(uint16_t a, float_status *status) 2673 { 2674 return uint64_to_float16_scalbn(a, 0, status); 2675 } 2676 2677 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2678 { 2679 FloatParts pa = uint_to_float(a, scale, status); 2680 return float32_round_pack_canonical(pa, status); 2681 } 2682 2683 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2684 { 2685 return uint64_to_float32_scalbn(a, scale, status); 2686 } 2687 2688 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2689 { 2690 return 
uint64_to_float32_scalbn(a, scale, status); 2691 } 2692 2693 float32 uint64_to_float32(uint64_t a, float_status *status) 2694 { 2695 return uint64_to_float32_scalbn(a, 0, status); 2696 } 2697 2698 float32 uint32_to_float32(uint32_t a, float_status *status) 2699 { 2700 return uint64_to_float32_scalbn(a, 0, status); 2701 } 2702 2703 float32 uint16_to_float32(uint16_t a, float_status *status) 2704 { 2705 return uint64_to_float32_scalbn(a, 0, status); 2706 } 2707 2708 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2709 { 2710 FloatParts pa = uint_to_float(a, scale, status); 2711 return float64_round_pack_canonical(pa, status); 2712 } 2713 2714 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2715 { 2716 return uint64_to_float64_scalbn(a, scale, status); 2717 } 2718 2719 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2720 { 2721 return uint64_to_float64_scalbn(a, scale, status); 2722 } 2723 2724 float64 uint64_to_float64(uint64_t a, float_status *status) 2725 { 2726 return uint64_to_float64_scalbn(a, 0, status); 2727 } 2728 2729 float64 uint32_to_float64(uint32_t a, float_status *status) 2730 { 2731 return uint64_to_float64_scalbn(a, 0, status); 2732 } 2733 2734 float64 uint16_to_float64(uint16_t a, float_status *status) 2735 { 2736 return uint64_to_float64_scalbn(a, 0, status); 2737 } 2738 2739 /* Float Min/Max */ 2740 /* min() and max() functions. These can't be implemented as 2741 * 'compare and pick one input' because that would mishandle 2742 * NaNs and +0 vs -0. 2743 * 2744 * minnum() and maxnum() functions. These are similar to the min() 2745 * and max() functions but if one of the arguments is a QNaN and 2746 * the other is numerical then the numerical argument is returned. 2747 * SNaNs will get quietened before being returned. 2748 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum() 2749 * and maxNum() operations. 
min() and max() are the typical min/max 2750 * semantics provided by many CPUs which predate that specification. 2751 * 2752 * minnummag() and maxnummag() functions correspond to minNumMag() 2753 * and maxNumMag() from the IEEE-754 2008. 2754 */ 2755 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2756 bool ieee, bool ismag, float_status *s) 2757 { 2758 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2759 if (ieee) { 2760 /* Takes two floating-point values `a' and `b', one of 2761 * which is a NaN, and returns the appropriate NaN 2762 * result. If either `a' or `b' is a signaling NaN, 2763 * the invalid exception is raised. 2764 */ 2765 if (is_snan(a.cls) || is_snan(b.cls)) { 2766 return pick_nan(a, b, s); 2767 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2768 return b; 2769 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2770 return a; 2771 } 2772 } 2773 return pick_nan(a, b, s); 2774 } else { 2775 int a_exp, b_exp; 2776 /* Replace each exponent by a sortable key: INT_MIN for a zero, INT_MAX for an infinity, the real exponent for a normal number. */ 2777 switch (a.cls) { 2778 case float_class_normal: 2779 a_exp = a.exp; 2780 break; 2781 case float_class_inf: 2782 a_exp = INT_MAX; 2783 break; 2784 case float_class_zero: 2785 a_exp = INT_MIN; 2786 break; 2787 default: 2788 g_assert_not_reached(); 2789 break; 2790 } 2791 switch (b.cls) { 2792 case float_class_normal: 2793 b_exp = b.exp; 2794 break; 2795 case float_class_inf: 2796 b_exp = INT_MAX; 2797 break; 2798 case float_class_zero: 2799 b_exp = INT_MIN; 2800 break; 2801 default: 2802 g_assert_not_reached(); 2803 break; 2804 } 2805 /* Magnitude variants: when the magnitudes differ, pick by magnitude alone; equal magnitudes fall through to the signed comparison below. */ 2806 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2807 bool a_less = a_exp < b_exp; 2808 if (a_exp == b_exp) { 2809 a_less = a.frac < b.frac; 2810 } 2811 return a_less ^ ismin ? b : a; 2812 } 2813 2814 if (a.sign == b.sign) { 2815 bool a_less = a_exp < b_exp; 2816 if (a_exp == b_exp) { 2817 a_less = a.frac < b.frac; 2818 } 2819 return a.sign ^ a_less ^ ismin ? b : a; 2820 } else { 2821 return a.sign ^ ismin ? 
b : a; 2822 } 2823 } 2824 } 2825 /* MINMAX instantiates one public floatN entry point per size and variant: it unpacks both operands, runs minmax_floats() with the given ismin/isiee/ismag flags, then rounds and repacks the result. */ 2826 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2827 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2828 float_status *s) \ 2829 { \ 2830 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2831 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2832 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2833 \ 2834 return float ## sz ## _round_pack_canonical(pr, s); \ 2835 } 2836 2837 MINMAX(16, min, true, false, false) 2838 MINMAX(16, minnum, true, true, false) 2839 MINMAX(16, minnummag, true, true, true) 2840 MINMAX(16, max, false, false, false) 2841 MINMAX(16, maxnum, false, true, false) 2842 MINMAX(16, maxnummag, false, true, true) 2843 2844 MINMAX(32, min, true, false, false) 2845 MINMAX(32, minnum, true, true, false) 2846 MINMAX(32, minnummag, true, true, true) 2847 MINMAX(32, max, false, false, false) 2848 MINMAX(32, maxnum, false, true, false) 2849 MINMAX(32, maxnummag, false, true, true) 2850 2851 MINMAX(64, min, true, false, false) 2852 MINMAX(64, minnum, true, true, false) 2853 MINMAX(64, minnummag, true, true, true) 2854 MINMAX(64, max, false, false, false) 2855 MINMAX(64, maxnum, false, true, false) 2856 MINMAX(64, maxnummag, false, true, true) 2857 2858 #undef MINMAX 2859 2860 /* Floating point compare */ /* Three-way comparison of two decomposed values. Returns float_relation_unordered when either operand is a NaN, raising float_flag_invalid if is_quiet is false or either NaN is signaling; otherwise returns float_relation_less, float_relation_equal or float_relation_greater. Two zeros compare equal whatever their signs. */ 2861 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2862 float_status *s) 2863 { 2864 if (is_nan(a.cls) || is_nan(b.cls)) { 2865 if (!is_quiet || 2866 a.cls == float_class_snan || 2867 b.cls == float_class_snan) { 2868 s->float_exception_flags |= float_flag_invalid; 2869 } 2870 return float_relation_unordered; 2871 } 2872 2873 if (a.cls == float_class_zero) { 2874 if (b.cls == float_class_zero) { 2875 return float_relation_equal; 2876 } 2877 return b.sign ? float_relation_greater : float_relation_less; 2878 } else if (b.cls == float_class_zero) { 2879 return a.sign ? 
float_relation_less : float_relation_greater; 2880 } 2881 2882 /* The only really important thing about infinity is its sign. If 2883 * both are infinities the sign marks the smallest of the two. 2884 */ 2885 if (a.cls == float_class_inf) { 2886 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2887 return float_relation_equal; 2888 } 2889 return a.sign ? float_relation_less : float_relation_greater; 2890 } else if (b.cls == float_class_inf) { 2891 return b.sign ? float_relation_greater : float_relation_less; 2892 } 2893 /* NaN, zero and infinity operands have all returned above. With differing signs the negative operand is the smaller one. */ 2894 if (a.sign != b.sign) { 2895 return a.sign ? float_relation_less : float_relation_greater; 2896 } 2897 /* Same sign: compare exponents first, then fractions; the ordering is reversed when both operands are negative. */ 2898 if (a.exp == b.exp) { 2899 if (a.frac == b.frac) { 2900 return float_relation_equal; 2901 } 2902 if (a.sign) { 2903 return a.frac > b.frac ? 2904 float_relation_less : float_relation_greater; 2905 } else { 2906 return a.frac > b.frac ? 2907 float_relation_greater : float_relation_less; 2908 } 2909 } else { 2910 if (a.sign) { 2911 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2912 } else { 2913 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 2914 } 2915 } 2916 } 2917 /* COMPARE instantiates a static per-format comparison that unpacks both operands and defers to compare_floats(). */ 2918 #define COMPARE(name, attr, sz) \ 2919 static int attr \ 2920 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 2921 { \ 2922 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2923 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2924 return compare_floats(pa, pb, is_quiet, s); \ 2925 } 2926 2927 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 2928 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 2929 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 2930 2931 #undef COMPARE 2932 2933 int float16_compare(float16 a, float16 b, float_status *s) 2934 { 2935 return soft_f16_compare(a, b, false, s); 2936 } 2937 2938 int float16_compare_quiet(float16 a, float16 b, float_status *s) 2939 { 2940 return soft_f16_compare(a, b, true, s); 2941 } 2942 /* float32 comparison. Unless QEMU_NO_HARDFLOAT is set, the inputs are flushed and ordered operands are decided with the host isgreaterequal(), isgreater() and isless() macros; anything else reaches the soft path. */ 2943 static int QEMU_FLATTEN 2944 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 2945 { 2946 union_float32 ua, ub; 2947 2948 ua.s = xa; 2949 ub.s = xb; 2950 2951 if (QEMU_NO_HARDFLOAT) { 2952 goto soft; 2953 } 2954 2955 float32_input_flush2(&ua.s, &ub.s, s); 2956 if (isgreaterequal(ua.h, ub.h)) { 2957 if (isgreater(ua.h, ub.h)) { 2958 return float_relation_greater; 2959 } 2960 return float_relation_equal; 2961 } 2962 if (likely(isless(ua.h, ub.h))) { 2963 return float_relation_less; 2964 } 2965 /* The only condition remaining is unordered. 2966 * Fall through to set flags. 
2967 */ 2968 soft: 2969 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 2970 } 2971 /* Public float32 comparisons: the plain form passes is_quiet = false, the _quiet form passes true. */ 2972 int float32_compare(float32 a, float32 b, float_status *s) 2973 { 2974 return f32_compare(a, b, false, s); 2975 } 2976 2977 int float32_compare_quiet(float32 a, float32 b, float_status *s) 2978 { 2979 return f32_compare(a, b, true, s); 2980 } 2981 /* float64 comparison. Unless QEMU_NO_HARDFLOAT is set, the inputs are flushed and ordered operands are decided with the host isgreaterequal(), isgreater() and isless() macros; the unordered case and the no-hardfloat case go to soft_f64_compare(). */ 2982 static int QEMU_FLATTEN 2983 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 2984 { 2985 union_float64 ua, ub; 2986 2987 ua.s = xa; 2988 ub.s = xb; 2989 2990 if (QEMU_NO_HARDFLOAT) { 2991 goto soft; 2992 } 2993 2994 float64_input_flush2(&ua.s, &ub.s, s); 2995 if (isgreaterequal(ua.h, ub.h)) { 2996 if (isgreater(ua.h, ub.h)) { 2997 return float_relation_greater; 2998 } 2999 return float_relation_equal; 3000 } 3001 if (likely(isless(ua.h, ub.h))) { 3002 return float_relation_less; 3003 } 3004 /* The only condition remaining is unordered. 3005 * Fall through to set flags. 3006 */ 3007 soft: 3008 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3009 } 3010 3011 int float64_compare(float64 a, float64 b, float_status *s) 3012 { 3013 return f64_compare(a, b, false, s); 3014 } 3015 3016 int float64_compare_quiet(float64 a, float64 b, float_status *s) 3017 { 3018 return f64_compare(a, b, true, s); 3019 } 3020 3021 /* Multiply A by 2 raised to the power N. */ /* NaN inputs are returned through return_nan(); only normal numbers have their exponent adjusted, so zeros and infinities come back unchanged. */ 3022 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 3023 { 3024 if (unlikely(is_nan(a.cls))) { 3025 return return_nan(a, s); 3026 } 3027 if (a.cls == float_class_normal) { 3028 /* The largest float type (even though not supported by FloatParts) 3029 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3030 * still allows rounding to infinity, without allowing overflow 3031 * within the int32_t that backs FloatParts.exp. 
3032 */ 3033 n = MIN(MAX(n, -0x10000), 0x10000); 3034 a.exp += n; 3035 } 3036 return a; 3037 } 3038 /* floatN_scalbn: unpack, adjust the exponent with scalbn_decomposed(), then round and repack in the same format. */ 3039 float16 float16_scalbn(float16 a, int n, float_status *status) 3040 { 3041 FloatParts pa = float16_unpack_canonical(a, status); 3042 FloatParts pr = scalbn_decomposed(pa, n, status); 3043 return float16_round_pack_canonical(pr, status); 3044 } 3045 3046 float32 float32_scalbn(float32 a, int n, float_status *status) 3047 { 3048 FloatParts pa = float32_unpack_canonical(a, status); 3049 FloatParts pr = scalbn_decomposed(pa, n, status); 3050 return float32_round_pack_canonical(pr, status); 3051 } 3052 3053 float64 float64_scalbn(float64 a, int n, float_status *status) 3054 { 3055 FloatParts pa = float64_unpack_canonical(a, status); 3056 FloatParts pr = scalbn_decomposed(pa, n, status); 3057 return float64_round_pack_canonical(pr, status); 3058 } 3059 3060 /* 3061 * Square Root 3062 * 3063 * The old softfloat code did an approximation step before zeroing in 3064 * on the final result. However for simplicity we just compute the 3065 * square root by iterating down from the implicit bit to enough extra 3066 * bits to ensure we get a correctly rounded result. 3067 * 3068 * This does mean however the calculation is slower than before, 3069 * especially for 64 bit floats. 3070 */ 3071 /* The format descriptor p is only consulted for frac_shift, which fixes how far down the bit-by-bit loop runs. */ 3072 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 3073 { 3074 uint64_t a_frac, r_frac, s_frac; 3075 int bit, last_bit; 3076 3077 if (is_nan(a.cls)) { 3078 return return_nan(a, s); 3079 } 3080 if (a.cls == float_class_zero) { 3081 return a; /* sqrt(+-0) = +-0 */ 3082 } 3083 if (a.sign) { /* any other negative input, including -inf, is invalid and yields the default NaN */ 3084 s->float_exception_flags |= float_flag_invalid; 3085 return parts_default_nan(s); 3086 } 3087 if (a.cls == float_class_inf) { 3088 return a; /* sqrt(+inf) = +inf */ 3089 } 3090 3091 assert(a.cls == float_class_normal); 3092 3093 /* We need two overflow bits at the top. Adding room for that is a 3094 * right shift. 
If the exponent is odd, we can discard the low bit 3095 * by multiplying the fraction by 2; that's a left shift. Combine 3096 * those and we shift right if the exponent is even. 3097 */ 3098 a_frac = a.frac; 3099 if (!(a.exp & 1)) { 3100 a_frac >>= 1; 3101 } 3102 a.exp >>= 1; /* the exponent of the root is half the input exponent */ 3103 3104 /* Bit-by-bit computation of sqrt. */ 3105 r_frac = 0; 3106 s_frac = 0; 3107 3108 /* Iterate from implicit bit down to the 3 extra bits to compute a 3109 * properly rounded result. Remember we've inserted one more bit 3110 * at the top, so these positions are one less. 3111 */ 3112 bit = DECOMPOSED_BINARY_POINT - 1; 3113 last_bit = MAX(p->frac_shift - 4, 0); 3114 do { 3115 uint64_t q = 1ULL << bit; 3116 uint64_t t_frac = s_frac + q; 3117 if (t_frac <= a_frac) { /* the trial bit fits: keep it in the result and take it out of the remainder */ 3118 s_frac = t_frac + q; 3119 a_frac -= t_frac; 3120 r_frac += q; 3121 } 3122 a_frac <<= 1; 3123 } while (--bit >= last_bit); 3124 3125 /* Undo the right shift done above. If there is any remaining 3126 * fraction, the result is inexact. Set the sticky bit. 
3127 */ 3128 a.frac = (r_frac << 1) + (a_frac != 0); 3129 3130 return a; 3131 } 3132 /* The float16 entry point and the two soft_fNN_sqrt helpers unpack, call sqrt_float() with the matching format parameters, then round and repack. */ 3133 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3134 { 3135 FloatParts pa = float16_unpack_canonical(a, status); 3136 FloatParts pr = sqrt_float(pa, status, &float16_params); 3137 return float16_round_pack_canonical(pr, status); 3138 } 3139 3140 static float32 QEMU_SOFTFLOAT_ATTR 3141 soft_f32_sqrt(float32 a, float_status *status) 3142 { 3143 FloatParts pa = float32_unpack_canonical(a, status); 3144 FloatParts pr = sqrt_float(pa, status, &float32_params); 3145 return float32_round_pack_canonical(pr, status); 3146 } 3147 3148 static float64 QEMU_SOFTFLOAT_ATTR 3149 soft_f64_sqrt(float64 a, float_status *status) 3150 { 3151 FloatParts pa = float64_unpack_canonical(a, status); 3152 FloatParts pr = sqrt_float(pa, status, &float64_params); 3153 return float64_round_pack_canonical(pr, status); 3154 } 3155 /* Uses the host sqrtf() when can_use_fpu(s) holds and the flushed input is a non-negative zero or normal number; every other case goes through soft_f32_sqrt(). */ 3156 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3157 { 3158 union_float32 ua, ur; 3159 3160 ua.s = xa; 3161 if (unlikely(!can_use_fpu(s))) { 3162 goto soft; 3163 } 3164 3165 float32_input_flush1(&ua.s, s); 3166 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3167 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3168 fpclassify(ua.h) == FP_ZERO) || 3169 signbit(ua.h))) { 3170 goto soft; 3171 } 3172 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3173 float32_is_neg(ua.s))) { 3174 goto soft; 3175 } 3176 ur.h = sqrtf(ua.h); 3177 return ur.s; 3178 3179 soft: 3180 return soft_f32_sqrt(ua.s, s); 3181 } 3182 /* Uses the host sqrt() when can_use_fpu(s) holds and the flushed input is a non-negative zero or normal number; every other case goes through soft_f64_sqrt(). */ 3183 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3184 { 3185 union_float64 ua, ur; 3186 3187 ua.s = xa; 3188 if (unlikely(!can_use_fpu(s))) { 3189 goto soft; 3190 } 3191 3192 float64_input_flush1(&ua.s, s); 3193 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3194 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3195 fpclassify(ua.h) == FP_ZERO) || 3196 signbit(ua.h))) { 3197 goto soft; 3198 } 3199 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3200 float64_is_neg(ua.s))) { 3201 goto soft; 3202 } 3203 ur.h = sqrt(ua.h); 3204 return ur.s; 3205 3206 soft: 3207 return soft_f64_sqrt(ua.s, s); 3208 } 3209 3210 /*---------------------------------------------------------------------------- 3211 | The pattern for a default generated NaN. 3212 *----------------------------------------------------------------------------*/ 3213 /* The 16, 32 and 64 bit variants take parts_default_nan(), shift the fraction right by the frac_shift of the format, and pack the result raw. */ 3214 float16 float16_default_nan(float_status *status) 3215 { 3216 FloatParts p = parts_default_nan(status); 3217 p.frac >>= float16_params.frac_shift; 3218 return float16_pack_raw(p); 3219 } 3220 3221 float32 float32_default_nan(float_status *status) 3222 { 3223 FloatParts p = parts_default_nan(status); 3224 p.frac >>= float32_params.frac_shift; 3225 return float32_pack_raw(p); 3226 } 3227 3228 float64 float64_default_nan(float_status *status) 3229 { 3230 FloatParts p = parts_default_nan(status); 3231 p.frac >>= float64_params.frac_shift; 3232 return float64_pack_raw(p); 3233 } 3234 /* The 128 bit variant builds the two words by hand from the same parts: all-ones exponent field, sign in bit 63 of the high word. */ 3235 float128 float128_default_nan(float_status *status) 3236 { 3237 FloatParts p = parts_default_nan(status); 3238 float128 r; 3239 3240 /* Extrapolate from the choices made by parts_default_nan to fill 3241 * in the quad-floating format. If the low bit is set, assume we 3242 * want to set all non-snan bits. 3243 */ 3244 r.low = -(p.frac & 1); 3245 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 3246 r.high |= LIT64(0x7FFF000000000000); 3247 r.high |= (uint64_t)p.sign << 63; 3248 3249 return r; 3250 } 3251 3252 /*---------------------------------------------------------------------------- 3253 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3254 *----------------------------------------------------------------------------*/ 3255 3256 float16 float16_silence_nan(float16 a, float_status *status) 3257 { 3258 FloatParts p = float16_unpack_raw(a); 3259 p.frac <<= float16_params.frac_shift; 3260 p = parts_silence_nan(p, status); 3261 p.frac >>= float16_params.frac_shift; 3262 return float16_pack_raw(p); 3263 } 3264 3265 float32 float32_silence_nan(float32 a, float_status *status) 3266 { 3267 FloatParts p = float32_unpack_raw(a); 3268 p.frac <<= float32_params.frac_shift; 3269 p = parts_silence_nan(p, status); 3270 p.frac >>= float32_params.frac_shift; 3271 return float32_pack_raw(p); 3272 } 3273 3274 float64 float64_silence_nan(float64 a, float_status *status) 3275 { 3276 FloatParts p = float64_unpack_raw(a); 3277 p.frac <<= float64_params.frac_shift; 3278 p = parts_silence_nan(p, status); 3279 p.frac >>= float64_params.frac_shift; 3280 return float64_pack_raw(p); 3281 } 3282 3283 /*---------------------------------------------------------------------------- 3284 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3285 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3286 | input. If `zSign' is 1, the input is negated before being converted to an 3287 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3288 | is simply rounded to an integer, with the inexact exception raised if the 3289 | input cannot be represented exactly as an integer. However, if the fixed- 3290 | point input is too large, the invalid exception is raised and the largest 3291 | positive or negative integer is returned. 
3292 *----------------------------------------------------------------------------*/ 3293 3294 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 3295 { 3296 int8_t roundingMode; 3297 flag roundNearestEven; 3298 int8_t roundIncrement, roundBits; 3299 int32_t z; 3300 3301 roundingMode = status->float_rounding_mode; 3302 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3303 switch (roundingMode) { 3304 case float_round_nearest_even: 3305 case float_round_ties_away: 3306 roundIncrement = 0x40; 3307 break; 3308 case float_round_to_zero: 3309 roundIncrement = 0; 3310 break; 3311 case float_round_up: 3312 roundIncrement = zSign ? 0 : 0x7f; 3313 break; 3314 case float_round_down: 3315 roundIncrement = zSign ? 0x7f : 0; 3316 break; 3317 default: 3318 abort(); 3319 } 3320 roundBits = absZ & 0x7F; 3321 absZ = ( absZ + roundIncrement )>>7; 3322 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3323 z = absZ; 3324 if ( zSign ) z = - z; 3325 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3326 float_raise(float_flag_invalid, status); 3327 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3328 } 3329 if (roundBits) { 3330 status->float_exception_flags |= float_flag_inexact; 3331 } 3332 return z; 3333 3334 } 3335 3336 /*---------------------------------------------------------------------------- 3337 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3338 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3339 | and returns the properly rounded 64-bit integer corresponding to the input. 3340 | If `zSign' is 1, the input is negated before being converted to an integer. 3341 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3342 | the inexact exception raised if the input cannot be represented exactly as 3343 | an integer. 
However, if the fixed-point input is too large, the invalid 3344 | exception is raised and the largest positive or negative integer is 3345 | returned. 3346 *----------------------------------------------------------------------------*/ 3347 3348 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 3349 float_status *status) 3350 { 3351 int8_t roundingMode; 3352 flag roundNearestEven, increment; 3353 int64_t z; 3354 3355 roundingMode = status->float_rounding_mode; 3356 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3357 switch (roundingMode) { 3358 case float_round_nearest_even: 3359 case float_round_ties_away: 3360 increment = ((int64_t) absZ1 < 0); 3361 break; 3362 case float_round_to_zero: 3363 increment = 0; 3364 break; 3365 case float_round_up: 3366 increment = !zSign && absZ1; 3367 break; 3368 case float_round_down: 3369 increment = zSign && absZ1; 3370 break; 3371 default: 3372 abort(); 3373 } 3374 if ( increment ) { 3375 ++absZ0; 3376 if ( absZ0 == 0 ) goto overflow; 3377 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 3378 } 3379 z = absZ0; 3380 if ( zSign ) z = - z; 3381 if ( z && ( ( z < 0 ) ^ zSign ) ) { 3382 overflow: 3383 float_raise(float_flag_invalid, status); 3384 return 3385 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 3386 : LIT64( 0x7FFFFFFFFFFFFFFF ); 3387 } 3388 if (absZ1) { 3389 status->float_exception_flags |= float_flag_inexact; 3390 } 3391 return z; 3392 3393 } 3394 3395 /*---------------------------------------------------------------------------- 3396 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3397 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3398 | and returns the properly rounded 64-bit unsigned integer corresponding to the 3399 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 3400 | with the inexact exception raised if the input cannot be represented exactly 3401 | as an integer. 
However, if the fixed-point input is too large, the invalid 3402 | exception is raised and the largest unsigned integer is returned. 3403 *----------------------------------------------------------------------------*/ 3404 3405 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 3406 uint64_t absZ1, float_status *status) 3407 { 3408 int8_t roundingMode; 3409 flag roundNearestEven, increment; 3410 3411 roundingMode = status->float_rounding_mode; 3412 roundNearestEven = (roundingMode == float_round_nearest_even); 3413 switch (roundingMode) { 3414 case float_round_nearest_even: 3415 case float_round_ties_away: 3416 increment = ((int64_t)absZ1 < 0); 3417 break; 3418 case float_round_to_zero: 3419 increment = 0; 3420 break; 3421 case float_round_up: 3422 increment = !zSign && absZ1; 3423 break; 3424 case float_round_down: 3425 increment = zSign && absZ1; 3426 break; 3427 default: 3428 abort(); 3429 } 3430 if (increment) { 3431 ++absZ0; 3432 if (absZ0 == 0) { 3433 float_raise(float_flag_invalid, status); 3434 return LIT64(0xFFFFFFFFFFFFFFFF); 3435 } 3436 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 3437 } 3438 3439 if (zSign && absZ0) { 3440 float_raise(float_flag_invalid, status); 3441 return 0; 3442 } 3443 3444 if (absZ1) { 3445 status->float_exception_flags |= float_flag_inexact; 3446 } 3447 return absZ0; 3448 } 3449 3450 /*---------------------------------------------------------------------------- 3451 | If `a' is denormal and we are in flush-to-zero mode then set the 3452 | input-denormal exception and return zero. Otherwise just return the value. 
3453 *----------------------------------------------------------------------------*/ 3454 float32 float32_squash_input_denormal(float32 a, float_status *status) 3455 { 3456 if (status->flush_inputs_to_zero) { 3457 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 3458 float_raise(float_flag_input_denormal, status); 3459 return make_float32(float32_val(a) & 0x80000000); 3460 } 3461 } 3462 return a; 3463 } 3464 3465 /*---------------------------------------------------------------------------- 3466 | Normalizes the subnormal single-precision floating-point value represented 3467 | by the denormalized significand `aSig'. The normalized exponent and 3468 | significand are stored at the locations pointed to by `zExpPtr' and 3469 | `zSigPtr', respectively. 3470 *----------------------------------------------------------------------------*/ 3471 3472 static void 3473 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 3474 { 3475 int8_t shiftCount; 3476 3477 shiftCount = clz32(aSig) - 8; 3478 *zSigPtr = aSig<<shiftCount; 3479 *zExpPtr = 1 - shiftCount; 3480 3481 } 3482 3483 /*---------------------------------------------------------------------------- 3484 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3485 | and significand `zSig', and returns the proper single-precision floating- 3486 | point value corresponding to the abstract input. Ordinarily, the abstract 3487 | value is simply rounded and packed into the single-precision format, with 3488 | the inexact exception raised if the abstract input cannot be represented 3489 | exactly. However, if the abstract value is too large, the overflow and 3490 | inexact exceptions are raised and an infinity or maximal finite value is 3491 | returned. 
If the abstract value is too small, the input value is rounded to 3492 | a subnormal number, and the underflow and inexact exceptions are raised if 3493 | the abstract input cannot be represented exactly as a subnormal single- 3494 | precision floating-point number. 3495 | The input significand `zSig' has its binary point between bits 30 3496 | and 29, which is 7 bits to the left of the usual location. This shifted 3497 | significand must be normalized or smaller. If `zSig' is not normalized, 3498 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3499 | and it must not require rounding. In the usual case that `zSig' is 3500 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3501 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3502 | Binary Floating-Point Arithmetic. 3503 *----------------------------------------------------------------------------*/ 3504 3505 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3506 float_status *status) 3507 { 3508 int8_t roundingMode; 3509 flag roundNearestEven; 3510 int8_t roundIncrement, roundBits; 3511 flag isTiny; 3512 3513 roundingMode = status->float_rounding_mode; 3514 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3515 switch (roundingMode) { 3516 case float_round_nearest_even: 3517 case float_round_ties_away: 3518 roundIncrement = 0x40; 3519 break; 3520 case float_round_to_zero: 3521 roundIncrement = 0; 3522 break; 3523 case float_round_up: 3524 roundIncrement = zSign ? 0 : 0x7f; 3525 break; 3526 case float_round_down: 3527 roundIncrement = zSign ? 
0x7f : 0; 3528 break; 3529 default: 3530 abort(); 3531 break; 3532 } 3533 roundBits = zSig & 0x7F; 3534 if ( 0xFD <= (uint16_t) zExp ) { 3535 if ( ( 0xFD < zExp ) 3536 || ( ( zExp == 0xFD ) 3537 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 3538 ) { 3539 float_raise(float_flag_overflow | float_flag_inexact, status); 3540 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 3541 } 3542 if ( zExp < 0 ) { 3543 if (status->flush_to_zero) { 3544 float_raise(float_flag_output_denormal, status); 3545 return packFloat32(zSign, 0, 0); 3546 } 3547 isTiny = 3548 (status->float_detect_tininess 3549 == float_tininess_before_rounding) 3550 || ( zExp < -1 ) 3551 || ( zSig + roundIncrement < 0x80000000 ); 3552 shift32RightJamming( zSig, - zExp, &zSig ); 3553 zExp = 0; 3554 roundBits = zSig & 0x7F; 3555 if (isTiny && roundBits) { 3556 float_raise(float_flag_underflow, status); 3557 } 3558 } 3559 } 3560 if (roundBits) { 3561 status->float_exception_flags |= float_flag_inexact; 3562 } 3563 zSig = ( zSig + roundIncrement )>>7; 3564 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 3565 if ( zSig == 0 ) zExp = 0; 3566 return packFloat32( zSign, zExp, zSig ); 3567 3568 } 3569 3570 /*---------------------------------------------------------------------------- 3571 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3572 | and significand `zSig', and returns the proper single-precision floating- 3573 | point value corresponding to the abstract input. This routine is just like 3574 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 3575 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3576 | floating-point exponent. 
3577 *----------------------------------------------------------------------------*/ 3578 3579 static float32 3580 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 3581 float_status *status) 3582 { 3583 int8_t shiftCount; 3584 3585 shiftCount = clz32(zSig) - 1; 3586 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 3587 status); 3588 3589 } 3590 3591 /*---------------------------------------------------------------------------- 3592 | If `a' is denormal and we are in flush-to-zero mode then set the 3593 | input-denormal exception and return zero. Otherwise just return the value. 3594 *----------------------------------------------------------------------------*/ 3595 float64 float64_squash_input_denormal(float64 a, float_status *status) 3596 { 3597 if (status->flush_inputs_to_zero) { 3598 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 3599 float_raise(float_flag_input_denormal, status); 3600 return make_float64(float64_val(a) & (1ULL << 63)); 3601 } 3602 } 3603 return a; 3604 } 3605 3606 /*---------------------------------------------------------------------------- 3607 | Normalizes the subnormal double-precision floating-point value represented 3608 | by the denormalized significand `aSig'. The normalized exponent and 3609 | significand are stored at the locations pointed to by `zExpPtr' and 3610 | `zSigPtr', respectively. 3611 *----------------------------------------------------------------------------*/ 3612 3613 static void 3614 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 3615 { 3616 int8_t shiftCount; 3617 3618 shiftCount = clz64(aSig) - 11; 3619 *zSigPtr = aSig<<shiftCount; 3620 *zExpPtr = 1 - shiftCount; 3621 3622 } 3623 3624 /*---------------------------------------------------------------------------- 3625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3626 | double-precision floating-point value, returning the result. 
After being 3627 | shifted into the proper positions, the three fields are simply added 3628 | together to form the result. This means that any integer portion of `zSig' 3629 | will be added into the exponent. Since a properly normalized significand 3630 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3631 | than the desired result exponent whenever `zSig' is a complete, normalized 3632 | significand. 3633 *----------------------------------------------------------------------------*/ 3634 3635 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 3636 { 3637 3638 return make_float64( 3639 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 3640 3641 } 3642 3643 /*---------------------------------------------------------------------------- 3644 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3645 | and significand `zSig', and returns the proper double-precision floating- 3646 | point value corresponding to the abstract input. Ordinarily, the abstract 3647 | value is simply rounded and packed into the double-precision format, with 3648 | the inexact exception raised if the abstract input cannot be represented 3649 | exactly. However, if the abstract value is too large, the overflow and 3650 | inexact exceptions are raised and an infinity or maximal finite value is 3651 | returned. If the abstract value is too small, the input value is rounded to 3652 | a subnormal number, and the underflow and inexact exceptions are raised if 3653 | the abstract input cannot be represented exactly as a subnormal double- 3654 | precision floating-point number. 3655 | The input significand `zSig' has its binary point between bits 62 3656 | and 61, which is 10 bits to the left of the usual location. This shifted 3657 | significand must be normalized or smaller. 
If `zSig' is not normalized, 3658 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3659 | and it must not require rounding. In the usual case that `zSig' is 3660 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3661 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3662 | Binary Floating-Point Arithmetic. 3663 *----------------------------------------------------------------------------*/ 3664 3665 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3666 float_status *status) 3667 { 3668 int8_t roundingMode; 3669 flag roundNearestEven; 3670 int roundIncrement, roundBits; 3671 flag isTiny; 3672 3673 roundingMode = status->float_rounding_mode; 3674 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3675 switch (roundingMode) { 3676 case float_round_nearest_even: 3677 case float_round_ties_away: 3678 roundIncrement = 0x200; 3679 break; 3680 case float_round_to_zero: 3681 roundIncrement = 0; 3682 break; 3683 case float_round_up: 3684 roundIncrement = zSign ? 0 : 0x3ff; 3685 break; 3686 case float_round_down: 3687 roundIncrement = zSign ? 0x3ff : 0; 3688 break; 3689 case float_round_to_odd: 3690 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 3691 break; 3692 default: 3693 abort(); 3694 } 3695 roundBits = zSig & 0x3FF; 3696 if ( 0x7FD <= (uint16_t) zExp ) { 3697 if ( ( 0x7FD < zExp ) 3698 || ( ( zExp == 0x7FD ) 3699 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 3700 ) { 3701 bool overflow_to_inf = roundingMode != float_round_to_odd && 3702 roundIncrement != 0; 3703 float_raise(float_flag_overflow | float_flag_inexact, status); 3704 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 3705 } 3706 if ( zExp < 0 ) { 3707 if (status->flush_to_zero) { 3708 float_raise(float_flag_output_denormal, status); 3709 return packFloat64(zSign, 0, 0); 3710 } 3711 isTiny = 3712 (status->float_detect_tininess 3713 == float_tininess_before_rounding) 3714 || ( zExp < -1 ) 3715 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 3716 shift64RightJamming( zSig, - zExp, &zSig ); 3717 zExp = 0; 3718 roundBits = zSig & 0x3FF; 3719 if (isTiny && roundBits) { 3720 float_raise(float_flag_underflow, status); 3721 } 3722 if (roundingMode == float_round_to_odd) { 3723 /* 3724 * For round-to-odd case, the roundIncrement depends on 3725 * zSig which just changed. 3726 */ 3727 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 3728 } 3729 } 3730 } 3731 if (roundBits) { 3732 status->float_exception_flags |= float_flag_inexact; 3733 } 3734 zSig = ( zSig + roundIncrement )>>10; 3735 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 3736 if ( zSig == 0 ) zExp = 0; 3737 return packFloat64( zSign, zExp, zSig ); 3738 3739 } 3740 3741 /*---------------------------------------------------------------------------- 3742 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3743 | and significand `zSig', and returns the proper double-precision floating- 3744 | point value corresponding to the abstract input. This routine is just like 3745 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 
3746 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 3747 | floating-point exponent. 3748 *----------------------------------------------------------------------------*/ 3749 3750 static float64 3751 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 3752 float_status *status) 3753 { 3754 int8_t shiftCount; 3755 3756 shiftCount = clz64(zSig) - 1; 3757 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 3758 status); 3759 3760 } 3761 3762 /*---------------------------------------------------------------------------- 3763 | Normalizes the subnormal extended double-precision floating-point value 3764 | represented by the denormalized significand `aSig'. The normalized exponent 3765 | and significand are stored at the locations pointed to by `zExpPtr' and 3766 | `zSigPtr', respectively. 3767 *----------------------------------------------------------------------------*/ 3768 3769 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 3770 uint64_t *zSigPtr) 3771 { 3772 int8_t shiftCount; 3773 3774 shiftCount = clz64(aSig); 3775 *zSigPtr = aSig<<shiftCount; 3776 *zExpPtr = 1 - shiftCount; 3777 } 3778 3779 /*---------------------------------------------------------------------------- 3780 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3781 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 3782 | and returns the proper extended double-precision floating-point value 3783 | corresponding to the abstract input. Ordinarily, the abstract value is 3784 | rounded and packed into the extended double-precision format, with the 3785 | inexact exception raised if the abstract input cannot be represented 3786 | exactly. However, if the abstract value is too large, the overflow and 3787 | inexact exceptions are raised and an infinity or maximal finite value is 3788 | returned. 
If the abstract value is too small, the input value is rounded to 3789 | a subnormal number, and the underflow and inexact exceptions are raised if 3790 | the abstract input cannot be represented exactly as a subnormal extended 3791 | double-precision floating-point number. 3792 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 3793 | number of bits as single or double precision, respectively. Otherwise, the 3794 | result is rounded to the full precision of the extended double-precision 3795 | format. 3796 | The input significand must be normalized or smaller. If the input 3797 | significand is not normalized, `zExp' must be 0; in that case, the result 3798 | returned is a subnormal number, and it must not require rounding. The 3799 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 3800 | Floating-Point Arithmetic. 3801 *----------------------------------------------------------------------------*/ 3802 3803 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 3804 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 3805 float_status *status) 3806 { 3807 int8_t roundingMode; 3808 flag roundNearestEven, increment, isTiny; 3809 int64_t roundIncrement, roundMask, roundBits; 3810 3811 roundingMode = status->float_rounding_mode; 3812 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3813 if ( roundingPrecision == 80 ) goto precision80; 3814 if ( roundingPrecision == 64 ) { 3815 roundIncrement = LIT64( 0x0000000000000400 ); 3816 roundMask = LIT64( 0x00000000000007FF ); 3817 } 3818 else if ( roundingPrecision == 32 ) { 3819 roundIncrement = LIT64( 0x0000008000000000 ); 3820 roundMask = LIT64( 0x000000FFFFFFFFFF ); 3821 } 3822 else { 3823 goto precision80; 3824 } 3825 zSig0 |= ( zSig1 != 0 ); 3826 switch (roundingMode) { 3827 case float_round_nearest_even: 3828 case float_round_ties_away: 3829 break; 3830 case float_round_to_zero: 3831 roundIncrement = 0; 3832 break; 3833 case float_round_up: 3834 
roundIncrement = zSign ? 0 : roundMask; 3835 break; 3836 case float_round_down: 3837 roundIncrement = zSign ? roundMask : 0; 3838 break; 3839 default: 3840 abort(); 3841 } 3842 roundBits = zSig0 & roundMask; 3843 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3844 if ( ( 0x7FFE < zExp ) 3845 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 3846 ) { 3847 goto overflow; 3848 } 3849 if ( zExp <= 0 ) { 3850 if (status->flush_to_zero) { 3851 float_raise(float_flag_output_denormal, status); 3852 return packFloatx80(zSign, 0, 0); 3853 } 3854 isTiny = 3855 (status->float_detect_tininess 3856 == float_tininess_before_rounding) 3857 || ( zExp < 0 ) 3858 || ( zSig0 <= zSig0 + roundIncrement ); 3859 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 3860 zExp = 0; 3861 roundBits = zSig0 & roundMask; 3862 if (isTiny && roundBits) { 3863 float_raise(float_flag_underflow, status); 3864 } 3865 if (roundBits) { 3866 status->float_exception_flags |= float_flag_inexact; 3867 } 3868 zSig0 += roundIncrement; 3869 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3870 roundIncrement = roundMask + 1; 3871 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3872 roundMask |= roundIncrement; 3873 } 3874 zSig0 &= ~ roundMask; 3875 return packFloatx80( zSign, zExp, zSig0 ); 3876 } 3877 } 3878 if (roundBits) { 3879 status->float_exception_flags |= float_flag_inexact; 3880 } 3881 zSig0 += roundIncrement; 3882 if ( zSig0 < roundIncrement ) { 3883 ++zExp; 3884 zSig0 = LIT64( 0x8000000000000000 ); 3885 } 3886 roundIncrement = roundMask + 1; 3887 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 3888 roundMask |= roundIncrement; 3889 } 3890 zSig0 &= ~ roundMask; 3891 if ( zSig0 == 0 ) zExp = 0; 3892 return packFloatx80( zSign, zExp, zSig0 ); 3893 precision80: 3894 switch (roundingMode) { 3895 case float_round_nearest_even: 3896 case float_round_ties_away: 3897 increment = ((int64_t)zSig1 < 0); 3898 break; 3899 case float_round_to_zero: 3900 increment = 0; 3901 break; 3902 case 
float_round_up: 3903 increment = !zSign && zSig1; 3904 break; 3905 case float_round_down: 3906 increment = zSign && zSig1; 3907 break; 3908 default: 3909 abort(); 3910 } 3911 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 3912 if ( ( 0x7FFE < zExp ) 3913 || ( ( zExp == 0x7FFE ) 3914 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 3915 && increment 3916 ) 3917 ) { 3918 roundMask = 0; 3919 overflow: 3920 float_raise(float_flag_overflow | float_flag_inexact, status); 3921 if ( ( roundingMode == float_round_to_zero ) 3922 || ( zSign && ( roundingMode == float_round_up ) ) 3923 || ( ! zSign && ( roundingMode == float_round_down ) ) 3924 ) { 3925 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 3926 } 3927 return packFloatx80(zSign, 3928 floatx80_infinity_high, 3929 floatx80_infinity_low); 3930 } 3931 if ( zExp <= 0 ) { 3932 isTiny = 3933 (status->float_detect_tininess 3934 == float_tininess_before_rounding) 3935 || ( zExp < 0 ) 3936 || ! increment 3937 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 3938 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 3939 zExp = 0; 3940 if (isTiny && zSig1) { 3941 float_raise(float_flag_underflow, status); 3942 } 3943 if (zSig1) { 3944 status->float_exception_flags |= float_flag_inexact; 3945 } 3946 switch (roundingMode) { 3947 case float_round_nearest_even: 3948 case float_round_ties_away: 3949 increment = ((int64_t)zSig1 < 0); 3950 break; 3951 case float_round_to_zero: 3952 increment = 0; 3953 break; 3954 case float_round_up: 3955 increment = !zSign && zSig1; 3956 break; 3957 case float_round_down: 3958 increment = zSign && zSig1; 3959 break; 3960 default: 3961 abort(); 3962 } 3963 if ( increment ) { 3964 ++zSig0; 3965 zSig0 &= 3966 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3967 if ( (int64_t) zSig0 < 0 ) zExp = 1; 3968 } 3969 return packFloatx80( zSign, zExp, zSig0 ); 3970 } 3971 } 3972 if (zSig1) { 3973 status->float_exception_flags |= float_flag_inexact; 3974 } 3975 if ( increment ) { 3976 ++zSig0; 3977 
if ( zSig0 == 0 ) { 3978 ++zExp; 3979 zSig0 = LIT64( 0x8000000000000000 ); 3980 } 3981 else { 3982 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 3983 } 3984 } 3985 else { 3986 if ( zSig0 == 0 ) zExp = 0; 3987 } 3988 return packFloatx80( zSign, zExp, zSig0 ); 3989 3990 } 3991 3992 /*---------------------------------------------------------------------------- 3993 | Takes an abstract floating-point value having sign `zSign', exponent 3994 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 3995 | and returns the proper extended double-precision floating-point value 3996 | corresponding to the abstract input. This routine is just like 3997 | `roundAndPackFloatx80' except that the input significand does not have to be 3998 | normalized. 3999 *----------------------------------------------------------------------------*/ 4000 4001 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4002 flag zSign, int32_t zExp, 4003 uint64_t zSig0, uint64_t zSig1, 4004 float_status *status) 4005 { 4006 int8_t shiftCount; 4007 4008 if ( zSig0 == 0 ) { 4009 zSig0 = zSig1; 4010 zSig1 = 0; 4011 zExp -= 64; 4012 } 4013 shiftCount = clz64(zSig0); 4014 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4015 zExp -= shiftCount; 4016 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4017 zSig0, zSig1, status); 4018 4019 } 4020 4021 /*---------------------------------------------------------------------------- 4022 | Returns the least-significant 64 fraction bits of the quadruple-precision 4023 | floating-point value `a'. 4024 *----------------------------------------------------------------------------*/ 4025 4026 static inline uint64_t extractFloat128Frac1( float128 a ) 4027 { 4028 4029 return a.low; 4030 4031 } 4032 4033 /*---------------------------------------------------------------------------- 4034 | Returns the most-significant 48 fraction bits of the quadruple-precision 4035 | floating-point value `a'. 
4036 *----------------------------------------------------------------------------*/ 4037 4038 static inline uint64_t extractFloat128Frac0( float128 a ) 4039 { 4040 4041 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 4042 4043 } 4044 4045 /*---------------------------------------------------------------------------- 4046 | Returns the exponent bits of the quadruple-precision floating-point value 4047 | `a'. 4048 *----------------------------------------------------------------------------*/ 4049 4050 static inline int32_t extractFloat128Exp( float128 a ) 4051 { 4052 4053 return ( a.high>>48 ) & 0x7FFF; 4054 4055 } 4056 4057 /*---------------------------------------------------------------------------- 4058 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4059 *----------------------------------------------------------------------------*/ 4060 4061 static inline flag extractFloat128Sign( float128 a ) 4062 { 4063 4064 return a.high>>63; 4065 4066 } 4067 4068 /*---------------------------------------------------------------------------- 4069 | Normalizes the subnormal quadruple-precision floating-point value 4070 | represented by the denormalized significand formed by the concatenation of 4071 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4072 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4073 | significand are stored at the location pointed to by `zSig0Ptr', and the 4074 | least significant 64 bits of the normalized significand are stored at the 4075 | location pointed to by `zSig1Ptr'. 
4076 *----------------------------------------------------------------------------*/ 4077 4078 static void 4079 normalizeFloat128Subnormal( 4080 uint64_t aSig0, 4081 uint64_t aSig1, 4082 int32_t *zExpPtr, 4083 uint64_t *zSig0Ptr, 4084 uint64_t *zSig1Ptr 4085 ) 4086 { 4087 int8_t shiftCount; 4088 4089 if ( aSig0 == 0 ) { 4090 shiftCount = clz64(aSig1) - 15; 4091 if ( shiftCount < 0 ) { 4092 *zSig0Ptr = aSig1>>( - shiftCount ); 4093 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4094 } 4095 else { 4096 *zSig0Ptr = aSig1<<shiftCount; 4097 *zSig1Ptr = 0; 4098 } 4099 *zExpPtr = - shiftCount - 63; 4100 } 4101 else { 4102 shiftCount = clz64(aSig0) - 15; 4103 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4104 *zExpPtr = 1 - shiftCount; 4105 } 4106 4107 } 4108 4109 /*---------------------------------------------------------------------------- 4110 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4111 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4112 | floating-point value, returning the result. After being shifted into the 4113 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4114 | added together to form the most significant 32 bits of the result. This 4115 | means that any integer portion of `zSig0' will be added into the exponent. 4116 | Since a properly normalized significand will have an integer portion equal 4117 | to 1, the `zExp' input should be 1 less than the desired result exponent 4118 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4119 | significand. 
4120 *----------------------------------------------------------------------------*/ 4121 4122 static inline float128 4123 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 4124 { 4125 float128 z; 4126 4127 z.low = zSig1; 4128 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 4129 return z; 4130 4131 } 4132 4133 /*---------------------------------------------------------------------------- 4134 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4135 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4136 | and `zSig2', and returns the proper quadruple-precision floating-point value 4137 | corresponding to the abstract input. Ordinarily, the abstract value is 4138 | simply rounded and packed into the quadruple-precision format, with the 4139 | inexact exception raised if the abstract input cannot be represented 4140 | exactly. However, if the abstract value is too large, the overflow and 4141 | inexact exceptions are raised and an infinity or maximal finite value is 4142 | returned. If the abstract value is too small, the input value is rounded to 4143 | a subnormal number, and the underflow and inexact exceptions are raised if 4144 | the abstract input cannot be represented exactly as a subnormal quadruple- 4145 | precision floating-point number. 4146 | The input significand must be normalized or smaller. If the input 4147 | significand is not normalized, `zExp' must be 0; in that case, the result 4148 | returned is a subnormal number, and it must not require rounding. In the 4149 | usual case that the input significand is normalized, `zExp' must be 1 less 4150 | than the ``true'' floating-point exponent. The handling of underflow and 4151 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4152 *----------------------------------------------------------------------------*/ 4153 4154 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 4155 uint64_t zSig0, uint64_t zSig1, 4156 uint64_t zSig2, float_status *status) 4157 { 4158 int8_t roundingMode; 4159 flag roundNearestEven, increment, isTiny; 4160 4161 roundingMode = status->float_rounding_mode; 4162 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4163 switch (roundingMode) { 4164 case float_round_nearest_even: 4165 case float_round_ties_away: 4166 increment = ((int64_t)zSig2 < 0); 4167 break; 4168 case float_round_to_zero: 4169 increment = 0; 4170 break; 4171 case float_round_up: 4172 increment = !zSign && zSig2; 4173 break; 4174 case float_round_down: 4175 increment = zSign && zSig2; 4176 break; 4177 case float_round_to_odd: 4178 increment = !(zSig1 & 0x1) && zSig2; 4179 break; 4180 default: 4181 abort(); 4182 } 4183 if ( 0x7FFD <= (uint32_t) zExp ) { 4184 if ( ( 0x7FFD < zExp ) 4185 || ( ( zExp == 0x7FFD ) 4186 && eq128( 4187 LIT64( 0x0001FFFFFFFFFFFF ), 4188 LIT64( 0xFFFFFFFFFFFFFFFF ), 4189 zSig0, 4190 zSig1 4191 ) 4192 && increment 4193 ) 4194 ) { 4195 float_raise(float_flag_overflow | float_flag_inexact, status); 4196 if ( ( roundingMode == float_round_to_zero ) 4197 || ( zSign && ( roundingMode == float_round_up ) ) 4198 || ( ! zSign && ( roundingMode == float_round_down ) ) 4199 || (roundingMode == float_round_to_odd) 4200 ) { 4201 return 4202 packFloat128( 4203 zSign, 4204 0x7FFE, 4205 LIT64( 0x0000FFFFFFFFFFFF ), 4206 LIT64( 0xFFFFFFFFFFFFFFFF ) 4207 ); 4208 } 4209 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4210 } 4211 if ( zExp < 0 ) { 4212 if (status->flush_to_zero) { 4213 float_raise(float_flag_output_denormal, status); 4214 return packFloat128(zSign, 0, 0, 0); 4215 } 4216 isTiny = 4217 (status->float_detect_tininess 4218 == float_tininess_before_rounding) 4219 || ( zExp < -1 ) 4220 || ! 
increment 4221 || lt128( 4222 zSig0, 4223 zSig1, 4224 LIT64( 0x0001FFFFFFFFFFFF ), 4225 LIT64( 0xFFFFFFFFFFFFFFFF ) 4226 ); 4227 shift128ExtraRightJamming( 4228 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4229 zExp = 0; 4230 if (isTiny && zSig2) { 4231 float_raise(float_flag_underflow, status); 4232 } 4233 switch (roundingMode) { 4234 case float_round_nearest_even: 4235 case float_round_ties_away: 4236 increment = ((int64_t)zSig2 < 0); 4237 break; 4238 case float_round_to_zero: 4239 increment = 0; 4240 break; 4241 case float_round_up: 4242 increment = !zSign && zSig2; 4243 break; 4244 case float_round_down: 4245 increment = zSign && zSig2; 4246 break; 4247 case float_round_to_odd: 4248 increment = !(zSig1 & 0x1) && zSig2; 4249 break; 4250 default: 4251 abort(); 4252 } 4253 } 4254 } 4255 if (zSig2) { 4256 status->float_exception_flags |= float_flag_inexact; 4257 } 4258 if ( increment ) { 4259 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4260 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 4261 } 4262 else { 4263 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4264 } 4265 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4266 4267 } 4268 4269 /*---------------------------------------------------------------------------- 4270 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4271 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4272 | returns the proper quadruple-precision floating-point value corresponding 4273 | to the abstract input. This routine is just like `roundAndPackFloat128' 4274 | except that the input significand has fewer bits and does not have to be 4275 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4276 | point exponent. 
4277 *----------------------------------------------------------------------------*/ 4278 4279 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 4280 uint64_t zSig0, uint64_t zSig1, 4281 float_status *status) 4282 { 4283 int8_t shiftCount; 4284 uint64_t zSig2; 4285 4286 if ( zSig0 == 0 ) { 4287 zSig0 = zSig1; 4288 zSig1 = 0; 4289 zExp -= 64; 4290 } 4291 shiftCount = clz64(zSig0) - 15; 4292 if ( 0 <= shiftCount ) { 4293 zSig2 = 0; 4294 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4295 } 4296 else { 4297 shift128ExtraRightJamming( 4298 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4299 } 4300 zExp -= shiftCount; 4301 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4302 4303 } 4304 4305 4306 /*---------------------------------------------------------------------------- 4307 | Returns the result of converting the 32-bit two's complement integer `a' 4308 | to the extended double-precision floating-point format. The conversion 4309 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4310 | Arithmetic. 4311 *----------------------------------------------------------------------------*/ 4312 4313 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4314 { 4315 flag zSign; 4316 uint32_t absA; 4317 int8_t shiftCount; 4318 uint64_t zSig; 4319 4320 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4321 zSign = ( a < 0 ); 4322 absA = zSign ? - a : a; 4323 shiftCount = clz32(absA) + 32; 4324 zSig = absA; 4325 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4326 4327 } 4328 4329 /*---------------------------------------------------------------------------- 4330 | Returns the result of converting the 32-bit two's complement integer `a' to 4331 | the quadruple-precision floating-point format. The conversion is performed 4332 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4333 *----------------------------------------------------------------------------*/ 4334 4335 float128 int32_to_float128(int32_t a, float_status *status) 4336 { 4337 flag zSign; 4338 uint32_t absA; 4339 int8_t shiftCount; 4340 uint64_t zSig0; 4341 4342 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4343 zSign = ( a < 0 ); 4344 absA = zSign ? - a : a; 4345 shiftCount = clz32(absA) + 17; 4346 zSig0 = absA; 4347 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4348 4349 } 4350 4351 /*---------------------------------------------------------------------------- 4352 | Returns the result of converting the 64-bit two's complement integer `a' 4353 | to the extended double-precision floating-point format. The conversion 4354 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4355 | Arithmetic. 4356 *----------------------------------------------------------------------------*/ 4357 4358 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4359 { 4360 flag zSign; 4361 uint64_t absA; 4362 int8_t shiftCount; 4363 4364 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4365 zSign = ( a < 0 ); 4366 absA = zSign ? - a : a; 4367 shiftCount = clz64(absA); 4368 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4369 4370 } 4371 4372 /*---------------------------------------------------------------------------- 4373 | Returns the result of converting the 64-bit two's complement integer `a' to 4374 | the quadruple-precision floating-point format. The conversion is performed 4375 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4376 *----------------------------------------------------------------------------*/ 4377 4378 float128 int64_to_float128(int64_t a, float_status *status) 4379 { 4380 flag zSign; 4381 uint64_t absA; 4382 int8_t shiftCount; 4383 int32_t zExp; 4384 uint64_t zSig0, zSig1; 4385 4386 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4387 zSign = ( a < 0 ); 4388 absA = zSign ? - a : a; 4389 shiftCount = clz64(absA) + 49; 4390 zExp = 0x406E - shiftCount; 4391 if ( 64 <= shiftCount ) { 4392 zSig1 = 0; 4393 zSig0 = absA; 4394 shiftCount -= 64; 4395 } 4396 else { 4397 zSig1 = absA; 4398 zSig0 = 0; 4399 } 4400 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4401 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4402 4403 } 4404 4405 /*---------------------------------------------------------------------------- 4406 | Returns the result of converting the 64-bit unsigned integer `a' 4407 | to the quadruple-precision floating-point format. The conversion is performed 4408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4409 *----------------------------------------------------------------------------*/ 4410 4411 float128 uint64_to_float128(uint64_t a, float_status *status) 4412 { 4413 if (a == 0) { 4414 return float128_zero; 4415 } 4416 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 4417 } 4418 4419 /*---------------------------------------------------------------------------- 4420 | Returns the result of converting the single-precision floating-point value 4421 | `a' to the extended double-precision floating-point format. The conversion 4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4423 | Arithmetic. 
4424 *----------------------------------------------------------------------------*/ 4425 4426 floatx80 float32_to_floatx80(float32 a, float_status *status) 4427 { 4428 flag aSign; 4429 int aExp; 4430 uint32_t aSig; 4431 4432 a = float32_squash_input_denormal(a, status); 4433 aSig = extractFloat32Frac( a ); 4434 aExp = extractFloat32Exp( a ); 4435 aSign = extractFloat32Sign( a ); 4436 if ( aExp == 0xFF ) { 4437 if (aSig) { 4438 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 4439 } 4440 return packFloatx80(aSign, 4441 floatx80_infinity_high, 4442 floatx80_infinity_low); 4443 } 4444 if ( aExp == 0 ) { 4445 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4446 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4447 } 4448 aSig |= 0x00800000; 4449 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4450 4451 } 4452 4453 /*---------------------------------------------------------------------------- 4454 | Returns the result of converting the single-precision floating-point value 4455 | `a' to the double-precision floating-point format. The conversion is 4456 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4457 | Arithmetic. 
4458 *----------------------------------------------------------------------------*/ 4459 4460 float128 float32_to_float128(float32 a, float_status *status) 4461 { 4462 flag aSign; 4463 int aExp; 4464 uint32_t aSig; 4465 4466 a = float32_squash_input_denormal(a, status); 4467 aSig = extractFloat32Frac( a ); 4468 aExp = extractFloat32Exp( a ); 4469 aSign = extractFloat32Sign( a ); 4470 if ( aExp == 0xFF ) { 4471 if (aSig) { 4472 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 4473 } 4474 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4475 } 4476 if ( aExp == 0 ) { 4477 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4478 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4479 --aExp; 4480 } 4481 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 4482 4483 } 4484 4485 /*---------------------------------------------------------------------------- 4486 | Returns the remainder of the single-precision floating-point value `a' 4487 | with respect to the corresponding value `b'. The operation is performed 4488 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4489 *----------------------------------------------------------------------------*/ 4490 4491 float32 float32_rem(float32 a, float32 b, float_status *status) 4492 { 4493 flag aSign, zSign; 4494 int aExp, bExp, expDiff; 4495 uint32_t aSig, bSig; 4496 uint32_t q; 4497 uint64_t aSig64, bSig64, q64; 4498 uint32_t alternateASig; 4499 int32_t sigMean; 4500 a = float32_squash_input_denormal(a, status); 4501 b = float32_squash_input_denormal(b, status); 4502 4503 aSig = extractFloat32Frac( a ); 4504 aExp = extractFloat32Exp( a ); 4505 aSign = extractFloat32Sign( a ); 4506 bSig = extractFloat32Frac( b ); 4507 bExp = extractFloat32Exp( b ); 4508 if ( aExp == 0xFF ) { 4509 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 4510 return propagateFloat32NaN(a, b, status); 4511 } 4512 float_raise(float_flag_invalid, status); 4513 return float32_default_nan(status); 4514 } 4515 if ( bExp == 0xFF ) { 4516 if (bSig) { 4517 return propagateFloat32NaN(a, b, status); 4518 } 4519 return a; 4520 } 4521 if ( bExp == 0 ) { 4522 if ( bSig == 0 ) { 4523 float_raise(float_flag_invalid, status); 4524 return float32_default_nan(status); 4525 } 4526 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 4527 } 4528 if ( aExp == 0 ) { 4529 if ( aSig == 0 ) return a; 4530 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4531 } 4532 expDiff = aExp - bExp; 4533 aSig |= 0x00800000; 4534 bSig |= 0x00800000; 4535 if ( expDiff < 32 ) { 4536 aSig <<= 8; 4537 bSig <<= 8; 4538 if ( expDiff < 0 ) { 4539 if ( expDiff < -1 ) return a; 4540 aSig >>= 1; 4541 } 4542 q = ( bSig <= aSig ); 4543 if ( q ) aSig -= bSig; 4544 if ( 0 < expDiff ) { 4545 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 4546 q >>= 32 - expDiff; 4547 bSig >>= 2; 4548 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4549 } 4550 else { 4551 aSig >>= 2; 4552 bSig >>= 2; 4553 } 4554 } 4555 else { 4556 if ( bSig <= aSig ) aSig -= bSig; 4557 aSig64 = ( (uint64_t) aSig )<<40; 4558 bSig64 = ( (uint64_t) bSig )<<40; 4559 expDiff -= 64; 4560 while ( 0 < expDiff ) { 
4561 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4562 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4563 aSig64 = - ( ( bSig * q64 )<<38 ); 4564 expDiff -= 62; 4565 } 4566 expDiff += 64; 4567 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 4568 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 4569 q = q64>>( 64 - expDiff ); 4570 bSig <<= 6; 4571 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 4572 } 4573 do { 4574 alternateASig = aSig; 4575 ++q; 4576 aSig -= bSig; 4577 } while ( 0 <= (int32_t) aSig ); 4578 sigMean = aSig + alternateASig; 4579 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4580 aSig = alternateASig; 4581 } 4582 zSign = ( (int32_t) aSig < 0 ); 4583 if ( zSign ) aSig = - aSig; 4584 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 4585 } 4586 4587 4588 4589 /*---------------------------------------------------------------------------- 4590 | Returns the binary exponential of the single-precision floating-point value 4591 | `a'. The operation is performed according to the IEC/IEEE Standard for 4592 | Binary Floating-Point Arithmetic. 4593 | 4594 | Uses the following identities: 4595 | 4596 | 1. ------------------------------------------------------------------------- 4597 | x x*ln(2) 4598 | 2 = e 4599 | 4600 | 2. ------------------------------------------------------------------------- 4601 | 2 3 4 5 n 4602 | x x x x x x x 4603 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 4604 | 1! 2! 3! 4! 5! n! 
4605 *----------------------------------------------------------------------------*/ 4606 4607 static const float64 float32_exp2_coefficients[15] = 4608 { 4609 const_float64( 0x3ff0000000000000ll ), /* 1 */ 4610 const_float64( 0x3fe0000000000000ll ), /* 2 */ 4611 const_float64( 0x3fc5555555555555ll ), /* 3 */ 4612 const_float64( 0x3fa5555555555555ll ), /* 4 */ 4613 const_float64( 0x3f81111111111111ll ), /* 5 */ 4614 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 4615 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 4616 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 4617 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 4618 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 4619 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 4620 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 4621 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 4622 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 4623 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 4624 }; 4625 4626 float32 float32_exp2(float32 a, float_status *status) 4627 { 4628 flag aSign; 4629 int aExp; 4630 uint32_t aSig; 4631 float64 r, x, xn; 4632 int i; 4633 a = float32_squash_input_denormal(a, status); 4634 4635 aSig = extractFloat32Frac( a ); 4636 aExp = extractFloat32Exp( a ); 4637 aSign = extractFloat32Sign( a ); 4638 4639 if ( aExp == 0xFF) { 4640 if (aSig) { 4641 return propagateFloat32NaN(a, float32_zero, status); 4642 } 4643 return (aSign) ? 
float32_zero : a; 4644 } 4645 if (aExp == 0) { 4646 if (aSig == 0) return float32_one; 4647 } 4648 4649 float_raise(float_flag_inexact, status); 4650 4651 /* ******************************* */ 4652 /* using float64 for approximation */ 4653 /* ******************************* */ 4654 x = float32_to_float64(a, status); 4655 x = float64_mul(x, float64_ln2, status); 4656 4657 xn = x; 4658 r = float64_one; 4659 for (i = 0 ; i < 15 ; i++) { 4660 float64 f; 4661 4662 f = float64_mul(xn, float32_exp2_coefficients[i], status); 4663 r = float64_add(r, f, status); 4664 4665 xn = float64_mul(xn, x, status); 4666 } 4667 4668 return float64_to_float32(r, status); 4669 } 4670 4671 /*---------------------------------------------------------------------------- 4672 | Returns the binary log of the single-precision floating-point value `a'. 4673 | The operation is performed according to the IEC/IEEE Standard for Binary 4674 | Floating-Point Arithmetic. 4675 *----------------------------------------------------------------------------*/ 4676 float32 float32_log2(float32 a, float_status *status) 4677 { 4678 flag aSign, zSign; 4679 int aExp; 4680 uint32_t aSig, zSig, i; 4681 4682 a = float32_squash_input_denormal(a, status); 4683 aSig = extractFloat32Frac( a ); 4684 aExp = extractFloat32Exp( a ); 4685 aSign = extractFloat32Sign( a ); 4686 4687 if ( aExp == 0 ) { 4688 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 4689 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4690 } 4691 if ( aSign ) { 4692 float_raise(float_flag_invalid, status); 4693 return float32_default_nan(status); 4694 } 4695 if ( aExp == 0xFF ) { 4696 if (aSig) { 4697 return propagateFloat32NaN(a, float32_zero, status); 4698 } 4699 return a; 4700 } 4701 4702 aExp -= 0x7F; 4703 aSig |= 0x00800000; 4704 zSign = aExp < 0; 4705 zSig = aExp << 23; 4706 4707 for (i = 1 << 22; i > 0; i >>= 1) { 4708 aSig = ( (uint64_t)aSig * aSig ) >> 23; 4709 if ( aSig & 0x01000000 ) { 4710 aSig >>= 1; 4711 zSig |= i; 4712 } 4713 } 4714 
4715 if ( zSign ) 4716 zSig = -zSig; 4717 4718 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 4719 } 4720 4721 /*---------------------------------------------------------------------------- 4722 | Returns 1 if the single-precision floating-point value `a' is equal to 4723 | the corresponding value `b', and 0 otherwise. The invalid exception is 4724 | raised if either operand is a NaN. Otherwise, the comparison is performed 4725 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4726 *----------------------------------------------------------------------------*/ 4727 4728 int float32_eq(float32 a, float32 b, float_status *status) 4729 { 4730 uint32_t av, bv; 4731 a = float32_squash_input_denormal(a, status); 4732 b = float32_squash_input_denormal(b, status); 4733 4734 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4735 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4736 ) { 4737 float_raise(float_flag_invalid, status); 4738 return 0; 4739 } 4740 av = float32_val(a); 4741 bv = float32_val(b); 4742 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4743 } 4744 4745 /*---------------------------------------------------------------------------- 4746 | Returns 1 if the single-precision floating-point value `a' is less than 4747 | or equal to the corresponding value `b', and 0 otherwise. The invalid 4748 | exception is raised if either operand is a NaN. The comparison is performed 4749 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4750 *----------------------------------------------------------------------------*/ 4751 4752 int float32_le(float32 a, float32 b, float_status *status) 4753 { 4754 flag aSign, bSign; 4755 uint32_t av, bv; 4756 a = float32_squash_input_denormal(a, status); 4757 b = float32_squash_input_denormal(b, status); 4758 4759 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4760 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4761 ) { 4762 float_raise(float_flag_invalid, status); 4763 return 0; 4764 } 4765 aSign = extractFloat32Sign( a ); 4766 bSign = extractFloat32Sign( b ); 4767 av = float32_val(a); 4768 bv = float32_val(b); 4769 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4770 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4771 4772 } 4773 4774 /*---------------------------------------------------------------------------- 4775 | Returns 1 if the single-precision floating-point value `a' is less than 4776 | the corresponding value `b', and 0 otherwise. The invalid exception is 4777 | raised if either operand is a NaN. The comparison is performed according 4778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4779 *----------------------------------------------------------------------------*/ 4780 4781 int float32_lt(float32 a, float32 b, float_status *status) 4782 { 4783 flag aSign, bSign; 4784 uint32_t av, bv; 4785 a = float32_squash_input_denormal(a, status); 4786 b = float32_squash_input_denormal(b, status); 4787 4788 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4789 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4790 ) { 4791 float_raise(float_flag_invalid, status); 4792 return 0; 4793 } 4794 aSign = extractFloat32Sign( a ); 4795 bSign = extractFloat32Sign( b ); 4796 av = float32_val(a); 4797 bv = float32_val(b); 4798 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4799 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4800 4801 } 4802 4803 /*---------------------------------------------------------------------------- 4804 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4805 | be compared, and 0 otherwise. The invalid exception is raised if either 4806 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4807 | Standard for Binary Floating-Point Arithmetic. 4808 *----------------------------------------------------------------------------*/ 4809 4810 int float32_unordered(float32 a, float32 b, float_status *status) 4811 { 4812 a = float32_squash_input_denormal(a, status); 4813 b = float32_squash_input_denormal(b, status); 4814 4815 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4816 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4817 ) { 4818 float_raise(float_flag_invalid, status); 4819 return 1; 4820 } 4821 return 0; 4822 } 4823 4824 /*---------------------------------------------------------------------------- 4825 | Returns 1 if the single-precision floating-point value `a' is equal to 4826 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4827 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4828 | for Binary Floating-Point Arithmetic. 4829 *----------------------------------------------------------------------------*/ 4830 4831 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4832 { 4833 a = float32_squash_input_denormal(a, status); 4834 b = float32_squash_input_denormal(b, status); 4835 4836 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4837 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4838 ) { 4839 if (float32_is_signaling_nan(a, status) 4840 || float32_is_signaling_nan(b, status)) { 4841 float_raise(float_flag_invalid, status); 4842 } 4843 return 0; 4844 } 4845 return ( float32_val(a) == float32_val(b) ) || 4846 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4847 } 4848 4849 /*---------------------------------------------------------------------------- 4850 | Returns 1 if the single-precision floating-point value `a' is less than or 4851 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4852 | cause an exception. Otherwise, the comparison is performed according to the 4853 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4854 *----------------------------------------------------------------------------*/ 4855 4856 int float32_le_quiet(float32 a, float32 b, float_status *status) 4857 { 4858 flag aSign, bSign; 4859 uint32_t av, bv; 4860 a = float32_squash_input_denormal(a, status); 4861 b = float32_squash_input_denormal(b, status); 4862 4863 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4864 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4865 ) { 4866 if (float32_is_signaling_nan(a, status) 4867 || float32_is_signaling_nan(b, status)) { 4868 float_raise(float_flag_invalid, status); 4869 } 4870 return 0; 4871 } 4872 aSign = extractFloat32Sign( a ); 4873 bSign = extractFloat32Sign( b ); 4874 av = float32_val(a); 4875 bv = float32_val(b); 4876 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4877 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4878 4879 } 4880 4881 /*---------------------------------------------------------------------------- 4882 | Returns 1 if the single-precision floating-point value `a' is less than 4883 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4884 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4885 | Standard for Binary Floating-Point Arithmetic. 
4886 *----------------------------------------------------------------------------*/ 4887 4888 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4889 { 4890 flag aSign, bSign; 4891 uint32_t av, bv; 4892 a = float32_squash_input_denormal(a, status); 4893 b = float32_squash_input_denormal(b, status); 4894 4895 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4896 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4897 ) { 4898 if (float32_is_signaling_nan(a, status) 4899 || float32_is_signaling_nan(b, status)) { 4900 float_raise(float_flag_invalid, status); 4901 } 4902 return 0; 4903 } 4904 aSign = extractFloat32Sign( a ); 4905 bSign = extractFloat32Sign( b ); 4906 av = float32_val(a); 4907 bv = float32_val(b); 4908 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4909 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4910 4911 } 4912 4913 /*---------------------------------------------------------------------------- 4914 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4915 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4916 | comparison is performed according to the IEC/IEEE Standard for Binary 4917 | Floating-Point Arithmetic. 
4918 *----------------------------------------------------------------------------*/ 4919 4920 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4921 { 4922 a = float32_squash_input_denormal(a, status); 4923 b = float32_squash_input_denormal(b, status); 4924 4925 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4926 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4927 ) { 4928 if (float32_is_signaling_nan(a, status) 4929 || float32_is_signaling_nan(b, status)) { 4930 float_raise(float_flag_invalid, status); 4931 } 4932 return 1; 4933 } 4934 return 0; 4935 } 4936 4937 /*---------------------------------------------------------------------------- 4938 | If `a' is denormal and we are in flush-to-zero mode then set the 4939 | input-denormal exception and return zero. Otherwise just return the value. 4940 *----------------------------------------------------------------------------*/ 4941 float16 float16_squash_input_denormal(float16 a, float_status *status) 4942 { 4943 if (status->flush_inputs_to_zero) { 4944 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4945 float_raise(float_flag_input_denormal, status); 4946 return make_float16(float16_val(a) & 0x8000); 4947 } 4948 } 4949 return a; 4950 } 4951 4952 /*---------------------------------------------------------------------------- 4953 | Returns the result of converting the double-precision floating-point value 4954 | `a' to the extended double-precision floating-point format. The conversion 4955 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4956 | Arithmetic. 
4957 *----------------------------------------------------------------------------*/ 4958 4959 floatx80 float64_to_floatx80(float64 a, float_status *status) 4960 { 4961 flag aSign; 4962 int aExp; 4963 uint64_t aSig; 4964 4965 a = float64_squash_input_denormal(a, status); 4966 aSig = extractFloat64Frac( a ); 4967 aExp = extractFloat64Exp( a ); 4968 aSign = extractFloat64Sign( a ); 4969 if ( aExp == 0x7FF ) { 4970 if (aSig) { 4971 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4972 } 4973 return packFloatx80(aSign, 4974 floatx80_infinity_high, 4975 floatx80_infinity_low); 4976 } 4977 if ( aExp == 0 ) { 4978 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4979 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4980 } 4981 return 4982 packFloatx80( 4983 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4984 4985 } 4986 4987 /*---------------------------------------------------------------------------- 4988 | Returns the result of converting the double-precision floating-point value 4989 | `a' to the quadruple-precision floating-point format. The conversion is 4990 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4991 | Arithmetic. 
4992 *----------------------------------------------------------------------------*/ 4993 4994 float128 float64_to_float128(float64 a, float_status *status) 4995 { 4996 flag aSign; 4997 int aExp; 4998 uint64_t aSig, zSig0, zSig1; 4999 5000 a = float64_squash_input_denormal(a, status); 5001 aSig = extractFloat64Frac( a ); 5002 aExp = extractFloat64Exp( a ); 5003 aSign = extractFloat64Sign( a ); 5004 if ( aExp == 0x7FF ) { 5005 if (aSig) { 5006 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5007 } 5008 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5009 } 5010 if ( aExp == 0 ) { 5011 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5012 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5013 --aExp; 5014 } 5015 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5016 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5017 5018 } 5019 5020 5021 /*---------------------------------------------------------------------------- 5022 | Returns the remainder of the double-precision floating-point value `a' 5023 | with respect to the corresponding value `b'. The operation is performed 5024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5025 *----------------------------------------------------------------------------*/ 5026 5027 float64 float64_rem(float64 a, float64 b, float_status *status) 5028 { 5029 flag aSign, zSign; 5030 int aExp, bExp, expDiff; 5031 uint64_t aSig, bSig; 5032 uint64_t q, alternateASig; 5033 int64_t sigMean; 5034 5035 a = float64_squash_input_denormal(a, status); 5036 b = float64_squash_input_denormal(b, status); 5037 aSig = extractFloat64Frac( a ); 5038 aExp = extractFloat64Exp( a ); 5039 aSign = extractFloat64Sign( a ); 5040 bSig = extractFloat64Frac( b ); 5041 bExp = extractFloat64Exp( b ); 5042 if ( aExp == 0x7FF ) { 5043 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5044 return propagateFloat64NaN(a, b, status); 5045 } 5046 float_raise(float_flag_invalid, status); 5047 return float64_default_nan(status); 5048 } 5049 if ( bExp == 0x7FF ) { 5050 if (bSig) { 5051 return propagateFloat64NaN(a, b, status); 5052 } 5053 return a; 5054 } 5055 if ( bExp == 0 ) { 5056 if ( bSig == 0 ) { 5057 float_raise(float_flag_invalid, status); 5058 return float64_default_nan(status); 5059 } 5060 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5061 } 5062 if ( aExp == 0 ) { 5063 if ( aSig == 0 ) return a; 5064 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5065 } 5066 expDiff = aExp - bExp; 5067 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 5068 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 5069 if ( expDiff < 0 ) { 5070 if ( expDiff < -1 ) return a; 5071 aSig >>= 1; 5072 } 5073 q = ( bSig <= aSig ); 5074 if ( q ) aSig -= bSig; 5075 expDiff -= 64; 5076 while ( 0 < expDiff ) { 5077 q = estimateDiv128To64( aSig, 0, bSig ); 5078 q = ( 2 < q ) ? q - 2 : 0; 5079 aSig = - ( ( bSig>>2 ) * q ); 5080 expDiff -= 62; 5081 } 5082 expDiff += 64; 5083 if ( 0 < expDiff ) { 5084 q = estimateDiv128To64( aSig, 0, bSig ); 5085 q = ( 2 < q ) ? 
q - 2 : 0; 5086 q >>= 64 - expDiff; 5087 bSig >>= 2; 5088 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5089 } 5090 else { 5091 aSig >>= 2; 5092 bSig >>= 2; 5093 } 5094 do { 5095 alternateASig = aSig; 5096 ++q; 5097 aSig -= bSig; 5098 } while ( 0 <= (int64_t) aSig ); 5099 sigMean = aSig + alternateASig; 5100 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5101 aSig = alternateASig; 5102 } 5103 zSign = ( (int64_t) aSig < 0 ); 5104 if ( zSign ) aSig = - aSig; 5105 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5106 5107 } 5108 5109 /*---------------------------------------------------------------------------- 5110 | Returns the binary log of the double-precision floating-point value `a'. 5111 | The operation is performed according to the IEC/IEEE Standard for Binary 5112 | Floating-Point Arithmetic. 5113 *----------------------------------------------------------------------------*/ 5114 float64 float64_log2(float64 a, float_status *status) 5115 { 5116 flag aSign, zSign; 5117 int aExp; 5118 uint64_t aSig, aSig0, aSig1, zSig, i; 5119 a = float64_squash_input_denormal(a, status); 5120 5121 aSig = extractFloat64Frac( a ); 5122 aExp = extractFloat64Exp( a ); 5123 aSign = extractFloat64Sign( a ); 5124 5125 if ( aExp == 0 ) { 5126 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5127 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5128 } 5129 if ( aSign ) { 5130 float_raise(float_flag_invalid, status); 5131 return float64_default_nan(status); 5132 } 5133 if ( aExp == 0x7FF ) { 5134 if (aSig) { 5135 return propagateFloat64NaN(a, float64_zero, status); 5136 } 5137 return a; 5138 } 5139 5140 aExp -= 0x3FF; 5141 aSig |= LIT64( 0x0010000000000000 ); 5142 zSign = aExp < 0; 5143 zSig = (uint64_t)aExp << 52; 5144 for (i = 1LL << 51; i > 0; i >>= 1) { 5145 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5146 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5147 if ( aSig & LIT64( 0x0020000000000000 ) ) { 5148 aSig >>= 1; 5149 zSig |= i; 5150 } 
5151 } 5152 5153 if ( zSign ) 5154 zSig = -zSig; 5155 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5156 } 5157 5158 /*---------------------------------------------------------------------------- 5159 | Returns 1 if the double-precision floating-point value `a' is equal to the 5160 | corresponding value `b', and 0 otherwise. The invalid exception is raised 5161 | if either operand is a NaN. Otherwise, the comparison is performed 5162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5163 *----------------------------------------------------------------------------*/ 5164 5165 int float64_eq(float64 a, float64 b, float_status *status) 5166 { 5167 uint64_t av, bv; 5168 a = float64_squash_input_denormal(a, status); 5169 b = float64_squash_input_denormal(b, status); 5170 5171 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5172 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5173 ) { 5174 float_raise(float_flag_invalid, status); 5175 return 0; 5176 } 5177 av = float64_val(a); 5178 bv = float64_val(b); 5179 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5180 5181 } 5182 5183 /*---------------------------------------------------------------------------- 5184 | Returns 1 if the double-precision floating-point value `a' is less than or 5185 | equal to the corresponding value `b', and 0 otherwise. The invalid 5186 | exception is raised if either operand is a NaN. The comparison is performed 5187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5188 *----------------------------------------------------------------------------*/ 5189 5190 int float64_le(float64 a, float64 b, float_status *status) 5191 { 5192 flag aSign, bSign; 5193 uint64_t av, bv; 5194 a = float64_squash_input_denormal(a, status); 5195 b = float64_squash_input_denormal(b, status); 5196 5197 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5198 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5199 ) { 5200 float_raise(float_flag_invalid, status); 5201 return 0; 5202 } 5203 aSign = extractFloat64Sign( a ); 5204 bSign = extractFloat64Sign( b ); 5205 av = float64_val(a); 5206 bv = float64_val(b); 5207 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5208 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5209 5210 } 5211 5212 /*---------------------------------------------------------------------------- 5213 | Returns 1 if the double-precision floating-point value `a' is less than 5214 | the corresponding value `b', and 0 otherwise. The invalid exception is 5215 | raised if either operand is a NaN. The comparison is performed according 5216 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5217 *----------------------------------------------------------------------------*/ 5218 5219 int float64_lt(float64 a, float64 b, float_status *status) 5220 { 5221 flag aSign, bSign; 5222 uint64_t av, bv; 5223 5224 a = float64_squash_input_denormal(a, status); 5225 b = float64_squash_input_denormal(b, status); 5226 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5227 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5228 ) { 5229 float_raise(float_flag_invalid, status); 5230 return 0; 5231 } 5232 aSign = extractFloat64Sign( a ); 5233 bSign = extractFloat64Sign( b ); 5234 av = float64_val(a); 5235 bv = float64_val(b); 5236 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5237 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5238 5239 } 5240 5241 /*---------------------------------------------------------------------------- 5242 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5243 | be compared, and 0 otherwise. The invalid exception is raised if either 5244 | operand is a NaN. The comparison is performed according to the IEC/IEEE 5245 | Standard for Binary Floating-Point Arithmetic. 5246 *----------------------------------------------------------------------------*/ 5247 5248 int float64_unordered(float64 a, float64 b, float_status *status) 5249 { 5250 a = float64_squash_input_denormal(a, status); 5251 b = float64_squash_input_denormal(b, status); 5252 5253 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5254 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5255 ) { 5256 float_raise(float_flag_invalid, status); 5257 return 1; 5258 } 5259 return 0; 5260 } 5261 5262 /*---------------------------------------------------------------------------- 5263 | Returns 1 if the double-precision floating-point value `a' is equal to the 5264 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 5265 | exception.The comparison is performed according to the IEC/IEEE Standard 5266 | for Binary Floating-Point Arithmetic. 5267 *----------------------------------------------------------------------------*/ 5268 5269 int float64_eq_quiet(float64 a, float64 b, float_status *status) 5270 { 5271 uint64_t av, bv; 5272 a = float64_squash_input_denormal(a, status); 5273 b = float64_squash_input_denormal(b, status); 5274 5275 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5276 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5277 ) { 5278 if (float64_is_signaling_nan(a, status) 5279 || float64_is_signaling_nan(b, status)) { 5280 float_raise(float_flag_invalid, status); 5281 } 5282 return 0; 5283 } 5284 av = float64_val(a); 5285 bv = float64_val(b); 5286 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5287 5288 } 5289 5290 /*---------------------------------------------------------------------------- 5291 | Returns 1 if the double-precision floating-point value `a' is less than or 5292 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5293 | cause an exception. Otherwise, the comparison is performed according to the 5294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5295 *----------------------------------------------------------------------------*/ 5296 5297 int float64_le_quiet(float64 a, float64 b, float_status *status) 5298 { 5299 flag aSign, bSign; 5300 uint64_t av, bv; 5301 a = float64_squash_input_denormal(a, status); 5302 b = float64_squash_input_denormal(b, status); 5303 5304 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5305 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5306 ) { 5307 if (float64_is_signaling_nan(a, status) 5308 || float64_is_signaling_nan(b, status)) { 5309 float_raise(float_flag_invalid, status); 5310 } 5311 return 0; 5312 } 5313 aSign = extractFloat64Sign( a ); 5314 bSign = extractFloat64Sign( b ); 5315 av = float64_val(a); 5316 bv = float64_val(b); 5317 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 5318 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 5319 5320 } 5321 5322 /*---------------------------------------------------------------------------- 5323 | Returns 1 if the double-precision floating-point value `a' is less than 5324 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 5325 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 5326 | Standard for Binary Floating-Point Arithmetic. 
5327 *----------------------------------------------------------------------------*/ 5328 5329 int float64_lt_quiet(float64 a, float64 b, float_status *status) 5330 { 5331 flag aSign, bSign; 5332 uint64_t av, bv; 5333 a = float64_squash_input_denormal(a, status); 5334 b = float64_squash_input_denormal(b, status); 5335 5336 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5337 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5338 ) { 5339 if (float64_is_signaling_nan(a, status) 5340 || float64_is_signaling_nan(b, status)) { 5341 float_raise(float_flag_invalid, status); 5342 } 5343 return 0; 5344 } 5345 aSign = extractFloat64Sign( a ); 5346 bSign = extractFloat64Sign( b ); 5347 av = float64_val(a); 5348 bv = float64_val(b); 5349 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 5350 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 5351 5352 } 5353 5354 /*---------------------------------------------------------------------------- 5355 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 5356 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 5357 | comparison is performed according to the IEC/IEEE Standard for Binary 5358 | Floating-Point Arithmetic. 
5359 *----------------------------------------------------------------------------*/ 5360 5361 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 5362 { 5363 a = float64_squash_input_denormal(a, status); 5364 b = float64_squash_input_denormal(b, status); 5365 5366 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 5367 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 5368 ) { 5369 if (float64_is_signaling_nan(a, status) 5370 || float64_is_signaling_nan(b, status)) { 5371 float_raise(float_flag_invalid, status); 5372 } 5373 return 1; 5374 } 5375 return 0; 5376 } 5377 5378 /*---------------------------------------------------------------------------- 5379 | Returns the result of converting the extended double-precision floating- 5380 | point value `a' to the 32-bit two's complement integer format. The 5381 | conversion is performed according to the IEC/IEEE Standard for Binary 5382 | Floating-Point Arithmetic---which means in particular that the conversion 5383 | is rounded according to the current rounding mode. If `a' is a NaN, the 5384 | largest positive integer is returned. Otherwise, if the conversion 5385 | overflows, the largest integer with the same sign as `a' is returned. 
5386 *----------------------------------------------------------------------------*/ 5387 5388 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5389 { 5390 flag aSign; 5391 int32_t aExp, shiftCount; 5392 uint64_t aSig; 5393 5394 if (floatx80_invalid_encoding(a)) { 5395 float_raise(float_flag_invalid, status); 5396 return 1 << 31; 5397 } 5398 aSig = extractFloatx80Frac( a ); 5399 aExp = extractFloatx80Exp( a ); 5400 aSign = extractFloatx80Sign( a ); 5401 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5402 shiftCount = 0x4037 - aExp; 5403 if ( shiftCount <= 0 ) shiftCount = 1; 5404 shift64RightJamming( aSig, shiftCount, &aSig ); 5405 return roundAndPackInt32(aSign, aSig, status); 5406 5407 } 5408 5409 /*---------------------------------------------------------------------------- 5410 | Returns the result of converting the extended double-precision floating- 5411 | point value `a' to the 32-bit two's complement integer format. The 5412 | conversion is performed according to the IEC/IEEE Standard for Binary 5413 | Floating-Point Arithmetic, except that the conversion is always rounded 5414 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5415 | Otherwise, if the conversion overflows, the largest integer with the same 5416 | sign as `a' is returned. 
5417 *----------------------------------------------------------------------------*/ 5418 5419 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5420 { 5421 flag aSign; 5422 int32_t aExp, shiftCount; 5423 uint64_t aSig, savedASig; 5424 int32_t z; 5425 5426 if (floatx80_invalid_encoding(a)) { 5427 float_raise(float_flag_invalid, status); 5428 return 1 << 31; 5429 } 5430 aSig = extractFloatx80Frac( a ); 5431 aExp = extractFloatx80Exp( a ); 5432 aSign = extractFloatx80Sign( a ); 5433 if ( 0x401E < aExp ) { 5434 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5435 goto invalid; 5436 } 5437 else if ( aExp < 0x3FFF ) { 5438 if (aExp || aSig) { 5439 status->float_exception_flags |= float_flag_inexact; 5440 } 5441 return 0; 5442 } 5443 shiftCount = 0x403E - aExp; 5444 savedASig = aSig; 5445 aSig >>= shiftCount; 5446 z = aSig; 5447 if ( aSign ) z = - z; 5448 if ( ( z < 0 ) ^ aSign ) { 5449 invalid: 5450 float_raise(float_flag_invalid, status); 5451 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5452 } 5453 if ( ( aSig<<shiftCount ) != savedASig ) { 5454 status->float_exception_flags |= float_flag_inexact; 5455 } 5456 return z; 5457 5458 } 5459 5460 /*---------------------------------------------------------------------------- 5461 | Returns the result of converting the extended double-precision floating- 5462 | point value `a' to the 64-bit two's complement integer format. The 5463 | conversion is performed according to the IEC/IEEE Standard for Binary 5464 | Floating-Point Arithmetic---which means in particular that the conversion 5465 | is rounded according to the current rounding mode. If `a' is a NaN, 5466 | the largest positive integer is returned. Otherwise, if the conversion 5467 | overflows, the largest integer with the same sign as `a' is returned. 
5468 *----------------------------------------------------------------------------*/ 5469 5470 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5471 { 5472 flag aSign; 5473 int32_t aExp, shiftCount; 5474 uint64_t aSig, aSigExtra; 5475 5476 if (floatx80_invalid_encoding(a)) { 5477 float_raise(float_flag_invalid, status); 5478 return 1ULL << 63; 5479 } 5480 aSig = extractFloatx80Frac( a ); 5481 aExp = extractFloatx80Exp( a ); 5482 aSign = extractFloatx80Sign( a ); 5483 shiftCount = 0x403E - aExp; 5484 if ( shiftCount <= 0 ) { 5485 if ( shiftCount ) { 5486 float_raise(float_flag_invalid, status); 5487 if (!aSign || floatx80_is_any_nan(a)) { 5488 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5489 } 5490 return (int64_t) LIT64( 0x8000000000000000 ); 5491 } 5492 aSigExtra = 0; 5493 } 5494 else { 5495 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5496 } 5497 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5498 5499 } 5500 5501 /*---------------------------------------------------------------------------- 5502 | Returns the result of converting the extended double-precision floating- 5503 | point value `a' to the 64-bit two's complement integer format. The 5504 | conversion is performed according to the IEC/IEEE Standard for Binary 5505 | Floating-Point Arithmetic, except that the conversion is always rounded 5506 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5507 | Otherwise, if the conversion overflows, the largest integer with the same 5508 | sign as `a' is returned. 
5509 *----------------------------------------------------------------------------*/ 5510 5511 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5512 { 5513 flag aSign; 5514 int32_t aExp, shiftCount; 5515 uint64_t aSig; 5516 int64_t z; 5517 5518 if (floatx80_invalid_encoding(a)) { 5519 float_raise(float_flag_invalid, status); 5520 return 1ULL << 63; 5521 } 5522 aSig = extractFloatx80Frac( a ); 5523 aExp = extractFloatx80Exp( a ); 5524 aSign = extractFloatx80Sign( a ); 5525 shiftCount = aExp - 0x403E; 5526 if ( 0 <= shiftCount ) { 5527 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5528 if ( ( a.high != 0xC03E ) || aSig ) { 5529 float_raise(float_flag_invalid, status); 5530 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5531 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5532 } 5533 } 5534 return (int64_t) LIT64( 0x8000000000000000 ); 5535 } 5536 else if ( aExp < 0x3FFF ) { 5537 if (aExp | aSig) { 5538 status->float_exception_flags |= float_flag_inexact; 5539 } 5540 return 0; 5541 } 5542 z = aSig>>( - shiftCount ); 5543 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5544 status->float_exception_flags |= float_flag_inexact; 5545 } 5546 if ( aSign ) z = - z; 5547 return z; 5548 5549 } 5550 5551 /*---------------------------------------------------------------------------- 5552 | Returns the result of converting the extended double-precision floating- 5553 | point value `a' to the single-precision floating-point format. The 5554 | conversion is performed according to the IEC/IEEE Standard for Binary 5555 | Floating-Point Arithmetic. 
5556 *----------------------------------------------------------------------------*/ 5557 5558 float32 floatx80_to_float32(floatx80 a, float_status *status) 5559 { 5560 flag aSign; 5561 int32_t aExp; 5562 uint64_t aSig; 5563 5564 if (floatx80_invalid_encoding(a)) { 5565 float_raise(float_flag_invalid, status); 5566 return float32_default_nan(status); 5567 } 5568 aSig = extractFloatx80Frac( a ); 5569 aExp = extractFloatx80Exp( a ); 5570 aSign = extractFloatx80Sign( a ); 5571 if ( aExp == 0x7FFF ) { 5572 if ( (uint64_t) ( aSig<<1 ) ) { 5573 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5574 } 5575 return packFloat32( aSign, 0xFF, 0 ); 5576 } 5577 shift64RightJamming( aSig, 33, &aSig ); 5578 if ( aExp || aSig ) aExp -= 0x3F81; 5579 return roundAndPackFloat32(aSign, aExp, aSig, status); 5580 5581 } 5582 5583 /*---------------------------------------------------------------------------- 5584 | Returns the result of converting the extended double-precision floating- 5585 | point value `a' to the double-precision floating-point format. The 5586 | conversion is performed according to the IEC/IEEE Standard for Binary 5587 | Floating-Point Arithmetic. 
5588 *----------------------------------------------------------------------------*/ 5589 5590 float64 floatx80_to_float64(floatx80 a, float_status *status) 5591 { 5592 flag aSign; 5593 int32_t aExp; 5594 uint64_t aSig, zSig; 5595 5596 if (floatx80_invalid_encoding(a)) { 5597 float_raise(float_flag_invalid, status); 5598 return float64_default_nan(status); 5599 } 5600 aSig = extractFloatx80Frac( a ); 5601 aExp = extractFloatx80Exp( a ); 5602 aSign = extractFloatx80Sign( a ); 5603 if ( aExp == 0x7FFF ) { 5604 if ( (uint64_t) ( aSig<<1 ) ) { 5605 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5606 } 5607 return packFloat64( aSign, 0x7FF, 0 ); 5608 } 5609 shift64RightJamming( aSig, 1, &zSig ); 5610 if ( aExp || aSig ) aExp -= 0x3C01; 5611 return roundAndPackFloat64(aSign, aExp, zSig, status); 5612 5613 } 5614 5615 /*---------------------------------------------------------------------------- 5616 | Returns the result of converting the extended double-precision floating- 5617 | point value `a' to the quadruple-precision floating-point format. The 5618 | conversion is performed according to the IEC/IEEE Standard for Binary 5619 | Floating-Point Arithmetic. 
5620 *----------------------------------------------------------------------------*/ 5621 5622 float128 floatx80_to_float128(floatx80 a, float_status *status) 5623 { 5624 flag aSign; 5625 int aExp; 5626 uint64_t aSig, zSig0, zSig1; 5627 5628 if (floatx80_invalid_encoding(a)) { 5629 float_raise(float_flag_invalid, status); 5630 return float128_default_nan(status); 5631 } 5632 aSig = extractFloatx80Frac( a ); 5633 aExp = extractFloatx80Exp( a ); 5634 aSign = extractFloatx80Sign( a ); 5635 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5636 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5637 } 5638 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5639 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5640 5641 } 5642 5643 /*---------------------------------------------------------------------------- 5644 | Rounds the extended double-precision floating-point value `a' 5645 | to the precision provided by floatx80_rounding_precision and returns the 5646 | result as an extended double-precision floating-point value. 5647 | The operation is performed according to the IEC/IEEE Standard for Binary 5648 | Floating-Point Arithmetic. 5649 *----------------------------------------------------------------------------*/ 5650 5651 floatx80 floatx80_round(floatx80 a, float_status *status) 5652 { 5653 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5654 extractFloatx80Sign(a), 5655 extractFloatx80Exp(a), 5656 extractFloatx80Frac(a), 0, status); 5657 } 5658 5659 /*---------------------------------------------------------------------------- 5660 | Rounds the extended double-precision floating-point value `a' to an integer, 5661 | and returns the result as an extended quadruple-precision floating-point 5662 | value. The operation is performed according to the IEC/IEEE Standard for 5663 | Binary Floating-Point Arithmetic. 
5664 *----------------------------------------------------------------------------*/ 5665 5666 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5667 { 5668 flag aSign; 5669 int32_t aExp; 5670 uint64_t lastBitMask, roundBitsMask; 5671 floatx80 z; 5672 5673 if (floatx80_invalid_encoding(a)) { 5674 float_raise(float_flag_invalid, status); 5675 return floatx80_default_nan(status); 5676 } 5677 aExp = extractFloatx80Exp( a ); 5678 if ( 0x403E <= aExp ) { 5679 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5680 return propagateFloatx80NaN(a, a, status); 5681 } 5682 return a; 5683 } 5684 if ( aExp < 0x3FFF ) { 5685 if ( ( aExp == 0 ) 5686 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5687 return a; 5688 } 5689 status->float_exception_flags |= float_flag_inexact; 5690 aSign = extractFloatx80Sign( a ); 5691 switch (status->float_rounding_mode) { 5692 case float_round_nearest_even: 5693 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5694 ) { 5695 return 5696 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5697 } 5698 break; 5699 case float_round_ties_away: 5700 if (aExp == 0x3FFE) { 5701 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5702 } 5703 break; 5704 case float_round_down: 5705 return 5706 aSign ? 5707 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5708 : packFloatx80( 0, 0, 0 ); 5709 case float_round_up: 5710 return 5711 aSign ? 
packFloatx80( 1, 0, 0 ) 5712 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5713 } 5714 return packFloatx80( aSign, 0, 0 ); 5715 } 5716 lastBitMask = 1; 5717 lastBitMask <<= 0x403E - aExp; 5718 roundBitsMask = lastBitMask - 1; 5719 z = a; 5720 switch (status->float_rounding_mode) { 5721 case float_round_nearest_even: 5722 z.low += lastBitMask>>1; 5723 if ((z.low & roundBitsMask) == 0) { 5724 z.low &= ~lastBitMask; 5725 } 5726 break; 5727 case float_round_ties_away: 5728 z.low += lastBitMask >> 1; 5729 break; 5730 case float_round_to_zero: 5731 break; 5732 case float_round_up: 5733 if (!extractFloatx80Sign(z)) { 5734 z.low += roundBitsMask; 5735 } 5736 break; 5737 case float_round_down: 5738 if (extractFloatx80Sign(z)) { 5739 z.low += roundBitsMask; 5740 } 5741 break; 5742 default: 5743 abort(); 5744 } 5745 z.low &= ~ roundBitsMask; 5746 if ( z.low == 0 ) { 5747 ++z.high; 5748 z.low = LIT64( 0x8000000000000000 ); 5749 } 5750 if (z.low != a.low) { 5751 status->float_exception_flags |= float_flag_inexact; 5752 } 5753 return z; 5754 5755 } 5756 5757 /*---------------------------------------------------------------------------- 5758 | Returns the result of adding the absolute values of the extended double- 5759 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5760 | negated before being returned. `zSign' is ignored if the result is a NaN. 5761 | The addition is performed according to the IEC/IEEE Standard for Binary 5762 | Floating-Point Arithmetic. 
5763 *----------------------------------------------------------------------------*/ 5764 5765 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5766 float_status *status) 5767 { 5768 int32_t aExp, bExp, zExp; 5769 uint64_t aSig, bSig, zSig0, zSig1; 5770 int32_t expDiff; 5771 5772 aSig = extractFloatx80Frac( a ); 5773 aExp = extractFloatx80Exp( a ); 5774 bSig = extractFloatx80Frac( b ); 5775 bExp = extractFloatx80Exp( b ); 5776 expDiff = aExp - bExp; 5777 if ( 0 < expDiff ) { 5778 if ( aExp == 0x7FFF ) { 5779 if ((uint64_t)(aSig << 1)) { 5780 return propagateFloatx80NaN(a, b, status); 5781 } 5782 return a; 5783 } 5784 if ( bExp == 0 ) --expDiff; 5785 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5786 zExp = aExp; 5787 } 5788 else if ( expDiff < 0 ) { 5789 if ( bExp == 0x7FFF ) { 5790 if ((uint64_t)(bSig << 1)) { 5791 return propagateFloatx80NaN(a, b, status); 5792 } 5793 return packFloatx80(zSign, 5794 floatx80_infinity_high, 5795 floatx80_infinity_low); 5796 } 5797 if ( aExp == 0 ) ++expDiff; 5798 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5799 zExp = bExp; 5800 } 5801 else { 5802 if ( aExp == 0x7FFF ) { 5803 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5804 return propagateFloatx80NaN(a, b, status); 5805 } 5806 return a; 5807 } 5808 zSig1 = 0; 5809 zSig0 = aSig + bSig; 5810 if ( aExp == 0 ) { 5811 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5812 goto roundAndPack; 5813 } 5814 zExp = aExp; 5815 goto shiftRight1; 5816 } 5817 zSig0 = aSig + bSig; 5818 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5819 shiftRight1: 5820 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5821 zSig0 |= LIT64( 0x8000000000000000 ); 5822 ++zExp; 5823 roundAndPack: 5824 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5825 zSign, zExp, zSig0, zSig1, status); 5826 } 5827 5828 /*---------------------------------------------------------------------------- 5829 | Returns the result of subtracting the absolute 
values of the extended 5830 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5831 | difference is negated before being returned. `zSign' is ignored if the 5832 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5833 | Standard for Binary Floating-Point Arithmetic. 5834 *----------------------------------------------------------------------------*/ 5835 5836 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5837 float_status *status) 5838 { 5839 int32_t aExp, bExp, zExp; 5840 uint64_t aSig, bSig, zSig0, zSig1; 5841 int32_t expDiff; 5842 5843 aSig = extractFloatx80Frac( a ); 5844 aExp = extractFloatx80Exp( a ); 5845 bSig = extractFloatx80Frac( b ); 5846 bExp = extractFloatx80Exp( b ); 5847 expDiff = aExp - bExp; 5848 if ( 0 < expDiff ) goto aExpBigger; 5849 if ( expDiff < 0 ) goto bExpBigger; 5850 if ( aExp == 0x7FFF ) { 5851 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5852 return propagateFloatx80NaN(a, b, status); 5853 } 5854 float_raise(float_flag_invalid, status); 5855 return floatx80_default_nan(status); 5856 } 5857 if ( aExp == 0 ) { 5858 aExp = 1; 5859 bExp = 1; 5860 } 5861 zSig1 = 0; 5862 if ( bSig < aSig ) goto aBigger; 5863 if ( aSig < bSig ) goto bBigger; 5864 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5865 bExpBigger: 5866 if ( bExp == 0x7FFF ) { 5867 if ((uint64_t)(bSig << 1)) { 5868 return propagateFloatx80NaN(a, b, status); 5869 } 5870 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5871 floatx80_infinity_low); 5872 } 5873 if ( aExp == 0 ) ++expDiff; 5874 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5875 bBigger: 5876 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5877 zExp = bExp; 5878 zSign ^= 1; 5879 goto normalizeRoundAndPack; 5880 aExpBigger: 5881 if ( aExp == 0x7FFF ) { 5882 if ((uint64_t)(aSig << 1)) { 5883 return propagateFloatx80NaN(a, b, status); 5884 } 5885 return a; 5886 } 5887 if ( bExp == 0 ) --expDiff; 5888 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5889 aBigger: 5890 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5891 zExp = aExp; 5892 normalizeRoundAndPack: 5893 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5894 zSign, zExp, zSig0, zSig1, status); 5895 } 5896 5897 /*---------------------------------------------------------------------------- 5898 | Returns the result of adding the extended double-precision floating-point 5899 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5900 | Standard for Binary Floating-Point Arithmetic. 5901 *----------------------------------------------------------------------------*/ 5902 5903 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5904 { 5905 flag aSign, bSign; 5906 5907 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5908 float_raise(float_flag_invalid, status); 5909 return floatx80_default_nan(status); 5910 } 5911 aSign = extractFloatx80Sign( a ); 5912 bSign = extractFloatx80Sign( b ); 5913 if ( aSign == bSign ) { 5914 return addFloatx80Sigs(a, b, aSign, status); 5915 } 5916 else { 5917 return subFloatx80Sigs(a, b, aSign, status); 5918 } 5919 5920 } 5921 5922 /*---------------------------------------------------------------------------- 5923 | Returns the result of subtracting the extended double-precision floating- 5924 | point values `a' and `b'. The operation is performed according to the 5925 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5926 *----------------------------------------------------------------------------*/ 5927 5928 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5929 { 5930 flag aSign, bSign; 5931 5932 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5933 float_raise(float_flag_invalid, status); 5934 return floatx80_default_nan(status); 5935 } 5936 aSign = extractFloatx80Sign( a ); 5937 bSign = extractFloatx80Sign( b ); 5938 if ( aSign == bSign ) { 5939 return subFloatx80Sigs(a, b, aSign, status); 5940 } 5941 else { 5942 return addFloatx80Sigs(a, b, aSign, status); 5943 } 5944 5945 } 5946 5947 /*---------------------------------------------------------------------------- 5948 | Returns the result of multiplying the extended double-precision floating- 5949 | point values `a' and `b'. The operation is performed according to the 5950 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5951 *----------------------------------------------------------------------------*/ 5952 5953 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5954 { 5955 flag aSign, bSign, zSign; 5956 int32_t aExp, bExp, zExp; 5957 uint64_t aSig, bSig, zSig0, zSig1; 5958 5959 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5960 float_raise(float_flag_invalid, status); 5961 return floatx80_default_nan(status); 5962 } 5963 aSig = extractFloatx80Frac( a ); 5964 aExp = extractFloatx80Exp( a ); 5965 aSign = extractFloatx80Sign( a ); 5966 bSig = extractFloatx80Frac( b ); 5967 bExp = extractFloatx80Exp( b ); 5968 bSign = extractFloatx80Sign( b ); 5969 zSign = aSign ^ bSign; 5970 if ( aExp == 0x7FFF ) { 5971 if ( (uint64_t) ( aSig<<1 ) 5972 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5973 return propagateFloatx80NaN(a, b, status); 5974 } 5975 if ( ( bExp | bSig ) == 0 ) goto invalid; 5976 return packFloatx80(zSign, floatx80_infinity_high, 5977 floatx80_infinity_low); 5978 } 5979 if ( bExp == 0x7FFF ) { 5980 if ((uint64_t)(bSig << 
1)) { 5981 return propagateFloatx80NaN(a, b, status); 5982 } 5983 if ( ( aExp | aSig ) == 0 ) { 5984 invalid: 5985 float_raise(float_flag_invalid, status); 5986 return floatx80_default_nan(status); 5987 } 5988 return packFloatx80(zSign, floatx80_infinity_high, 5989 floatx80_infinity_low); 5990 } 5991 if ( aExp == 0 ) { 5992 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5993 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5994 } 5995 if ( bExp == 0 ) { 5996 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5997 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5998 } 5999 zExp = aExp + bExp - 0x3FFE; 6000 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6001 if ( 0 < (int64_t) zSig0 ) { 6002 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6003 --zExp; 6004 } 6005 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6006 zSign, zExp, zSig0, zSig1, status); 6007 } 6008 6009 /*---------------------------------------------------------------------------- 6010 | Returns the result of dividing the extended double-precision floating-point 6011 | value `a' by the corresponding value `b'. The operation is performed 6012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6013 *----------------------------------------------------------------------------*/ 6014 6015 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6016 { 6017 flag aSign, bSign, zSign; 6018 int32_t aExp, bExp, zExp; 6019 uint64_t aSig, bSig, zSig0, zSig1; 6020 uint64_t rem0, rem1, rem2, term0, term1, term2; 6021 6022 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6023 float_raise(float_flag_invalid, status); 6024 return floatx80_default_nan(status); 6025 } 6026 aSig = extractFloatx80Frac( a ); 6027 aExp = extractFloatx80Exp( a ); 6028 aSign = extractFloatx80Sign( a ); 6029 bSig = extractFloatx80Frac( b ); 6030 bExp = extractFloatx80Exp( b ); 6031 bSign = extractFloatx80Sign( b ); 6032 zSign = aSign ^ bSign; 6033 if ( aExp == 0x7FFF ) { 6034 if ((uint64_t)(aSig << 1)) { 6035 return propagateFloatx80NaN(a, b, status); 6036 } 6037 if ( bExp == 0x7FFF ) { 6038 if ((uint64_t)(bSig << 1)) { 6039 return propagateFloatx80NaN(a, b, status); 6040 } 6041 goto invalid; 6042 } 6043 return packFloatx80(zSign, floatx80_infinity_high, 6044 floatx80_infinity_low); 6045 } 6046 if ( bExp == 0x7FFF ) { 6047 if ((uint64_t)(bSig << 1)) { 6048 return propagateFloatx80NaN(a, b, status); 6049 } 6050 return packFloatx80( zSign, 0, 0 ); 6051 } 6052 if ( bExp == 0 ) { 6053 if ( bSig == 0 ) { 6054 if ( ( aExp | aSig ) == 0 ) { 6055 invalid: 6056 float_raise(float_flag_invalid, status); 6057 return floatx80_default_nan(status); 6058 } 6059 float_raise(float_flag_divbyzero, status); 6060 return packFloatx80(zSign, floatx80_infinity_high, 6061 floatx80_infinity_low); 6062 } 6063 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6064 } 6065 if ( aExp == 0 ) { 6066 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6067 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6068 } 6069 zExp = aExp - bExp + 0x3FFE; 6070 rem1 = 0; 6071 if ( bSig <= aSig ) { 6072 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6073 ++zExp; 6074 } 6075 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 6076 mul64To128( bSig, zSig0, &term0, &term1 ); 6077 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6078 while ( (int64_t) rem0 < 0 ) { 6079 --zSig0; 6080 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6081 } 6082 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6083 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6084 mul64To128( bSig, zSig1, &term1, &term2 ); 6085 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6086 while ( (int64_t) rem1 < 0 ) { 6087 --zSig1; 6088 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6089 } 6090 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6091 } 6092 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6093 zSign, zExp, zSig0, zSig1, status); 6094 } 6095 6096 /*---------------------------------------------------------------------------- 6097 | Returns the remainder of the extended double-precision floating-point value 6098 | `a' with respect to the corresponding value `b'. The operation is performed 6099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6100 *----------------------------------------------------------------------------*/ 6101 6102 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6103 { 6104 flag aSign, zSign; 6105 int32_t aExp, bExp, expDiff; 6106 uint64_t aSig0, aSig1, bSig; 6107 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6108 6109 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6110 float_raise(float_flag_invalid, status); 6111 return floatx80_default_nan(status); 6112 } 6113 aSig0 = extractFloatx80Frac( a ); 6114 aExp = extractFloatx80Exp( a ); 6115 aSign = extractFloatx80Sign( a ); 6116 bSig = extractFloatx80Frac( b ); 6117 bExp = extractFloatx80Exp( b ); 6118 if ( aExp == 0x7FFF ) { 6119 if ( (uint64_t) ( aSig0<<1 ) 6120 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6121 return propagateFloatx80NaN(a, b, status); 6122 } 6123 goto invalid; 6124 } 6125 if ( bExp == 0x7FFF ) { 6126 if ((uint64_t)(bSig << 1)) { 6127 return propagateFloatx80NaN(a, b, status); 6128 } 6129 return a; 6130 } 6131 if ( bExp == 0 ) { 6132 if ( bSig == 0 ) { 6133 invalid: 6134 float_raise(float_flag_invalid, status); 6135 return floatx80_default_nan(status); 6136 } 6137 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6138 } 6139 if ( aExp == 0 ) { 6140 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 6141 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6142 } 6143 bSig |= LIT64( 0x8000000000000000 ); 6144 zSign = aSign; 6145 expDiff = aExp - bExp; 6146 aSig1 = 0; 6147 if ( expDiff < 0 ) { 6148 if ( expDiff < -1 ) return a; 6149 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6150 expDiff = 0; 6151 } 6152 q = ( bSig <= aSig0 ); 6153 if ( q ) aSig0 -= bSig; 6154 expDiff -= 64; 6155 while ( 0 < expDiff ) { 6156 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6157 q = ( 2 < q ) ? 
q - 2 : 0; 6158 mul64To128( bSig, q, &term0, &term1 ); 6159 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6160 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6161 expDiff -= 62; 6162 } 6163 expDiff += 64; 6164 if ( 0 < expDiff ) { 6165 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6166 q = ( 2 < q ) ? q - 2 : 0; 6167 q >>= 64 - expDiff; 6168 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6169 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6170 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6171 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6172 ++q; 6173 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6174 } 6175 } 6176 else { 6177 term1 = 0; 6178 term0 = bSig; 6179 } 6180 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6181 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6182 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6183 && ( q & 1 ) ) 6184 ) { 6185 aSig0 = alternateASig0; 6186 aSig1 = alternateASig1; 6187 zSign = ! zSign; 6188 } 6189 return 6190 normalizeRoundAndPackFloatx80( 6191 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6192 6193 } 6194 6195 /*---------------------------------------------------------------------------- 6196 | Returns the square root of the extended double-precision floating-point 6197 | value `a'. The operation is performed according to the IEC/IEEE Standard 6198 | for Binary Floating-Point Arithmetic. 
6199 *----------------------------------------------------------------------------*/ 6200 6201 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6202 { 6203 flag aSign; 6204 int32_t aExp, zExp; 6205 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6206 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6207 6208 if (floatx80_invalid_encoding(a)) { 6209 float_raise(float_flag_invalid, status); 6210 return floatx80_default_nan(status); 6211 } 6212 aSig0 = extractFloatx80Frac( a ); 6213 aExp = extractFloatx80Exp( a ); 6214 aSign = extractFloatx80Sign( a ); 6215 if ( aExp == 0x7FFF ) { 6216 if ((uint64_t)(aSig0 << 1)) { 6217 return propagateFloatx80NaN(a, a, status); 6218 } 6219 if ( ! aSign ) return a; 6220 goto invalid; 6221 } 6222 if ( aSign ) { 6223 if ( ( aExp | aSig0 ) == 0 ) return a; 6224 invalid: 6225 float_raise(float_flag_invalid, status); 6226 return floatx80_default_nan(status); 6227 } 6228 if ( aExp == 0 ) { 6229 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6230 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6231 } 6232 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6233 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6234 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6235 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6236 doubleZSig0 = zSig0<<1; 6237 mul64To128( zSig0, zSig0, &term0, &term1 ); 6238 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6239 while ( (int64_t) rem0 < 0 ) { 6240 --zSig0; 6241 doubleZSig0 -= 2; 6242 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6243 } 6244 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6245 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 6246 if ( zSig1 == 0 ) zSig1 = 1; 6247 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6248 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6249 mul64To128( zSig1, zSig1, &term2, &term3 ); 6250 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6251 while ( (int64_t) rem1 < 0 ) { 
6252 --zSig1; 6253 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6254 term3 |= 1; 6255 term2 |= doubleZSig0; 6256 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6257 } 6258 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6259 } 6260 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6261 zSig0 |= doubleZSig0; 6262 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6263 0, zExp, zSig0, zSig1, status); 6264 } 6265 6266 /*---------------------------------------------------------------------------- 6267 | Returns 1 if the extended double-precision floating-point value `a' is equal 6268 | to the corresponding value `b', and 0 otherwise. The invalid exception is 6269 | raised if either operand is a NaN. Otherwise, the comparison is performed 6270 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6271 *----------------------------------------------------------------------------*/ 6272 6273 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 6274 { 6275 6276 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6277 || (extractFloatx80Exp(a) == 0x7FFF 6278 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6279 || (extractFloatx80Exp(b) == 0x7FFF 6280 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6281 ) { 6282 float_raise(float_flag_invalid, status); 6283 return 0; 6284 } 6285 return 6286 ( a.low == b.low ) 6287 && ( ( a.high == b.high ) 6288 || ( ( a.low == 0 ) 6289 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6290 ); 6291 6292 } 6293 6294 /*---------------------------------------------------------------------------- 6295 | Returns 1 if the extended double-precision floating-point value `a' is 6296 | less than or equal to the corresponding value `b', and 0 otherwise. The 6297 | invalid exception is raised if either operand is a NaN. The comparison is 6298 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6299 | Arithmetic. 
6300 *----------------------------------------------------------------------------*/ 6301 6302 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 6303 { 6304 flag aSign, bSign; 6305 6306 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6307 || (extractFloatx80Exp(a) == 0x7FFF 6308 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6309 || (extractFloatx80Exp(b) == 0x7FFF 6310 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6311 ) { 6312 float_raise(float_flag_invalid, status); 6313 return 0; 6314 } 6315 aSign = extractFloatx80Sign( a ); 6316 bSign = extractFloatx80Sign( b ); 6317 if ( aSign != bSign ) { 6318 return 6319 aSign 6320 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6321 == 0 ); 6322 } 6323 return 6324 aSign ? le128( b.high, b.low, a.high, a.low ) 6325 : le128( a.high, a.low, b.high, b.low ); 6326 6327 } 6328 6329 /*---------------------------------------------------------------------------- 6330 | Returns 1 if the extended double-precision floating-point value `a' is 6331 | less than the corresponding value `b', and 0 otherwise. The invalid 6332 | exception is raised if either operand is a NaN. The comparison is performed 6333 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6334 *----------------------------------------------------------------------------*/ 6335 6336 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 6337 { 6338 flag aSign, bSign; 6339 6340 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6341 || (extractFloatx80Exp(a) == 0x7FFF 6342 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6343 || (extractFloatx80Exp(b) == 0x7FFF 6344 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6345 ) { 6346 float_raise(float_flag_invalid, status); 6347 return 0; 6348 } 6349 aSign = extractFloatx80Sign( a ); 6350 bSign = extractFloatx80Sign( b ); 6351 if ( aSign != bSign ) { 6352 return 6353 aSign 6354 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6355 != 0 ); 6356 } 6357 return 6358 aSign ? lt128( b.high, b.low, a.high, a.low ) 6359 : lt128( a.high, a.low, b.high, b.low ); 6360 6361 } 6362 6363 /*---------------------------------------------------------------------------- 6364 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6365 | cannot be compared, and 0 otherwise. The invalid exception is raised if 6366 | either operand is a NaN. The comparison is performed according to the 6367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6368 *----------------------------------------------------------------------------*/ 6369 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 6370 { 6371 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 6372 || (extractFloatx80Exp(a) == 0x7FFF 6373 && (uint64_t) (extractFloatx80Frac(a) << 1)) 6374 || (extractFloatx80Exp(b) == 0x7FFF 6375 && (uint64_t) (extractFloatx80Frac(b) << 1)) 6376 ) { 6377 float_raise(float_flag_invalid, status); 6378 return 1; 6379 } 6380 return 0; 6381 } 6382 6383 /*---------------------------------------------------------------------------- 6384 | Returns 1 if the extended double-precision floating-point value `a' is 6385 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6386 | cause an exception. The comparison is performed according to the IEC/IEEE 6387 | Standard for Binary Floating-Point Arithmetic. 6388 *----------------------------------------------------------------------------*/ 6389 6390 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 6391 { 6392 6393 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6394 float_raise(float_flag_invalid, status); 6395 return 0; 6396 } 6397 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6398 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6399 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6400 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6401 ) { 6402 if (floatx80_is_signaling_nan(a, status) 6403 || floatx80_is_signaling_nan(b, status)) { 6404 float_raise(float_flag_invalid, status); 6405 } 6406 return 0; 6407 } 6408 return 6409 ( a.low == b.low ) 6410 && ( ( a.high == b.high ) 6411 || ( ( a.low == 0 ) 6412 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6413 ); 6414 6415 } 6416 6417 /*---------------------------------------------------------------------------- 6418 | Returns 1 if the extended double-precision floating-point value `a' is less 6419 | than or equal to the corresponding value `b', and 0 
otherwise.  Quiet NaNs
| do not cause an exception.  Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign;

    /* Invalid encodings raise invalid and compare as false. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (((extractFloatx80Exp(a) == 0x7FFF)
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || ((extractFloatx80Exp(b) == 0x7FFF)
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* A NaN is present: only a signaling NaN raises invalid. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }
    aSign = extractFloatx80Sign(a);
    bSign = extractFloatx80Sign(b);
    if (aSign != bSign) {
        /* Signs differ: true if a is negative, or if both are zero. */
        return aSign
            || ((((uint16_t)((a.high | b.high) << 1)) | a.low | b.low)
                == 0);
    }
    /* Same sign: compare the raw 128-bit patterns, operands swapped when
     * both are negative. */
    return aSign ? le128(b.high, b.low, a.high, a.low)
                 : le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
| an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6462 *----------------------------------------------------------------------------*/ 6463 6464 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 6465 { 6466 flag aSign, bSign; 6467 6468 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6469 float_raise(float_flag_invalid, status); 6470 return 0; 6471 } 6472 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6473 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6474 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6475 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6476 ) { 6477 if (floatx80_is_signaling_nan(a, status) 6478 || floatx80_is_signaling_nan(b, status)) { 6479 float_raise(float_flag_invalid, status); 6480 } 6481 return 0; 6482 } 6483 aSign = extractFloatx80Sign( a ); 6484 bSign = extractFloatx80Sign( b ); 6485 if ( aSign != bSign ) { 6486 return 6487 aSign 6488 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6489 != 0 ); 6490 } 6491 return 6492 aSign ? lt128( b.high, b.low, a.high, a.low ) 6493 : lt128( a.high, a.low, b.high, b.low ); 6494 6495 } 6496 6497 /*---------------------------------------------------------------------------- 6498 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6499 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6500 | The comparison is performed according to the IEC/IEEE Standard for Binary 6501 | Floating-Point Arithmetic. 
6502 *----------------------------------------------------------------------------*/ 6503 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6504 { 6505 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6506 float_raise(float_flag_invalid, status); 6507 return 1; 6508 } 6509 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6510 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6511 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6512 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6513 ) { 6514 if (floatx80_is_signaling_nan(a, status) 6515 || floatx80_is_signaling_nan(b, status)) { 6516 float_raise(float_flag_invalid, status); 6517 } 6518 return 1; 6519 } 6520 return 0; 6521 } 6522 6523 /*---------------------------------------------------------------------------- 6524 | Returns the result of converting the quadruple-precision floating-point 6525 | value `a' to the 32-bit two's complement integer format. The conversion 6526 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6527 | Arithmetic---which means in particular that the conversion is rounded 6528 | according to the current rounding mode. If `a' is a NaN, the largest 6529 | positive integer is returned. Otherwise, if the conversion overflows, the 6530 | largest integer with the same sign as `a' is returned. 
6531 *----------------------------------------------------------------------------*/ 6532 6533 int32_t float128_to_int32(float128 a, float_status *status) 6534 { 6535 flag aSign; 6536 int32_t aExp, shiftCount; 6537 uint64_t aSig0, aSig1; 6538 6539 aSig1 = extractFloat128Frac1( a ); 6540 aSig0 = extractFloat128Frac0( a ); 6541 aExp = extractFloat128Exp( a ); 6542 aSign = extractFloat128Sign( a ); 6543 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6544 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6545 aSig0 |= ( aSig1 != 0 ); 6546 shiftCount = 0x4028 - aExp; 6547 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6548 return roundAndPackInt32(aSign, aSig0, status); 6549 6550 } 6551 6552 /*---------------------------------------------------------------------------- 6553 | Returns the result of converting the quadruple-precision floating-point 6554 | value `a' to the 32-bit two's complement integer format. The conversion 6555 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6556 | Arithmetic, except that the conversion is always rounded toward zero. If 6557 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6558 | conversion overflows, the largest integer with the same sign as `a' is 6559 | returned. 
6560 *----------------------------------------------------------------------------*/ 6561 6562 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6563 { 6564 flag aSign; 6565 int32_t aExp, shiftCount; 6566 uint64_t aSig0, aSig1, savedASig; 6567 int32_t z; 6568 6569 aSig1 = extractFloat128Frac1( a ); 6570 aSig0 = extractFloat128Frac0( a ); 6571 aExp = extractFloat128Exp( a ); 6572 aSign = extractFloat128Sign( a ); 6573 aSig0 |= ( aSig1 != 0 ); 6574 if ( 0x401E < aExp ) { 6575 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6576 goto invalid; 6577 } 6578 else if ( aExp < 0x3FFF ) { 6579 if (aExp || aSig0) { 6580 status->float_exception_flags |= float_flag_inexact; 6581 } 6582 return 0; 6583 } 6584 aSig0 |= LIT64( 0x0001000000000000 ); 6585 shiftCount = 0x402F - aExp; 6586 savedASig = aSig0; 6587 aSig0 >>= shiftCount; 6588 z = aSig0; 6589 if ( aSign ) z = - z; 6590 if ( ( z < 0 ) ^ aSign ) { 6591 invalid: 6592 float_raise(float_flag_invalid, status); 6593 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6594 } 6595 if ( ( aSig0<<shiftCount ) != savedASig ) { 6596 status->float_exception_flags |= float_flag_inexact; 6597 } 6598 return z; 6599 6600 } 6601 6602 /*---------------------------------------------------------------------------- 6603 | Returns the result of converting the quadruple-precision floating-point 6604 | value `a' to the 64-bit two's complement integer format. The conversion 6605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6606 | Arithmetic---which means in particular that the conversion is rounded 6607 | according to the current rounding mode. If `a' is a NaN, the largest 6608 | positive integer is returned. Otherwise, if the conversion overflows, the 6609 | largest integer with the same sign as `a' is returned. 
6610 *----------------------------------------------------------------------------*/ 6611 6612 int64_t float128_to_int64(float128 a, float_status *status) 6613 { 6614 flag aSign; 6615 int32_t aExp, shiftCount; 6616 uint64_t aSig0, aSig1; 6617 6618 aSig1 = extractFloat128Frac1( a ); 6619 aSig0 = extractFloat128Frac0( a ); 6620 aExp = extractFloat128Exp( a ); 6621 aSign = extractFloat128Sign( a ); 6622 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6623 shiftCount = 0x402F - aExp; 6624 if ( shiftCount <= 0 ) { 6625 if ( 0x403E < aExp ) { 6626 float_raise(float_flag_invalid, status); 6627 if ( ! aSign 6628 || ( ( aExp == 0x7FFF ) 6629 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6630 ) 6631 ) { 6632 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6633 } 6634 return (int64_t) LIT64( 0x8000000000000000 ); 6635 } 6636 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6637 } 6638 else { 6639 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6640 } 6641 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6642 6643 } 6644 6645 /*---------------------------------------------------------------------------- 6646 | Returns the result of converting the quadruple-precision floating-point 6647 | value `a' to the 64-bit two's complement integer format. The conversion 6648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6649 | Arithmetic, except that the conversion is always rounded toward zero. 6650 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6651 | the conversion overflows, the largest integer with the same sign as `a' is 6652 | returned. 
6653 *----------------------------------------------------------------------------*/ 6654 6655 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6656 { 6657 flag aSign; 6658 int32_t aExp, shiftCount; 6659 uint64_t aSig0, aSig1; 6660 int64_t z; 6661 6662 aSig1 = extractFloat128Frac1( a ); 6663 aSig0 = extractFloat128Frac0( a ); 6664 aExp = extractFloat128Exp( a ); 6665 aSign = extractFloat128Sign( a ); 6666 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6667 shiftCount = aExp - 0x402F; 6668 if ( 0 < shiftCount ) { 6669 if ( 0x403E <= aExp ) { 6670 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6671 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6672 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6673 if (aSig1) { 6674 status->float_exception_flags |= float_flag_inexact; 6675 } 6676 } 6677 else { 6678 float_raise(float_flag_invalid, status); 6679 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6680 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6681 } 6682 } 6683 return (int64_t) LIT64( 0x8000000000000000 ); 6684 } 6685 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6686 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6687 status->float_exception_flags |= float_flag_inexact; 6688 } 6689 } 6690 else { 6691 if ( aExp < 0x3FFF ) { 6692 if ( aExp | aSig0 | aSig1 ) { 6693 status->float_exception_flags |= float_flag_inexact; 6694 } 6695 return 0; 6696 } 6697 z = aSig0>>( - shiftCount ); 6698 if ( aSig1 6699 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6700 status->float_exception_flags |= float_flag_inexact; 6701 } 6702 } 6703 if ( aSign ) z = - z; 6704 return z; 6705 6706 } 6707 6708 /*---------------------------------------------------------------------------- 6709 | Returns the result of converting the quadruple-precision floating-point value 6710 | `a' to the 64-bit unsigned integer format. 
The conversion is 6711 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6712 | Arithmetic---which means in particular that the conversion is rounded 6713 | according to the current rounding mode. If `a' is a NaN, the largest 6714 | positive integer is returned. If the conversion overflows, the 6715 | largest unsigned integer is returned. If 'a' is negative, the value is 6716 | rounded and zero is returned; negative values that do not round to zero 6717 | will raise the inexact exception. 6718 *----------------------------------------------------------------------------*/ 6719 6720 uint64_t float128_to_uint64(float128 a, float_status *status) 6721 { 6722 flag aSign; 6723 int aExp; 6724 int shiftCount; 6725 uint64_t aSig0, aSig1; 6726 6727 aSig0 = extractFloat128Frac0(a); 6728 aSig1 = extractFloat128Frac1(a); 6729 aExp = extractFloat128Exp(a); 6730 aSign = extractFloat128Sign(a); 6731 if (aSign && (aExp > 0x3FFE)) { 6732 float_raise(float_flag_invalid, status); 6733 if (float128_is_any_nan(a)) { 6734 return LIT64(0xFFFFFFFFFFFFFFFF); 6735 } else { 6736 return 0; 6737 } 6738 } 6739 if (aExp) { 6740 aSig0 |= LIT64(0x0001000000000000); 6741 } 6742 shiftCount = 0x402F - aExp; 6743 if (shiftCount <= 0) { 6744 if (0x403E < aExp) { 6745 float_raise(float_flag_invalid, status); 6746 return LIT64(0xFFFFFFFFFFFFFFFF); 6747 } 6748 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6749 } else { 6750 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6751 } 6752 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6753 } 6754 6755 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6756 { 6757 uint64_t v; 6758 signed char current_rounding_mode = status->float_rounding_mode; 6759 6760 set_float_rounding_mode(float_round_to_zero, status); 6761 v = float128_to_uint64(a, status); 6762 set_float_rounding_mode(current_rounding_mode, status); 6763 6764 return v; 6765 } 6766 6767 
/*---------------------------------------------------------------------------- 6768 | Returns the result of converting the quadruple-precision floating-point 6769 | value `a' to the 32-bit unsigned integer format. The conversion 6770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6771 | Arithmetic except that the conversion is always rounded toward zero. 6772 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6773 | if the conversion overflows, the largest unsigned integer is returned. 6774 | If 'a' is negative, the value is rounded and zero is returned; negative 6775 | values that do not round to zero will raise the inexact exception. 6776 *----------------------------------------------------------------------------*/ 6777 6778 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6779 { 6780 uint64_t v; 6781 uint32_t res; 6782 int old_exc_flags = get_float_exception_flags(status); 6783 6784 v = float128_to_uint64_round_to_zero(a, status); 6785 if (v > 0xffffffff) { 6786 res = 0xffffffff; 6787 } else { 6788 return v; 6789 } 6790 set_float_exception_flags(old_exc_flags, status); 6791 float_raise(float_flag_invalid, status); 6792 return res; 6793 } 6794 6795 /*---------------------------------------------------------------------------- 6796 | Returns the result of converting the quadruple-precision floating-point 6797 | value `a' to the single-precision floating-point format. The conversion 6798 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6799 | Arithmetic. 
6800 *----------------------------------------------------------------------------*/ 6801 6802 float32 float128_to_float32(float128 a, float_status *status) 6803 { 6804 flag aSign; 6805 int32_t aExp; 6806 uint64_t aSig0, aSig1; 6807 uint32_t zSig; 6808 6809 aSig1 = extractFloat128Frac1( a ); 6810 aSig0 = extractFloat128Frac0( a ); 6811 aExp = extractFloat128Exp( a ); 6812 aSign = extractFloat128Sign( a ); 6813 if ( aExp == 0x7FFF ) { 6814 if ( aSig0 | aSig1 ) { 6815 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6816 } 6817 return packFloat32( aSign, 0xFF, 0 ); 6818 } 6819 aSig0 |= ( aSig1 != 0 ); 6820 shift64RightJamming( aSig0, 18, &aSig0 ); 6821 zSig = aSig0; 6822 if ( aExp || zSig ) { 6823 zSig |= 0x40000000; 6824 aExp -= 0x3F81; 6825 } 6826 return roundAndPackFloat32(aSign, aExp, zSig, status); 6827 6828 } 6829 6830 /*---------------------------------------------------------------------------- 6831 | Returns the result of converting the quadruple-precision floating-point 6832 | value `a' to the double-precision floating-point format. The conversion 6833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6834 | Arithmetic. 
6835 *----------------------------------------------------------------------------*/ 6836 6837 float64 float128_to_float64(float128 a, float_status *status) 6838 { 6839 flag aSign; 6840 int32_t aExp; 6841 uint64_t aSig0, aSig1; 6842 6843 aSig1 = extractFloat128Frac1( a ); 6844 aSig0 = extractFloat128Frac0( a ); 6845 aExp = extractFloat128Exp( a ); 6846 aSign = extractFloat128Sign( a ); 6847 if ( aExp == 0x7FFF ) { 6848 if ( aSig0 | aSig1 ) { 6849 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6850 } 6851 return packFloat64( aSign, 0x7FF, 0 ); 6852 } 6853 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6854 aSig0 |= ( aSig1 != 0 ); 6855 if ( aExp || aSig0 ) { 6856 aSig0 |= LIT64( 0x4000000000000000 ); 6857 aExp -= 0x3C01; 6858 } 6859 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6860 6861 } 6862 6863 /*---------------------------------------------------------------------------- 6864 | Returns the result of converting the quadruple-precision floating-point 6865 | value `a' to the extended double-precision floating-point format. The 6866 | conversion is performed according to the IEC/IEEE Standard for Binary 6867 | Floating-Point Arithmetic. 
6868 *----------------------------------------------------------------------------*/ 6869 6870 floatx80 float128_to_floatx80(float128 a, float_status *status) 6871 { 6872 flag aSign; 6873 int32_t aExp; 6874 uint64_t aSig0, aSig1; 6875 6876 aSig1 = extractFloat128Frac1( a ); 6877 aSig0 = extractFloat128Frac0( a ); 6878 aExp = extractFloat128Exp( a ); 6879 aSign = extractFloat128Sign( a ); 6880 if ( aExp == 0x7FFF ) { 6881 if ( aSig0 | aSig1 ) { 6882 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6883 } 6884 return packFloatx80(aSign, floatx80_infinity_high, 6885 floatx80_infinity_low); 6886 } 6887 if ( aExp == 0 ) { 6888 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6889 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6890 } 6891 else { 6892 aSig0 |= LIT64( 0x0001000000000000 ); 6893 } 6894 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6895 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6896 6897 } 6898 6899 /*---------------------------------------------------------------------------- 6900 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6901 | returns the result as a quadruple-precision floating-point value. The 6902 | operation is performed according to the IEC/IEEE Standard for Binary 6903 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Implementation outline (three ranges of the biased exponent aExp):
 *   aExp >= 0x406F           : `a' is returned unchanged, except that NaNs
 *                              go through propagateFloat128NaN().
 *   0x402F <= aExp < 0x406F  : the rounding position lies in a.low.
 *   0x3FFF <= aExp < 0x402F  : the rounding position lies in a.high and the
 *                              whole of a.low is discarded.
 *   aExp < 0x3FFF            : the result is a signed zero or
 *                              packFloat128(sign, 0x3FFF, 0, 0).
 * The inexact flag is OR-ed directly into status->float_exception_flags
 * whenever the result differs from the input.  An unrecognised rounding
 * mode in either of the two masking switches calls abort().
 */
float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * The shift is done in two steps so that a total distance of 64
         * (aExp == 0x402F) produces lastBitMask == 0 instead of an
         * out-of-range shift.  In that case roundBitsMask is all ones and
         * the last kept bit is bit 0 of z.high, which the `else' arms
         * below handle explicitly.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last kept bit; on an exact tie clear it. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* Top bit of z.low is the half bit; the rest decide a tie. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only non-negative values are bumped before masking. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are bumped before masking. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Drop every bit below the rounding position. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Either zero is returned as is, without raising any flag. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * No default case here: float_round_to_zero and any other mode
             * leave the switch and produce a signed zero below.
             */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a zero fraction is the tie case. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        /* All of a.low is below the rounding position and is discarded. */
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie only if a.low contributed nothing as well. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A non-zero a.low counts as one extra discarded unit. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                /* A non-zero a.low counts as one extra discarded unit. */
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change relative to the input means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b to it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent field: no implicit bit, one less shift. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Bits shifted out of b are collected in zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            /* Zero exponent field: no implicit bit, one less shift. */
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Bits shifted out of a are collected in zSig2. */
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /*
             * Both exponent fields are zero: the raw fraction sum is packed
             * with exponent 0, or flushed to a signed zero (raising
             * output_denormal when the sum is non-zero) in flush_to_zero
             * mode.
             */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* NOTE(review): appears to stand for both implicit bits added together. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /*
     * Reached from both unequal-exponent branches.
     * NOTE(review): this OR appears to supply the implicit bit of the
     * operand with the larger exponent; since the sum is symmetric, putting
     * it into aSig0 would serve either branch -- confirm.
     */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry into bit 49: round with the decremented exponent. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up by 14 bits before any alignment shift. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus infinity: invalid, default NaN. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Compare the magnitudes word by word to decide the operand order. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Equal magnitudes: zero, negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b is infinite: the result is infinity with the sign flipped. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero exponent field: no implicit bit, one less shift. */
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* b has the larger magnitude: compute b - a and flip the sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Zero exponent field: no implicit bit, one less shift. */
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    /* a has the larger magnitude: compute a - b, sign unchanged. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* NOTE(review): the "- 14" presumably undoes the 14-bit pre-shift above. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
7220 *----------------------------------------------------------------------------*/ 7221 7222 float128 float128_add(float128 a, float128 b, float_status *status) 7223 { 7224 flag aSign, bSign; 7225 7226 aSign = extractFloat128Sign( a ); 7227 bSign = extractFloat128Sign( b ); 7228 if ( aSign == bSign ) { 7229 return addFloat128Sigs(a, b, aSign, status); 7230 } 7231 else { 7232 return subFloat128Sigs(a, b, aSign, status); 7233 } 7234 7235 } 7236 7237 /*---------------------------------------------------------------------------- 7238 | Returns the result of subtracting the quadruple-precision floating-point 7239 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7240 | Standard for Binary Floating-Point Arithmetic. 7241 *----------------------------------------------------------------------------*/ 7242 7243 float128 float128_sub(float128 a, float128 b, float_status *status) 7244 { 7245 flag aSign, bSign; 7246 7247 aSign = extractFloat128Sign( a ); 7248 bSign = extractFloat128Sign( b ); 7249 if ( aSign == bSign ) { 7250 return subFloat128Sigs(a, b, aSign, status); 7251 } 7252 else { 7253 return addFloat128Sigs(a, b, aSign, status); 7254 } 7255 7256 } 7257 7258 /*---------------------------------------------------------------------------- 7259 | Returns the result of multiplying the quadruple-precision floating-point 7260 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7261 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The sign of a product is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a is infinite: infinity times zero is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b is infinite: zero times infinity is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero operand gives a signed zero; otherwise normalize. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Only a receives its implicit bit here; b is shifted without one. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    /*
     * NOTE(review): adding a's significand to the upper product words
     * presumably accounts for b's implicit bit, which was left out of the
     * multiplication -- confirm.
     */
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* Product reached bit 49: shift right once and bump the exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'. The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The sign of a quotient is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Infinity divided by infinity is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite value divided by infinity gives a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Divisor is zero: 0/0 is invalid, anything else is divbyzero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Add the implicit bits and move both significands up by 15 bits. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* Make the dividend smaller than the divisor before estimating. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /*
     * Low quotient word: the estimate is only corrected, and the sticky bit
     * only computed, when its low 14 bits are <= 4.
     * NOTE(review): presumably the only case in which the estimate's error
     * could change the rounded result -- confirm against
     * estimateDiv128To64's error bound.
     */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* A non-zero remainder becomes the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a is infinite and b is not a NaN: invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b is infinite: a is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Remainder with respect to zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* a's exponent is more than one below b's: a is returned unchanged. */
    if ( expDiff < -1 ) return a;
    /* Add implicit bits; a gets one bit less of shift when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Reduce 61 exponent steps per iteration.  The quotient estimate is
     * lowered by 4 and clamped at 0.
     * NOTE(review): presumably so that it never exceeds the true quotient
     * -- confirm against estimateDiv128To64's error bound.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits that are needed. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Subtract b until the remainder turns negative.  alternateASig holds
     * the last non-negative remainder and q counts the subtractions.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * The sum of the two candidates tells which has the smaller magnitude:
     * a negative sum selects the non-negative candidate.  On an exact tie
     * the non-negative candidate is selected when q is odd.
     * NOTE(review): presumably so that the implied quotient is even --
     * confirm.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Convert a negative remainder to sign and magnitude. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* Positive infinity is returned as is; negative infinity is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; any other negative is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent; its low bit picks the shift below. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Start from a 32-bit estimate, then refine it to 64 bits. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; step zSig0 down while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /*
     * Low result word: the estimate is only corrected, and the sticky bit
     * only computed, when its low 13 bits are <= 5.
     * NOTE(review): presumably the only case in which the estimate's error
     * could change the rounded result -- confirm.
     */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A non-zero remainder becomes the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* A NaN has an all-ones exponent field and a non-zero fraction. */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /*
     * Equal when the bit patterns match, or when both operands are zeros
     * of either sign (the left shift discards the sign bits).
     */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise. The invalid
| exception is raised if either operand is a NaN. The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7615 *----------------------------------------------------------------------------*/ 7616 7617 int float128_le(float128 a, float128 b, float_status *status) 7618 { 7619 flag aSign, bSign; 7620 7621 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7622 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7623 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7624 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7625 ) { 7626 float_raise(float_flag_invalid, status); 7627 return 0; 7628 } 7629 aSign = extractFloat128Sign( a ); 7630 bSign = extractFloat128Sign( b ); 7631 if ( aSign != bSign ) { 7632 return 7633 aSign 7634 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7635 == 0 ); 7636 } 7637 return 7638 aSign ? le128( b.high, b.low, a.high, a.low ) 7639 : le128( a.high, a.low, b.high, b.low ); 7640 7641 } 7642 7643 /*---------------------------------------------------------------------------- 7644 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7645 | the corresponding value `b', and 0 otherwise. The invalid exception is 7646 | raised if either operand is a NaN. The comparison is performed according 7647 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7648 *----------------------------------------------------------------------------*/ 7649 7650 int float128_lt(float128 a, float128 b, float_status *status) 7651 { 7652 flag aSign, bSign; 7653 7654 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7655 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7656 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7657 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7658 ) { 7659 float_raise(float_flag_invalid, status); 7660 return 0; 7661 } 7662 aSign = extractFloat128Sign( a ); 7663 bSign = extractFloat128Sign( b ); 7664 if ( aSign != bSign ) { 7665 return 7666 aSign 7667 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7668 != 0 ); 7669 } 7670 return 7671 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7672 : lt128( a.high, a.low, b.high, b.low ); 7673 7674 } 7675 7676 /*---------------------------------------------------------------------------- 7677 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7678 | be compared, and 0 otherwise. The invalid exception is raised if either 7679 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7680 | Standard for Binary Floating-Point Arithmetic. 7681 *----------------------------------------------------------------------------*/ 7682 7683 int float128_unordered(float128 a, float128 b, float_status *status) 7684 { 7685 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7686 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7687 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7688 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7689 ) { 7690 float_raise(float_flag_invalid, status); 7691 return 1; 7692 } 7693 return 0; 7694 } 7695 7696 /*---------------------------------------------------------------------------- 7697 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7698 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7699 | exception. The comparison is performed according to the IEC/IEEE Standard 7700 | for Binary Floating-Point Arithmetic. 
7701 *----------------------------------------------------------------------------*/ 7702 7703 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7704 { 7705 7706 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7707 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7708 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7709 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7710 ) { 7711 if (float128_is_signaling_nan(a, status) 7712 || float128_is_signaling_nan(b, status)) { 7713 float_raise(float_flag_invalid, status); 7714 } 7715 return 0; 7716 } 7717 return 7718 ( a.low == b.low ) 7719 && ( ( a.high == b.high ) 7720 || ( ( a.low == 0 ) 7721 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7722 ); 7723 7724 } 7725 7726 /*---------------------------------------------------------------------------- 7727 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7728 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7729 | cause an exception. Otherwise, the comparison is performed according to the 7730 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7731 *----------------------------------------------------------------------------*/ 7732 7733 int float128_le_quiet(float128 a, float128 b, float_status *status) 7734 { 7735 flag aSign, bSign; 7736 7737 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7738 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7739 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7740 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7741 ) { 7742 if (float128_is_signaling_nan(a, status) 7743 || float128_is_signaling_nan(b, status)) { 7744 float_raise(float_flag_invalid, status); 7745 } 7746 return 0; 7747 } 7748 aSign = extractFloat128Sign( a ); 7749 bSign = extractFloat128Sign( b ); 7750 if ( aSign != bSign ) { 7751 return 7752 aSign 7753 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7754 == 0 ); 7755 } 7756 return 7757 aSign ? le128( b.high, b.low, a.high, a.low ) 7758 : le128( a.high, a.low, b.high, b.low ); 7759 7760 } 7761 7762 /*---------------------------------------------------------------------------- 7763 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7764 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7765 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7766 | Standard for Binary Floating-Point Arithmetic. 
7767 *----------------------------------------------------------------------------*/ 7768 7769 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7770 { 7771 flag aSign, bSign; 7772 7773 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7774 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7775 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7776 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7777 ) { 7778 if (float128_is_signaling_nan(a, status) 7779 || float128_is_signaling_nan(b, status)) { 7780 float_raise(float_flag_invalid, status); 7781 } 7782 return 0; 7783 } 7784 aSign = extractFloat128Sign( a ); 7785 bSign = extractFloat128Sign( b ); 7786 if ( aSign != bSign ) { 7787 return 7788 aSign 7789 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7790 != 0 ); 7791 } 7792 return 7793 aSign ? lt128( b.high, b.low, a.high, a.low ) 7794 : lt128( a.high, a.low, b.high, b.low ); 7795 7796 } 7797 7798 /*---------------------------------------------------------------------------- 7799 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7800 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7801 | comparison is performed according to the IEC/IEEE Standard for Binary 7802 | Floating-Point Arithmetic. 
7803 *----------------------------------------------------------------------------*/ 7804 7805 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7806 { 7807 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7808 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7809 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7810 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7811 ) { 7812 if (float128_is_signaling_nan(a, status) 7813 || float128_is_signaling_nan(b, status)) { 7814 float_raise(float_flag_invalid, status); 7815 } 7816 return 1; 7817 } 7818 return 0; 7819 } 7820 7821 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7822 int is_quiet, float_status *status) 7823 { 7824 flag aSign, bSign; 7825 7826 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7827 float_raise(float_flag_invalid, status); 7828 return float_relation_unordered; 7829 } 7830 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7831 ( extractFloatx80Frac( a )<<1 ) ) || 7832 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7833 ( extractFloatx80Frac( b )<<1 ) )) { 7834 if (!is_quiet || 7835 floatx80_is_signaling_nan(a, status) || 7836 floatx80_is_signaling_nan(b, status)) { 7837 float_raise(float_flag_invalid, status); 7838 } 7839 return float_relation_unordered; 7840 } 7841 aSign = extractFloatx80Sign( a ); 7842 bSign = extractFloatx80Sign( b ); 7843 if ( aSign != bSign ) { 7844 7845 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7846 ( ( a.low | b.low ) == 0 ) ) { 7847 /* zero case */ 7848 return float_relation_equal; 7849 } else { 7850 return 1 - (2 * aSign); 7851 } 7852 } else { 7853 if (a.low == b.low && a.high == b.high) { 7854 return float_relation_equal; 7855 } else { 7856 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7857 } 7858 } 7859 } 7860 7861 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7862 { 7863 return floatx80_compare_internal(a, b, 0, status); 7864 } 7865 
7866 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7867 { 7868 return floatx80_compare_internal(a, b, 1, status); 7869 } 7870 7871 static inline int float128_compare_internal(float128 a, float128 b, 7872 int is_quiet, float_status *status) 7873 { 7874 flag aSign, bSign; 7875 7876 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7877 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7878 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7879 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7880 if (!is_quiet || 7881 float128_is_signaling_nan(a, status) || 7882 float128_is_signaling_nan(b, status)) { 7883 float_raise(float_flag_invalid, status); 7884 } 7885 return float_relation_unordered; 7886 } 7887 aSign = extractFloat128Sign( a ); 7888 bSign = extractFloat128Sign( b ); 7889 if ( aSign != bSign ) { 7890 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7891 /* zero case */ 7892 return float_relation_equal; 7893 } else { 7894 return 1 - (2 * aSign); 7895 } 7896 } else { 7897 if (a.low == b.low && a.high == b.high) { 7898 return float_relation_equal; 7899 } else { 7900 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7901 } 7902 } 7903 } 7904 7905 int float128_compare(float128 a, float128 b, float_status *status) 7906 { 7907 return float128_compare_internal(a, b, 0, status); 7908 } 7909 7910 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7911 { 7912 return float128_compare_internal(a, b, 1, status); 7913 } 7914 7915 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7916 { 7917 flag aSign; 7918 int32_t aExp; 7919 uint64_t aSig; 7920 7921 if (floatx80_invalid_encoding(a)) { 7922 float_raise(float_flag_invalid, status); 7923 return floatx80_default_nan(status); 7924 } 7925 aSig = extractFloatx80Frac( a ); 7926 aExp = extractFloatx80Exp( a ); 7927 aSign = extractFloatx80Sign( a ); 7928 7929 if ( aExp == 0x7FFF ) { 7930 if ( aSig<<1 ) { 7931 return 
propagateFloatx80NaN(a, a, status); 7932 } 7933 return a; 7934 } 7935 7936 if (aExp == 0) { 7937 if (aSig == 0) { 7938 return a; 7939 } 7940 aExp++; 7941 } 7942 7943 if (n > 0x10000) { 7944 n = 0x10000; 7945 } else if (n < -0x10000) { 7946 n = -0x10000; 7947 } 7948 7949 aExp += n; 7950 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7951 aSign, aExp, aSig, 0, status); 7952 } 7953 7954 float128 float128_scalbn(float128 a, int n, float_status *status) 7955 { 7956 flag aSign; 7957 int32_t aExp; 7958 uint64_t aSig0, aSig1; 7959 7960 aSig1 = extractFloat128Frac1( a ); 7961 aSig0 = extractFloat128Frac0( a ); 7962 aExp = extractFloat128Exp( a ); 7963 aSign = extractFloat128Sign( a ); 7964 if ( aExp == 0x7FFF ) { 7965 if ( aSig0 | aSig1 ) { 7966 return propagateFloat128NaN(a, a, status); 7967 } 7968 return a; 7969 } 7970 if (aExp != 0) { 7971 aSig0 |= LIT64( 0x0001000000000000 ); 7972 } else if (aSig0 == 0 && aSig1 == 0) { 7973 return a; 7974 } else { 7975 aExp++; 7976 } 7977 7978 if (n > 0x10000) { 7979 n = 0x10000; 7980 } else if (n < -0x10000) { 7981 n = -0x10000; 7982 } 7983 7984 aExp += n - 1; 7985 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7986 , status); 7987 7988 } 7989 7990 static void __attribute__((constructor)) softfloat_init(void) 7991 { 7992 union_float64 ua, ub, uc, ur; 7993 7994 if (QEMU_NO_HARDFLOAT) { 7995 return; 7996 } 7997 /* 7998 * Test that the host's FMA is not obviously broken. For example, 7999 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 8000 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 8001 */ 8002 ua.s = 0x0020000000000001ULL; 8003 ub.s = 0x3ca0000000000000ULL; 8004 uc.s = 0x0020000000000000ULL; 8005 ur.h = fma(ua.h, ub.h, uc.h); 8006 if (ur.s != 0x0020000000000001ULL) { 8007 force_soft_fma = true; 8008 } 8009 } 8010