1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */
/*
 * Flush a denormal input operand to a same-signed zero and raise the
 * input-denormal exception flag.  The flush_inputs_to_zero setting is
 * deliberately NOT checked here; the wrappers below do that first.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, honouring the flush_inputs_to_zero setting. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* As above, for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* As above, for three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Hardfloat is only usable while the accrued inexact flag is already set
 * (so new inexact results need not be detected) and the rounding mode is
 * round-to-nearest-even, the host FPU's default.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* Guest float32 bit pattern aliased with a host float. */
typedef union {
    float32 s;
    float h;
} union_float32;

/* Guest float64 bit pattern aliased with a host double. */
typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates used as the 'pre'/'post' checks of the gen2 helpers below. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input float32 operation: try the host FPU first.
 * Falls back to the softfloat implementation 'soft' when hardfloat cannot
 * be used, when 'pre' rejects the (flushed) inputs, or when the hard
 * result's magnitude is <= FLT_MIN and 'post' cannot rule out a wrongly
 * undetected underflow/inexact.  An infinite hard result raises overflow
 * (callers' 'pre' checks are expected to exclude non-finite inputs).
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* As above, for float64 / host double. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Turn a FloatClass value into a bit so multiple classes can be mask-tested */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                              \
    .exp_size       = E,                                \
    .exp_bias       = ((1 << E) - 1) >> 1,              \
    .exp_max        = (1 << E) - 1,                     \
    .frac_size      = F,                                \
    .frac_shift     = (-F - 1) & 63,                    \
    .frac_lsb       = 1ull << ((-F - 1) & 63),          \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),          \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,    \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize.  */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

/* As above, for the 128-bit format: sign/exp/frac_hi live in the high word. */
static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.
*/ 662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt) 663 { 664 const int f_size = fmt->frac_size; 665 const int e_size = fmt->exp_size; 666 uint64_t ret; 667 668 ret = (uint64_t)p->sign << (f_size + e_size); 669 ret = deposit64(ret, f_size, e_size, p->exp); 670 ret = deposit64(ret, 0, f_size, p->frac); 671 return ret; 672 } 673 674 static inline float16 float16_pack_raw(const FloatParts64 *p) 675 { 676 return make_float16(pack_raw64(p, &float16_params)); 677 } 678 679 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p) 680 { 681 return pack_raw64(p, &bfloat16_params); 682 } 683 684 static inline float32 float32_pack_raw(const FloatParts64 *p) 685 { 686 return make_float32(pack_raw64(p, &float32_params)); 687 } 688 689 static inline float64 float64_pack_raw(const FloatParts64 *p) 690 { 691 return make_float64(pack_raw64(p, &float64_params)); 692 } 693 694 static float128 float128_pack_raw(const FloatParts128 *p) 695 { 696 const int f_size = float128_params.frac_size - 64; 697 const int e_size = float128_params.exp_size; 698 uint64_t hi; 699 700 hi = (uint64_t)p->sign << (f_size + e_size); 701 hi = deposit64(hi, f_size, e_size, p->exp); 702 hi = deposit64(hi, 0, f_size, p->frac_hi); 703 return make_float128(hi, p->frac_lo); 704 } 705 706 /*---------------------------------------------------------------------------- 707 | Functions and definitions to determine: (1) whether tininess for underflow 708 | is detected before or after rounding by default, (2) what (if anything) 709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 711 | are propagated from function inputs to output. These details are target- 712 | specific. 
713 *----------------------------------------------------------------------------*/ 714 #include "softfloat-specialize.c.inc" 715 716 #define PARTS_GENERIC_64_128(NAME, P) \ 717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME) 718 719 #define PARTS_GENERIC_64_128_256(NAME, P) \ 720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \ 721 (FloatParts128 *, parts128_##NAME), parts64_##NAME) 722 723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S) 724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S) 725 726 static void parts64_return_nan(FloatParts64 *a, float_status *s); 727 static void parts128_return_nan(FloatParts128 *a, float_status *s); 728 729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S) 730 731 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b, 732 float_status *s); 733 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b, 734 float_status *s); 735 736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S) 737 738 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b, 739 FloatParts64 *c, float_status *s, 740 int ab_mask, int abc_mask); 741 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a, 742 FloatParts128 *b, 743 FloatParts128 *c, 744 float_status *s, 745 int ab_mask, int abc_mask); 746 747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \ 748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM) 749 750 static void parts64_canonicalize(FloatParts64 *p, float_status *status, 751 const FloatFmt *fmt); 752 static void parts128_canonicalize(FloatParts128 *p, float_status *status, 753 const FloatFmt *fmt); 754 755 #define parts_canonicalize(A, S, F) \ 756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F) 757 758 static void parts64_uncanon(FloatParts64 *p, float_status *status, 759 const FloatFmt *fmt); 760 static void parts128_uncanon(FloatParts128 
*p, float_status *status, 761 const FloatFmt *fmt); 762 763 #define parts_uncanon(A, S, F) \ 764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F) 765 766 static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b); 767 static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b); 768 static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b); 769 770 #define parts_add_normal(A, B) \ 771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B) 772 773 static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b); 774 static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b); 775 static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b); 776 777 #define parts_sub_normal(A, B) \ 778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B) 779 780 static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b, 781 float_status *s, bool subtract); 782 static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b, 783 float_status *s, bool subtract); 784 785 #define parts_addsub(A, B, S, Z) \ 786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z) 787 788 static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b, 789 float_status *s); 790 static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b, 791 float_status *s); 792 793 #define parts_mul(A, B, S) \ 794 PARTS_GENERIC_64_128(mul, A)(A, B, S) 795 796 static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b, 797 FloatParts64 *c, int flags, 798 float_status *s); 799 static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b, 800 FloatParts128 *c, int flags, 801 float_status *s); 802 803 #define parts_muladd(A, B, C, Z, S) \ 804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S) 805 806 /* 807 * Helper functions for softfloat-parts.c.inc, per-size operations. 
 */

/*
 * Dispatch macros: select the frac64/frac128(/frac256) implementation
 * of NAME according to the pointer type of argument P.
 */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* Fraction addition: R = A + B; returns the carry out of the msb. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    /* Propagate the carry from the low limb into the high limb. */
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

/* Add the 64-bit constant C to the fraction; returns the carry out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

/* Set the fraction to all ones. */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

/* Unsigned comparison of fractions: returns -1, 0 or 1 for <, ==, >. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

/* Test whether the fraction is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

/* Widening multiply: R (double width) = A * B. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

/* Two's complement negation of the fraction, in place. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    /* 0 - frac, with the borrow rippling upward through the limbs. */
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128_256(neg, A)(A)

/*
 * Left-justify the fraction (msb into the top bit); returns the shift
 * count applied, or the full bit-width if the fraction was zero.
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        /* Shift whole limbs first: move the highest non-zero limb up. */
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    /* Apply the remaining sub-64-bit shift across all four limbs. */
    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128_256(normalize, A)(A)

/* Left-shift the fraction by C bits. */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    /* Whole-limb move first, then the residual 0..63 bit shift. */
    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

/* Right-shift the fraction by C bits, discarding the bits shifted out. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    /* Whole-limb move first, then the residual 0..63 bit shift. */
    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

/*
 * Right-shift the fraction by C bits, jamming ("oring") any bits
 * shifted out into the lsb, so that inexactness is not lost.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            /* shr_double(a0, 0, c) recovers exactly the bits shifted out. */
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            /* Everything is shifted out; only the sticky bit survives. */
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        /* Whole low limb is shifted out; accumulate it as sticky. */
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        /* Entire fraction shifted out. */
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        /* Whole-limb moves, collecting discarded limbs into sticky. */
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        /* Entire fraction shifted out. */
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

/* Fraction subtraction: R = A - B; returns the borrow out of the msb. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

/* Narrow a double-width fraction, jamming discarded bits into the lsb. */
static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)

/* Widen a fraction to double width, zero-filling the new low limbs. */
static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)

/*
 * Instantiate the width-generic partsN_*() operations for each
 * supported fraction width N, with W the next wider width used for
 * intermediate results.
 */
#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef  N
#undef  W
#undef  partsN
#undef  FloatPartsN
#undef  FloatPartsW

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

/* Unpack F using an explicit format PARAMS (e.g. ieee vs AHP float16). */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

/* Round and repack using an explicit format PARAMS. */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}

/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

/* Soft-float fallback for float32 add/sub. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

/* Soft-float fallback for float64 add/sub. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

/* Hard-float fast paths, used when the host FPU result can be trusted. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/* Hardfloat post-check for add/sub/mul: reject a result from two zeros. */
static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Soft-float fallback for float32 multiply. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

/* Soft-float fallback for float64 multiply. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

/* Soft-float fallback for float32 fused multiply-add. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

/* Soft-float fallback for float64 fused multiply-add. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

/* When true, always take the soft-float fma path (set elsewhere in
 * this file; not modified in this section). */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result has no hard-float equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result may be subnormal: redo with the original inputs
             * through the soft-float path. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* halve_result has no hard-float equivalent. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /*
             * Result may be subnormal: redo through soft-float.
             * NOTE(review): the bound is FLT_MIN, not DBL_MIN, which is
             * very conservative for doubles; results remain correct
             * because the soft path recomputes — confirm intent.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'.  The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Soft-float fallback for float32 division. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Soft-float fallback for float64 division. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a
           / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/* Hardfloat precondition for division: the divisor must be normal. */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Hardfloat post-check for division: reject a zero quotient. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another.  The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* !ieee selects the alternative (AHP) half-precision format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ?
&float16_params : &float16_params_ahp; 2109 FloatParts64 pa, pr; 2110 2111 float64_unpack_canonical(&pa, a, s); 2112 pr = float_to_float(pa, fmt16, s); 2113 return float16a_round_pack_canonical(&pr, s, fmt16); 2114 } 2115 2116 float32 float64_to_float32(float64 a, float_status *s) 2117 { 2118 FloatParts64 pa, pr; 2119 2120 float64_unpack_canonical(&pa, a, s); 2121 pr = float_to_float(pa, &float32_params, s); 2122 return float32_round_pack_canonical(&pr, s); 2123 } 2124 2125 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2126 { 2127 FloatParts64 pa, pr; 2128 2129 bfloat16_unpack_canonical(&pa, a, s); 2130 pr = float_to_float(pa, &float32_params, s); 2131 return float32_round_pack_canonical(&pr, s); 2132 } 2133 2134 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2135 { 2136 FloatParts64 pa, pr; 2137 2138 bfloat16_unpack_canonical(&pa, a, s); 2139 pr = float_to_float(pa, &float64_params, s); 2140 return float64_round_pack_canonical(&pr, s); 2141 } 2142 2143 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2144 { 2145 FloatParts64 pa, pr; 2146 2147 float32_unpack_canonical(&pa, a, s); 2148 pr = float_to_float(pa, &bfloat16_params, s); 2149 return bfloat16_round_pack_canonical(&pr, s); 2150 } 2151 2152 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2153 { 2154 FloatParts64 pa, pr; 2155 2156 float64_unpack_canonical(&pa, a, s); 2157 pr = float_to_float(pa, &bfloat16_params, s); 2158 return bfloat16_round_pack_canonical(&pr, s); 2159 } 2160 2161 /* 2162 * Rounds the floating-point value `a' to an integer, and returns the 2163 * result as a floating-point value. The operation is performed 2164 * according to the IEC/IEEE Standard for Binary Floating-Point 2165 * Arithmetic. 
 */

/*
 * Round the decomposed value `a' to an integral value, honouring
 * `rmode'.  `scale' pre-scales the value by 2**scale (clamped so the
 * exponent cannot overflow).  Raises inexact whenever fraction bits
 * are discarded.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        /* NaN in, (possibly silenced) NaN out. */
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp scale so a.exp cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: the result is exactly one or exactly zero */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* An exact tie (0.5, frac == implicit bit) rounds to
                 * the even integer, i.e. zero. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                /* Any nonzero fraction rounds to the odd integer 1. */
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Some fraction bits lie below the integer point. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Suppress the increment on an exact tie whose integer
                 * lsb is already even. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Round up only if the integer lsb is currently even. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the top bit: renormalize. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/* Round to an integral value in the same format, using the rounding
 * mode from the status argument. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */

/*
 * Round `in' per `rmode'/`scale' and saturate the result into
 * [min, max].  On NaN, infinity or overflow the flags raised by the
 * rounding step are discarded and replaced by invalid.
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Saved so an invalid conversion can drop e.g. inexact raised below. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude exceeds uint64: force saturation below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t)min is well-defined even when min == INT64_MIN. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/* float16 -> signed integer with explicit rounding mode and 2**scale
 * pre-scaling; the result saturates to the destination range. */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* float32 -> signed integer, as above. */
int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* float64 -> signed integer, as above. */
int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                        float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Convenience wrappers: convert to a signed integer using the rounding
 * mode from the status argument and no pre-scaling. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: convert to a signed integer truncating toward
 * zero, regardless of the status rounding mode. */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

/* bfloat16 -> signed integer with explicit rounding mode and 2**scale
 * pre-scaling; the result saturates to the destination range. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Wrappers using the status rounding mode, no pre-scaling. */
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Wrappers forcing truncation toward zero. */
int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format.  The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

/*
 * Round `in' per `rmode'/`scale' and saturate into [0, max].  As for
 * the signed variant, an invalid conversion discards the flags raised
 * by the rounding step and replaces them with invalid.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Saved so an invalid conversion can drop e.g. inexact raised below. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative values that did not round to zero are invalid. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}

/* float16 -> unsigned integer with explicit rounding mode and 2**scale
 * pre-scaling; the result saturates to [0, max]. */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* float32 -> unsigned integer, as above. */
uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* float64 -> unsigned integer with explicit rounding mode and 2**scale
 * pre-scaling; the result saturates to [0, max]. */
uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* Convenience wrappers: convert to an unsigned integer using the
 * rounding mode from the status argument and no pre-scaling. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers forcing truncation toward zero. */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format.
 The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/* Decompose the signed integer `a', scaled by 2**scale, into
 * FloatParts64 canonical form (normalized, sign/exp/frac). */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* Negate in unsigned arithmetic: well-defined for INT64_MIN. */
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        /* Clamp scale so r.exp cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;    /* normalize: msb becomes the implicit bit */
    }

    return r;
}

/* Signed integer -> float16, scaled by 2**scale.  The narrower widths
 * delegate to the 64-bit variant (no precision is lost widening). */
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

/* Signed integer -> float32, as above. */
float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

/* Signed integer -> float64, as above. */
float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

/* Signed integer -> bfloat16, scaled by 2**scale. */
bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/* Decompose the unsigned integer `a', scaled by 2**scale, into
 * FloatParts64 canonical form; simpler than int_to_float since no
 * sign handling is needed. */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp scale so r.exp cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;    /* normalize: msb becomes the implicit bit */
    }

    return r;
}

/* Unsigned integer -> float16, scaled by 2**scale.  Narrower widths
 * delegate to the 64-bit variant. */
float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

/* Unsigned integer -> float32, as above. */
float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

/* Unsigned integer -> float64, as above. */
float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
3212 */ 3213 3214 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3215 { 3216 FloatParts64 pa = uint_to_float(a, scale, status); 3217 return bfloat16_round_pack_canonical(&pa, status); 3218 } 3219 3220 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3221 { 3222 return uint64_to_bfloat16_scalbn(a, scale, status); 3223 } 3224 3225 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3226 { 3227 return uint64_to_bfloat16_scalbn(a, scale, status); 3228 } 3229 3230 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3231 { 3232 return uint64_to_bfloat16_scalbn(a, 0, status); 3233 } 3234 3235 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3236 { 3237 return uint64_to_bfloat16_scalbn(a, 0, status); 3238 } 3239 3240 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3241 { 3242 return uint64_to_bfloat16_scalbn(a, 0, status); 3243 } 3244 3245 /* Float Min/Max */ 3246 /* min() and max() functions. These can't be implemented as 3247 * 'compare and pick one input' because that would mishandle 3248 * NaNs and +0 vs -0. 3249 * 3250 * minnum() and maxnum() functions. These are similar to the min() 3251 * and max() functions but if one of the arguments is a QNaN and 3252 * the other is numerical then the numerical argument is returned. 3253 * SNaNs will get quietened before being returned. 3254 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3255 * and maxNum() operations. min() and max() are the typical min/max 3256 * semantics provided by many CPUs which predate that specification. 3257 * 3258 * minnummag() and maxnummag() functions correspond to minNumMag() 3259 * and minNumMag() from the IEEE-754 2008. 
 */
/*
 * Shared implementation for all soft-float min/max variants.
 *
 * ismin: true for the min()-family operations, false for max().
 * ieee:  true for IEEE 754-2008 minNum/maxNum semantics, where a single
 *        quiet NaN input is ignored in favour of the numerical operand.
 * ismag: true for the *mag variants, which decide on magnitude alone
 *        unless the magnitudes are equal.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* Only `a' is a quiet NaN: the number wins. */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both NaN, or non-IEEE semantics: propagate a NaN. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map the class onto an effective exponent so that zero and
         * infinity order correctly against normal numbers.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare: decided purely on exponent and fraction;
         * equal magnitudes fall through to the sign-aware compare.
         */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* Same sign: magnitude ordering inverts for negative
             * values, hence the extra xor with the sign bit.
             */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs: the negative operand is the smaller. */
            return a.sign ^ ismin ? b : a;
        }
    }
}

/* Expand one soft-float min/max entry point per width and flavour. */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

/* As above, but for the bfloat16 format. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX

/* Floating point compare
 */
/*
 * Shared soft-float comparison.  Returns a FloatRelation; when
 * `is_quiet' is true only signaling NaNs raise the invalid exception,
 * otherwise any NaN operand does.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            /* +0 and -0 compare equal regardless of sign. */
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Both finite, non-zero, same sign: compare exponents first, then
     * fractions; the ordering inverts when both values are negative.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Expand a soft-float comparison helper for the given width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: use the host's C99 comparison macros when
 * permitted, falling back to the soft implementation for the
 * unordered case so that the exception flags get set.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* Same structure as f32_compare above, for float64. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always use the soft compare. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
*/ 3568 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3569 { 3570 if (unlikely(is_nan(a.cls))) { 3571 parts_return_nan(&a, s); 3572 } 3573 if (a.cls == float_class_normal) { 3574 /* The largest float type (even though not supported by FloatParts64) 3575 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3576 * still allows rounding to infinity, without allowing overflow 3577 * within the int32_t that backs FloatParts64.exp. 3578 */ 3579 n = MIN(MAX(n, -0x10000), 0x10000); 3580 a.exp += n; 3581 } 3582 return a; 3583 } 3584 3585 float16 float16_scalbn(float16 a, int n, float_status *status) 3586 { 3587 FloatParts64 pa, pr; 3588 3589 float16_unpack_canonical(&pa, a, status); 3590 pr = scalbn_decomposed(pa, n, status); 3591 return float16_round_pack_canonical(&pr, status); 3592 } 3593 3594 float32 float32_scalbn(float32 a, int n, float_status *status) 3595 { 3596 FloatParts64 pa, pr; 3597 3598 float32_unpack_canonical(&pa, a, status); 3599 pr = scalbn_decomposed(pa, n, status); 3600 return float32_round_pack_canonical(&pr, status); 3601 } 3602 3603 float64 float64_scalbn(float64 a, int n, float_status *status) 3604 { 3605 FloatParts64 pa, pr; 3606 3607 float64_unpack_canonical(&pa, a, status); 3608 pr = scalbn_decomposed(pa, n, status); 3609 return float64_round_pack_canonical(&pr, status); 3610 } 3611 3612 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3613 { 3614 FloatParts64 pa, pr; 3615 3616 bfloat16_unpack_canonical(&pa, a, status); 3617 pr = scalbn_decomposed(pa, n, status); 3618 return bfloat16_round_pack_canonical(&pr, status); 3619 } 3620 3621 /* 3622 * Square Root 3623 * 3624 * The old softfloat code did an approximation step before zeroing in 3625 * on the final result. However for simpleness we just compute the 3626 * square root by iterating down from the implicit bit to enough extra 3627 * bits to ensure we get a correctly rounded result. 
3628 * 3629 * This does mean however the calculation is slower than before, 3630 * especially for 64 bit floats. 3631 */ 3632 3633 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p) 3634 { 3635 uint64_t a_frac, r_frac, s_frac; 3636 int bit, last_bit; 3637 3638 if (is_nan(a.cls)) { 3639 parts_return_nan(&a, s); 3640 return a; 3641 } 3642 if (a.cls == float_class_zero) { 3643 return a; /* sqrt(+-0) = +-0 */ 3644 } 3645 if (a.sign) { 3646 float_raise(float_flag_invalid, s); 3647 parts_default_nan(&a, s); 3648 return a; 3649 } 3650 if (a.cls == float_class_inf) { 3651 return a; /* sqrt(+inf) = +inf */ 3652 } 3653 3654 assert(a.cls == float_class_normal); 3655 3656 /* We need two overflow bits at the top. Adding room for that is a 3657 * right shift. If the exponent is odd, we can discard the low bit 3658 * by multiplying the fraction by 2; that's a left shift. Combine 3659 * those and we shift right by 1 if the exponent is odd, otherwise 2. 3660 */ 3661 a_frac = a.frac >> (2 - (a.exp & 1)); 3662 a.exp >>= 1; 3663 3664 /* Bit-by-bit computation of sqrt. */ 3665 r_frac = 0; 3666 s_frac = 0; 3667 3668 /* Iterate from implicit bit down to the 3 extra bits to compute a 3669 * properly rounded result. Remember we've inserted two more bits 3670 * at the top, so these positions are two less. 3671 */ 3672 bit = DECOMPOSED_BINARY_POINT - 2; 3673 last_bit = MAX(p->frac_shift - 4, 0); 3674 do { 3675 uint64_t q = 1ULL << bit; 3676 uint64_t t_frac = s_frac + q; 3677 if (t_frac <= a_frac) { 3678 s_frac = t_frac + q; 3679 a_frac -= t_frac; 3680 r_frac += q; 3681 } 3682 a_frac <<= 1; 3683 } while (--bit >= last_bit); 3684 3685 /* Undo the right shift done above. If there is any remaining 3686 * fraction, the result is inexact. Set the sticky bit. 
3687 */ 3688 a.frac = (r_frac << 2) + (a_frac != 0); 3689 3690 return a; 3691 } 3692 3693 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3694 { 3695 FloatParts64 pa, pr; 3696 3697 float16_unpack_canonical(&pa, a, status); 3698 pr = sqrt_float(pa, status, &float16_params); 3699 return float16_round_pack_canonical(&pr, status); 3700 } 3701 3702 static float32 QEMU_SOFTFLOAT_ATTR 3703 soft_f32_sqrt(float32 a, float_status *status) 3704 { 3705 FloatParts64 pa, pr; 3706 3707 float32_unpack_canonical(&pa, a, status); 3708 pr = sqrt_float(pa, status, &float32_params); 3709 return float32_round_pack_canonical(&pr, status); 3710 } 3711 3712 static float64 QEMU_SOFTFLOAT_ATTR 3713 soft_f64_sqrt(float64 a, float_status *status) 3714 { 3715 FloatParts64 pa, pr; 3716 3717 float64_unpack_canonical(&pa, a, status); 3718 pr = sqrt_float(pa, status, &float64_params); 3719 return float64_round_pack_canonical(&pr, status); 3720 } 3721 3722 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3723 { 3724 union_float32 ua, ur; 3725 3726 ua.s = xa; 3727 if (unlikely(!can_use_fpu(s))) { 3728 goto soft; 3729 } 3730 3731 float32_input_flush1(&ua.s, s); 3732 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3733 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3734 fpclassify(ua.h) == FP_ZERO) || 3735 signbit(ua.h))) { 3736 goto soft; 3737 } 3738 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3739 float32_is_neg(ua.s))) { 3740 goto soft; 3741 } 3742 ur.h = sqrtf(ua.h); 3743 return ur.s; 3744 3745 soft: 3746 return soft_f32_sqrt(ua.s, s); 3747 } 3748 3749 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3750 { 3751 union_float64 ua, ur; 3752 3753 ua.s = xa; 3754 if (unlikely(!can_use_fpu(s))) { 3755 goto soft; 3756 } 3757 3758 float64_input_flush1(&ua.s, s); 3759 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3760 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3761 fpclassify(ua.h) == FP_ZERO) || 3762 signbit(ua.h))) { 3763 goto soft; 3764 } 3765 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3766 float64_is_neg(ua.s))) { 3767 goto soft; 3768 } 3769 ur.h = sqrt(ua.h); 3770 return ur.s; 3771 3772 soft: 3773 return soft_f64_sqrt(ua.s, s); 3774 } 3775 3776 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3777 { 3778 FloatParts64 pa, pr; 3779 3780 bfloat16_unpack_canonical(&pa, a, status); 3781 pr = sqrt_float(pa, status, &bfloat16_params); 3782 return bfloat16_round_pack_canonical(&pr, status); 3783 } 3784 3785 /*---------------------------------------------------------------------------- 3786 | The pattern for a default generated NaN. 3787 *----------------------------------------------------------------------------*/ 3788 3789 float16 float16_default_nan(float_status *status) 3790 { 3791 FloatParts64 p; 3792 3793 parts_default_nan(&p, status); 3794 p.frac >>= float16_params.frac_shift; 3795 return float16_pack_raw(&p); 3796 } 3797 3798 float32 float32_default_nan(float_status *status) 3799 { 3800 FloatParts64 p; 3801 3802 parts_default_nan(&p, status); 3803 p.frac >>= float32_params.frac_shift; 3804 return float32_pack_raw(&p); 3805 } 3806 3807 float64 float64_default_nan(float_status *status) 3808 { 3809 FloatParts64 p; 3810 3811 parts_default_nan(&p, status); 3812 p.frac >>= float64_params.frac_shift; 3813 return float64_pack_raw(&p); 3814 } 3815 3816 float128 float128_default_nan(float_status *status) 3817 { 3818 FloatParts128 p; 3819 3820 parts_default_nan(&p, status); 3821 frac_shr(&p, float128_params.frac_shift); 3822 return float128_pack_raw(&p); 3823 } 3824 3825 bfloat16 bfloat16_default_nan(float_status *status) 3826 { 3827 FloatParts64 p; 3828 3829 parts_default_nan(&p, status); 3830 p.frac >>= bfloat16_params.frac_shift; 3831 return bfloat16_pack_raw(&p); 3832 } 3833 3834 /*---------------------------------------------------------------------------- 3835 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3836 *----------------------------------------------------------------------------*/ 3837 3838 float16 float16_silence_nan(float16 a, float_status *status) 3839 { 3840 FloatParts64 p; 3841 3842 float16_unpack_raw(&p, a); 3843 p.frac <<= float16_params.frac_shift; 3844 parts_silence_nan(&p, status); 3845 p.frac >>= float16_params.frac_shift; 3846 return float16_pack_raw(&p); 3847 } 3848 3849 float32 float32_silence_nan(float32 a, float_status *status) 3850 { 3851 FloatParts64 p; 3852 3853 float32_unpack_raw(&p, a); 3854 p.frac <<= float32_params.frac_shift; 3855 parts_silence_nan(&p, status); 3856 p.frac >>= float32_params.frac_shift; 3857 return float32_pack_raw(&p); 3858 } 3859 3860 float64 float64_silence_nan(float64 a, float_status *status) 3861 { 3862 FloatParts64 p; 3863 3864 float64_unpack_raw(&p, a); 3865 p.frac <<= float64_params.frac_shift; 3866 parts_silence_nan(&p, status); 3867 p.frac >>= float64_params.frac_shift; 3868 return float64_pack_raw(&p); 3869 } 3870 3871 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3872 { 3873 FloatParts64 p; 3874 3875 bfloat16_unpack_raw(&p, a); 3876 p.frac <<= bfloat16_params.frac_shift; 3877 parts_silence_nan(&p, status); 3878 p.frac >>= bfloat16_params.frac_shift; 3879 return bfloat16_pack_raw(&p); 3880 } 3881 3882 float128 float128_silence_nan(float128 a, float_status *status) 3883 { 3884 FloatParts128 p; 3885 3886 float128_unpack_raw(&p, a); 3887 frac_shl(&p, float128_params.frac_shift); 3888 parts_silence_nan(&p, status); 3889 frac_shr(&p, float128_params.frac_shift); 3890 return float128_pack_raw(&p); 3891 } 3892 3893 /*---------------------------------------------------------------------------- 3894 | If `a' is denormal and we are in flush-to-zero mode then set the 3895 | input-denormal exception and return zero. Otherwise just return the value. 
3896 *----------------------------------------------------------------------------*/ 3897 3898 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3899 { 3900 if (p.exp == 0 && p.frac != 0) { 3901 float_raise(float_flag_input_denormal, status); 3902 return true; 3903 } 3904 3905 return false; 3906 } 3907 3908 float16 float16_squash_input_denormal(float16 a, float_status *status) 3909 { 3910 if (status->flush_inputs_to_zero) { 3911 FloatParts64 p; 3912 3913 float16_unpack_raw(&p, a); 3914 if (parts_squash_denormal(p, status)) { 3915 return float16_set_sign(float16_zero, p.sign); 3916 } 3917 } 3918 return a; 3919 } 3920 3921 float32 float32_squash_input_denormal(float32 a, float_status *status) 3922 { 3923 if (status->flush_inputs_to_zero) { 3924 FloatParts64 p; 3925 3926 float32_unpack_raw(&p, a); 3927 if (parts_squash_denormal(p, status)) { 3928 return float32_set_sign(float32_zero, p.sign); 3929 } 3930 } 3931 return a; 3932 } 3933 3934 float64 float64_squash_input_denormal(float64 a, float_status *status) 3935 { 3936 if (status->flush_inputs_to_zero) { 3937 FloatParts64 p; 3938 3939 float64_unpack_raw(&p, a); 3940 if (parts_squash_denormal(p, status)) { 3941 return float64_set_sign(float64_zero, p.sign); 3942 } 3943 } 3944 return a; 3945 } 3946 3947 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3948 { 3949 if (status->flush_inputs_to_zero) { 3950 FloatParts64 p; 3951 3952 bfloat16_unpack_raw(&p, a); 3953 if (parts_squash_denormal(p, status)) { 3954 return bfloat16_set_sign(bfloat16_zero, p.sign); 3955 } 3956 } 3957 return a; 3958 } 3959 3960 /*---------------------------------------------------------------------------- 3961 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3962 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3963 | input. If `zSign' is 1, the input is negated before being converted to an 3964 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The 7 bits below the binary point are the rounding bits: 0x40 is
     * the half-way increment, 0x7f rounds away from zero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the integer part is currently even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exact tie: clear the low bit to round to even. */
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of range if more than 32 bits remain, or if the sign of the
     * truncated result disagrees with the requested sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of absZ1 set means the fraction is >= 1/2. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie (only the top fraction bit set): round to even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/* NOTE(review): declared int64_t although the documented result is an
 * unsigned 64-bit value; callers presumably reinterpret the bits --
 * confirm before changing the signature.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Carried out of 64 bits: saturate to the maximum. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* NOTE(review): assumes a non-zero significand -- presumably
     * guaranteed by callers; verify before reuse.
     */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller.
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned cast also routes negative zExp (possible underflow)
     * into this branch.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -!overflow_to_inf is all-ones when saturating, which
             * (presumably mirroring packFloat64 below) carries back
             * into the exponent to give the largest finite value.
             */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exact tie: clear the low bit to round to even. */
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
4265 *----------------------------------------------------------------------------*/ 4266 4267 static float32 4268 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4269 float_status *status) 4270 { 4271 int8_t shiftCount; 4272 4273 shiftCount = clz32(zSig) - 1; 4274 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4275 status); 4276 4277 } 4278 4279 /*---------------------------------------------------------------------------- 4280 | Normalizes the subnormal double-precision floating-point value represented 4281 | by the denormalized significand `aSig'. The normalized exponent and 4282 | significand are stored at the locations pointed to by `zExpPtr' and 4283 | `zSigPtr', respectively. 4284 *----------------------------------------------------------------------------*/ 4285 4286 static void 4287 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4288 { 4289 int8_t shiftCount; 4290 4291 shiftCount = clz64(aSig) - 11; 4292 *zSigPtr = aSig<<shiftCount; 4293 *zExpPtr = 1 - shiftCount; 4294 4295 } 4296 4297 /*---------------------------------------------------------------------------- 4298 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4299 | double-precision floating-point value, returning the result. After being 4300 | shifted into the proper positions, the three fields are simply added 4301 | together to form the result. This means that any integer portion of `zSig' 4302 | will be added into the exponent. Since a properly normalized significand 4303 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4304 | than the desired result exponent whenever `zSig' is a complete, normalized 4305 | significand. 
4306 *----------------------------------------------------------------------------*/ 4307 4308 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4309 { 4310 4311 return make_float64( 4312 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4313 4314 } 4315 4316 /*---------------------------------------------------------------------------- 4317 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4318 | and significand `zSig', and returns the proper double-precision floating- 4319 | point value corresponding to the abstract input. Ordinarily, the abstract 4320 | value is simply rounded and packed into the double-precision format, with 4321 | the inexact exception raised if the abstract input cannot be represented 4322 | exactly. However, if the abstract value is too large, the overflow and 4323 | inexact exceptions are raised and an infinity or maximal finite value is 4324 | returned. If the abstract value is too small, the input value is rounded to 4325 | a subnormal number, and the underflow and inexact exceptions are raised if 4326 | the abstract input cannot be represented exactly as a subnormal double- 4327 | precision floating-point number. 4328 | The input significand `zSig' has its binary point between bits 62 4329 | and 61, which is 10 bits to the left of the usual location. This shifted 4330 | significand must be normalized or smaller. If `zSig' is not normalized, 4331 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4332 | and it must not require rounding. In the usual case that `zSig' is 4333 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4334 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4335 | Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Select the value added below the kept bits before the low 10 bits
       are discarded; 0x200 is half an ULP, 0x3ff just under a full ULP. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when it would leave the result bit (bit 10) odd. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned compare catches both zExp too large and zExp < 0. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: return infinity, or the maximal finite value when
               the rounding direction does not move away from zero. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: per-target choice of before/after rounding. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Apply the increment and drop the 10 extra precision bits. */
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact halfway case under nearest-even: clear the LSB (tie to even). */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Bring the leading one to bit 62; the exponent absorbs the shift. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4440 *----------------------------------------------------------------------------*/ 4441 4442 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4443 uint64_t *zSigPtr) 4444 { 4445 int8_t shiftCount; 4446 4447 shiftCount = clz64(aSig); 4448 *zSigPtr = aSig<<shiftCount; 4449 *zExpPtr = 1 - shiftCount; 4450 } 4451 4452 /*---------------------------------------------------------------------------- 4453 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4454 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4455 | and returns the proper extended double-precision floating-point value 4456 | corresponding to the abstract input. Ordinarily, the abstract value is 4457 | rounded and packed into the extended double-precision format, with the 4458 | inexact exception raised if the abstract input cannot be represented 4459 | exactly. However, if the abstract value is too large, the overflow and 4460 | inexact exceptions are raised and an infinity or maximal finite value is 4461 | returned. If the abstract value is too small, the input value is rounded to 4462 | a subnormal number, and the underflow and inexact exceptions are raised if 4463 | the abstract input cannot be represented exactly as a subnormal extended 4464 | double-precision floating-point number. 4465 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4466 | number of bits as single or double precision, respectively. Otherwise, the 4467 | result is rounded to the full precision of the extended double-precision 4468 | format. 4469 | The input significand must be normalized or smaller. If the input 4470 | significand is not normalized, `zExp' must be 0; in that case, the result 4471 | returned is a subnormal number, and it must not require rounding. The 4472 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4473 | Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced precision: round within zSig0 at the single- or
       double-precision bit position, selected by the mask below. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low word into zSig0's sticky (LSB) position. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare flags both overflow and zExp <= 0 at once. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess: per-target choice of before/after rounding. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Carry into the integer bit: no longer subnormal. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Halfway case under nearest-even: also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: renormalize by one. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 64-bit precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            /* Shared overflow exit (also reached from the reduced-precision
               path above): largest finite value for directed rounding toward
               zero, otherwise infinity. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            /* Denormalize; shifted-out bits are jammed into zSig1. */
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Re-derive the increment: zSig1 changed above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: force the LSB even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize by one exponent step. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, the whole significand is in zSig1. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Bring the leading one up to bit 63 of zSig0. */
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4708 *----------------------------------------------------------------------------*/ 4709 4710 static inline uint64_t extractFloat128Frac0( float128 a ) 4711 { 4712 4713 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4714 4715 } 4716 4717 /*---------------------------------------------------------------------------- 4718 | Returns the exponent bits of the quadruple-precision floating-point value 4719 | `a'. 4720 *----------------------------------------------------------------------------*/ 4721 4722 static inline int32_t extractFloat128Exp( float128 a ) 4723 { 4724 4725 return ( a.high>>48 ) & 0x7FFF; 4726 4727 } 4728 4729 /*---------------------------------------------------------------------------- 4730 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4731 *----------------------------------------------------------------------------*/ 4732 4733 static inline bool extractFloat128Sign(float128 a) 4734 { 4735 return a.high >> 63; 4736 } 4737 4738 /*---------------------------------------------------------------------------- 4739 | Normalizes the subnormal quadruple-precision floating-point value 4740 | represented by the denormalized significand formed by the concatenation of 4741 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4742 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4743 | significand are stored at the location pointed to by `zSig0Ptr', and the 4744 | least significant 64 bits of the normalized significand are stored at the 4745 | location pointed to by `zSig1Ptr'. 
4746 *----------------------------------------------------------------------------*/ 4747 4748 static void 4749 normalizeFloat128Subnormal( 4750 uint64_t aSig0, 4751 uint64_t aSig1, 4752 int32_t *zExpPtr, 4753 uint64_t *zSig0Ptr, 4754 uint64_t *zSig1Ptr 4755 ) 4756 { 4757 int8_t shiftCount; 4758 4759 if ( aSig0 == 0 ) { 4760 shiftCount = clz64(aSig1) - 15; 4761 if ( shiftCount < 0 ) { 4762 *zSig0Ptr = aSig1>>( - shiftCount ); 4763 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4764 } 4765 else { 4766 *zSig0Ptr = aSig1<<shiftCount; 4767 *zSig1Ptr = 0; 4768 } 4769 *zExpPtr = - shiftCount - 63; 4770 } 4771 else { 4772 shiftCount = clz64(aSig0) - 15; 4773 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4774 *zExpPtr = 1 - shiftCount; 4775 } 4776 4777 } 4778 4779 /*---------------------------------------------------------------------------- 4780 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4781 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4782 | floating-point value, returning the result. After being shifted into the 4783 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4784 | added together to form the most significant 32 bits of the result. This 4785 | means that any integer portion of `zSig0' will be added into the exponent. 4786 | Since a properly normalized significand will have an integer portion equal 4787 | to 1, the `zExp' input should be 1 less than the desired result exponent 4788 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4789 | significand. 
4790 *----------------------------------------------------------------------------*/ 4791 4792 static inline float128 4793 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4794 { 4795 float128 z; 4796 4797 z.low = zSig1; 4798 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4799 return z; 4800 } 4801 4802 /*---------------------------------------------------------------------------- 4803 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4804 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4805 | and `zSig2', and returns the proper quadruple-precision floating-point value 4806 | corresponding to the abstract input. Ordinarily, the abstract value is 4807 | simply rounded and packed into the quadruple-precision format, with the 4808 | inexact exception raised if the abstract input cannot be represented 4809 | exactly. However, if the abstract value is too large, the overflow and 4810 | inexact exceptions are raised and an infinity or maximal finite value is 4811 | returned. If the abstract value is too small, the input value is rounded to 4812 | a subnormal number, and the underflow and inexact exceptions are raised if 4813 | the abstract input cannot be represented exactly as a subnormal quadruple- 4814 | precision floating-point number. 4815 | The input significand must be normalized or smaller. If the input 4816 | significand is not normalized, `zExp' must be 0; in that case, the result 4817 | returned is a subnormal number, and it must not require rounding. In the 4818 | usual case that the input significand is normalized, `zExp' must be 1 less 4819 | than the ``true'' floating-point exponent. The handling of underflow and 4820 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Decide whether the guard/sticky word zSig2 forces an increment of
       the 128-bit significand {zSig0, zSig1}. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when it would make the result LSB odd. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches both overflow and zExp < 0. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /* Overflow: maximal finite value for rounding directions that
               do not move away from zero (including round-to-odd),
               otherwise infinity. */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess: per-target choice of before/after rounding. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            /* Denormalize; shifted-out bits are jammed into zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Re-derive the increment: zSig1/zSig2 changed above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie under nearest-even: clear the LSB (tie to even). */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 4939 | to the abstract input. This routine is just like `roundAndPackFloat128' 4940 | except that the input significand has fewer bits and does not have to be 4941 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4942 | point exponent. 4943 *----------------------------------------------------------------------------*/ 4944 4945 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4946 uint64_t zSig0, uint64_t zSig1, 4947 float_status *status) 4948 { 4949 int8_t shiftCount; 4950 uint64_t zSig2; 4951 4952 if ( zSig0 == 0 ) { 4953 zSig0 = zSig1; 4954 zSig1 = 0; 4955 zExp -= 64; 4956 } 4957 shiftCount = clz64(zSig0) - 15; 4958 if ( 0 <= shiftCount ) { 4959 zSig2 = 0; 4960 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4961 } 4962 else { 4963 shift128ExtraRightJamming( 4964 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4965 } 4966 zExp -= shiftCount; 4967 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4968 4969 } 4970 4971 4972 /*---------------------------------------------------------------------------- 4973 | Returns the result of converting the 32-bit two's complement integer `a' 4974 | to the extended double-precision floating-point format. The conversion 4975 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4976 | Arithmetic. 4977 *----------------------------------------------------------------------------*/ 4978 4979 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4980 { 4981 bool zSign; 4982 uint32_t absA; 4983 int8_t shiftCount; 4984 uint64_t zSig; 4985 4986 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4987 zSign = ( a < 0 ); 4988 absA = zSign ? 
- a : a; 4989 shiftCount = clz32(absA) + 32; 4990 zSig = absA; 4991 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4992 4993 } 4994 4995 /*---------------------------------------------------------------------------- 4996 | Returns the result of converting the 32-bit two's complement integer `a' to 4997 | the quadruple-precision floating-point format. The conversion is performed 4998 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4999 *----------------------------------------------------------------------------*/ 5000 5001 float128 int32_to_float128(int32_t a, float_status *status) 5002 { 5003 bool zSign; 5004 uint32_t absA; 5005 int8_t shiftCount; 5006 uint64_t zSig0; 5007 5008 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5009 zSign = ( a < 0 ); 5010 absA = zSign ? - a : a; 5011 shiftCount = clz32(absA) + 17; 5012 zSig0 = absA; 5013 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5014 5015 } 5016 5017 /*---------------------------------------------------------------------------- 5018 | Returns the result of converting the 64-bit two's complement integer `a' 5019 | to the extended double-precision floating-point format. The conversion 5020 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5021 | Arithmetic. 5022 *----------------------------------------------------------------------------*/ 5023 5024 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5025 { 5026 bool zSign; 5027 uint64_t absA; 5028 int8_t shiftCount; 5029 5030 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5031 zSign = ( a < 0 ); 5032 absA = zSign ? 
- a : a; 5033 shiftCount = clz64(absA); 5034 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5035 5036 } 5037 5038 /*---------------------------------------------------------------------------- 5039 | Returns the result of converting the 64-bit two's complement integer `a' to 5040 | the quadruple-precision floating-point format. The conversion is performed 5041 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5042 *----------------------------------------------------------------------------*/ 5043 5044 float128 int64_to_float128(int64_t a, float_status *status) 5045 { 5046 bool zSign; 5047 uint64_t absA; 5048 int8_t shiftCount; 5049 int32_t zExp; 5050 uint64_t zSig0, zSig1; 5051 5052 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5053 zSign = ( a < 0 ); 5054 absA = zSign ? - a : a; 5055 shiftCount = clz64(absA) + 49; 5056 zExp = 0x406E - shiftCount; 5057 if ( 64 <= shiftCount ) { 5058 zSig1 = 0; 5059 zSig0 = absA; 5060 shiftCount -= 64; 5061 } 5062 else { 5063 zSig1 = absA; 5064 zSig0 = 0; 5065 } 5066 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5067 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5068 5069 } 5070 5071 /*---------------------------------------------------------------------------- 5072 | Returns the result of converting the 64-bit unsigned integer `a' 5073 | to the quadruple-precision floating-point format. The conversion is performed 5074 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5075 *----------------------------------------------------------------------------*/ 5076 5077 float128 uint64_to_float128(uint64_t a, float_status *status) 5078 { 5079 if (a == 0) { 5080 return float128_zero; 5081 } 5082 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 5083 } 5084 5085 /*---------------------------------------------------------------------------- 5086 | Returns the result of converting the single-precision floating-point value 5087 | `a' to the extended double-precision floating-point format. The conversion 5088 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5089 | Arithmetic. 5090 *----------------------------------------------------------------------------*/ 5091 5092 floatx80 float32_to_floatx80(float32 a, float_status *status) 5093 { 5094 bool aSign; 5095 int aExp; 5096 uint32_t aSig; 5097 5098 a = float32_squash_input_denormal(a, status); 5099 aSig = extractFloat32Frac( a ); 5100 aExp = extractFloat32Exp( a ); 5101 aSign = extractFloat32Sign( a ); 5102 if ( aExp == 0xFF ) { 5103 if (aSig) { 5104 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 5105 status); 5106 return floatx80_silence_nan(res, status); 5107 } 5108 return packFloatx80(aSign, 5109 floatx80_infinity_high, 5110 floatx80_infinity_low); 5111 } 5112 if ( aExp == 0 ) { 5113 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5114 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5115 } 5116 aSig |= 0x00800000; 5117 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 5118 5119 } 5120 5121 /*---------------------------------------------------------------------------- 5122 | Returns the result of converting the single-precision floating-point value 5123 | `a' to the double-precision floating-point format. The conversion is 5124 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5125 | Arithmetic. 
5126 *----------------------------------------------------------------------------*/ 5127 5128 float128 float32_to_float128(float32 a, float_status *status) 5129 { 5130 bool aSign; 5131 int aExp; 5132 uint32_t aSig; 5133 5134 a = float32_squash_input_denormal(a, status); 5135 aSig = extractFloat32Frac( a ); 5136 aExp = extractFloat32Exp( a ); 5137 aSign = extractFloat32Sign( a ); 5138 if ( aExp == 0xFF ) { 5139 if (aSig) { 5140 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 5141 } 5142 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5143 } 5144 if ( aExp == 0 ) { 5145 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5146 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5147 --aExp; 5148 } 5149 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 5150 5151 } 5152 5153 /*---------------------------------------------------------------------------- 5154 | Returns the remainder of the single-precision floating-point value `a' 5155 | with respect to the corresponding value `b'. The operation is performed 5156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* a is NaN or infinity: rem(inf, x) and rem with any NaN are invalid
       unless a NaN simply propagates. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* b infinite (and a finite): the remainder is a itself. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* rem(x, 0) is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the hidden integer bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 64/32-bit division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| is far smaller than |b|: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively, 62 quotient bits
           at a time, using an estimated 128/64-bit division. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Under-estimate the quotient so the partial remainder stays
               non-negative. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Find the remainder nearest to zero: step past zero once, then pick
       whichever of the two candidates is closer (ties go to even q). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1.  -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2.  -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
5273 *----------------------------------------------------------------------------*/ 5274 5275 static const float64 float32_exp2_coefficients[15] = 5276 { 5277 const_float64( 0x3ff0000000000000ll ), /* 1 */ 5278 const_float64( 0x3fe0000000000000ll ), /* 2 */ 5279 const_float64( 0x3fc5555555555555ll ), /* 3 */ 5280 const_float64( 0x3fa5555555555555ll ), /* 4 */ 5281 const_float64( 0x3f81111111111111ll ), /* 5 */ 5282 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 5283 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 5284 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 5285 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 5286 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 5287 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 5288 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 5289 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 5290 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 5291 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 5292 }; 5293 5294 float32 float32_exp2(float32 a, float_status *status) 5295 { 5296 bool aSign; 5297 int aExp; 5298 uint32_t aSig; 5299 float64 r, x, xn; 5300 int i; 5301 a = float32_squash_input_denormal(a, status); 5302 5303 aSig = extractFloat32Frac( a ); 5304 aExp = extractFloat32Exp( a ); 5305 aSign = extractFloat32Sign( a ); 5306 5307 if ( aExp == 0xFF) { 5308 if (aSig) { 5309 return propagateFloat32NaN(a, float32_zero, status); 5310 } 5311 return (aSign) ? 
float32_zero : a; 5312 } 5313 if (aExp == 0) { 5314 if (aSig == 0) return float32_one; 5315 } 5316 5317 float_raise(float_flag_inexact, status); 5318 5319 /* ******************************* */ 5320 /* using float64 for approximation */ 5321 /* ******************************* */ 5322 x = float32_to_float64(a, status); 5323 x = float64_mul(x, float64_ln2, status); 5324 5325 xn = x; 5326 r = float64_one; 5327 for (i = 0 ; i < 15 ; i++) { 5328 float64 f; 5329 5330 f = float64_mul(xn, float32_exp2_coefficients[i], status); 5331 r = float64_add(r, f, status); 5332 5333 xn = float64_mul(xn, x, status); 5334 } 5335 5336 return float64_to_float32(r, status); 5337 } 5338 5339 /*---------------------------------------------------------------------------- 5340 | Returns the binary log of the single-precision floating-point value `a'. 5341 | The operation is performed according to the IEC/IEEE Standard for Binary 5342 | Floating-Point Arithmetic. 5343 *----------------------------------------------------------------------------*/ 5344 float32 float32_log2(float32 a, float_status *status) 5345 { 5346 bool aSign, zSign; 5347 int aExp; 5348 uint32_t aSig, zSig, i; 5349 5350 a = float32_squash_input_denormal(a, status); 5351 aSig = extractFloat32Frac( a ); 5352 aExp = extractFloat32Exp( a ); 5353 aSign = extractFloat32Sign( a ); 5354 5355 if ( aExp == 0 ) { 5356 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 5357 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5358 } 5359 if ( aSign ) { 5360 float_raise(float_flag_invalid, status); 5361 return float32_default_nan(status); 5362 } 5363 if ( aExp == 0xFF ) { 5364 if (aSig) { 5365 return propagateFloat32NaN(a, float32_zero, status); 5366 } 5367 return a; 5368 } 5369 5370 aExp -= 0x7F; 5371 aSig |= 0x00800000; 5372 zSign = aExp < 0; 5373 zSig = aExp << 23; 5374 5375 for (i = 1 << 22; i > 0; i >>= 1) { 5376 aSig = ( (uint64_t)aSig * aSig ) >> 23; 5377 if ( aSig & 0x01000000 ) { 5378 aSig >>= 1; 5379 zSig |= i; 5380 } 5381 } 5382 
5383 if ( zSign ) 5384 zSig = -zSig; 5385 5386 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 5387 } 5388 5389 /*---------------------------------------------------------------------------- 5390 | Returns the result of converting the double-precision floating-point value 5391 | `a' to the extended double-precision floating-point format. The conversion 5392 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5393 | Arithmetic. 5394 *----------------------------------------------------------------------------*/ 5395 5396 floatx80 float64_to_floatx80(float64 a, float_status *status) 5397 { 5398 bool aSign; 5399 int aExp; 5400 uint64_t aSig; 5401 5402 a = float64_squash_input_denormal(a, status); 5403 aSig = extractFloat64Frac( a ); 5404 aExp = extractFloat64Exp( a ); 5405 aSign = extractFloat64Sign( a ); 5406 if ( aExp == 0x7FF ) { 5407 if (aSig) { 5408 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 5409 status); 5410 return floatx80_silence_nan(res, status); 5411 } 5412 return packFloatx80(aSign, 5413 floatx80_infinity_high, 5414 floatx80_infinity_low); 5415 } 5416 if ( aExp == 0 ) { 5417 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5418 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5419 } 5420 return 5421 packFloatx80( 5422 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5423 5424 } 5425 5426 /*---------------------------------------------------------------------------- 5427 | Returns the result of converting the double-precision floating-point value 5428 | `a' to the quadruple-precision floating-point format. The conversion is 5429 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5430 | Arithmetic. 
5431 *----------------------------------------------------------------------------*/ 5432 5433 float128 float64_to_float128(float64 a, float_status *status) 5434 { 5435 bool aSign; 5436 int aExp; 5437 uint64_t aSig, zSig0, zSig1; 5438 5439 a = float64_squash_input_denormal(a, status); 5440 aSig = extractFloat64Frac( a ); 5441 aExp = extractFloat64Exp( a ); 5442 aSign = extractFloat64Sign( a ); 5443 if ( aExp == 0x7FF ) { 5444 if (aSig) { 5445 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5446 } 5447 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5448 } 5449 if ( aExp == 0 ) { 5450 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5451 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5452 --aExp; 5453 } 5454 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5455 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5456 5457 } 5458 5459 5460 /*---------------------------------------------------------------------------- 5461 | Returns the remainder of the double-precision floating-point value `a' 5462 | with respect to the corresponding value `b'. The operation is performed 5463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5464 *----------------------------------------------------------------------------*/ 5465 5466 float64 float64_rem(float64 a, float64 b, float_status *status) 5467 { 5468 bool aSign, zSign; 5469 int aExp, bExp, expDiff; 5470 uint64_t aSig, bSig; 5471 uint64_t q, alternateASig; 5472 int64_t sigMean; 5473 5474 a = float64_squash_input_denormal(a, status); 5475 b = float64_squash_input_denormal(b, status); 5476 aSig = extractFloat64Frac( a ); 5477 aExp = extractFloat64Exp( a ); 5478 aSign = extractFloat64Sign( a ); 5479 bSig = extractFloat64Frac( b ); 5480 bExp = extractFloat64Exp( b ); 5481 if ( aExp == 0x7FF ) { 5482 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5483 return propagateFloat64NaN(a, b, status); 5484 } 5485 float_raise(float_flag_invalid, status); 5486 return float64_default_nan(status); 5487 } 5488 if ( bExp == 0x7FF ) { 5489 if (bSig) { 5490 return propagateFloat64NaN(a, b, status); 5491 } 5492 return a; 5493 } 5494 if ( bExp == 0 ) { 5495 if ( bSig == 0 ) { 5496 float_raise(float_flag_invalid, status); 5497 return float64_default_nan(status); 5498 } 5499 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5500 } 5501 if ( aExp == 0 ) { 5502 if ( aSig == 0 ) return a; 5503 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5504 } 5505 expDiff = aExp - bExp; 5506 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5507 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5508 if ( expDiff < 0 ) { 5509 if ( expDiff < -1 ) return a; 5510 aSig >>= 1; 5511 } 5512 q = ( bSig <= aSig ); 5513 if ( q ) aSig -= bSig; 5514 expDiff -= 64; 5515 while ( 0 < expDiff ) { 5516 q = estimateDiv128To64( aSig, 0, bSig ); 5517 q = ( 2 < q ) ? q - 2 : 0; 5518 aSig = - ( ( bSig>>2 ) * q ); 5519 expDiff -= 62; 5520 } 5521 expDiff += 64; 5522 if ( 0 < expDiff ) { 5523 q = estimateDiv128To64( aSig, 0, bSig ); 5524 q = ( 2 < q ) ? 
q - 2 : 0; 5525 q >>= 64 - expDiff; 5526 bSig >>= 2; 5527 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5528 } 5529 else { 5530 aSig >>= 2; 5531 bSig >>= 2; 5532 } 5533 do { 5534 alternateASig = aSig; 5535 ++q; 5536 aSig -= bSig; 5537 } while ( 0 <= (int64_t) aSig ); 5538 sigMean = aSig + alternateASig; 5539 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5540 aSig = alternateASig; 5541 } 5542 zSign = ( (int64_t) aSig < 0 ); 5543 if ( zSign ) aSig = - aSig; 5544 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5545 5546 } 5547 5548 /*---------------------------------------------------------------------------- 5549 | Returns the binary log of the double-precision floating-point value `a'. 5550 | The operation is performed according to the IEC/IEEE Standard for Binary 5551 | Floating-Point Arithmetic. 5552 *----------------------------------------------------------------------------*/ 5553 float64 float64_log2(float64 a, float_status *status) 5554 { 5555 bool aSign, zSign; 5556 int aExp; 5557 uint64_t aSig, aSig0, aSig1, zSig, i; 5558 a = float64_squash_input_denormal(a, status); 5559 5560 aSig = extractFloat64Frac( a ); 5561 aExp = extractFloat64Exp( a ); 5562 aSign = extractFloat64Sign( a ); 5563 5564 if ( aExp == 0 ) { 5565 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5566 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5567 } 5568 if ( aSign ) { 5569 float_raise(float_flag_invalid, status); 5570 return float64_default_nan(status); 5571 } 5572 if ( aExp == 0x7FF ) { 5573 if (aSig) { 5574 return propagateFloat64NaN(a, float64_zero, status); 5575 } 5576 return a; 5577 } 5578 5579 aExp -= 0x3FF; 5580 aSig |= UINT64_C(0x0010000000000000); 5581 zSign = aExp < 0; 5582 zSig = (uint64_t)aExp << 52; 5583 for (i = 1LL << 51; i > 0; i >>= 1) { 5584 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5585 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5586 if ( aSig & UINT64_C(0x0020000000000000) ) { 5587 aSig >>= 1; 5588 zSig |= i; 5589 } 
5590 } 5591 5592 if ( zSign ) 5593 zSig = -zSig; 5594 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5595 } 5596 5597 /*---------------------------------------------------------------------------- 5598 | Returns the result of converting the extended double-precision floating- 5599 | point value `a' to the 32-bit two's complement integer format. The 5600 | conversion is performed according to the IEC/IEEE Standard for Binary 5601 | Floating-Point Arithmetic---which means in particular that the conversion 5602 | is rounded according to the current rounding mode. If `a' is a NaN, the 5603 | largest positive integer is returned. Otherwise, if the conversion 5604 | overflows, the largest integer with the same sign as `a' is returned. 5605 *----------------------------------------------------------------------------*/ 5606 5607 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5608 { 5609 bool aSign; 5610 int32_t aExp, shiftCount; 5611 uint64_t aSig; 5612 5613 if (floatx80_invalid_encoding(a)) { 5614 float_raise(float_flag_invalid, status); 5615 return 1 << 31; 5616 } 5617 aSig = extractFloatx80Frac( a ); 5618 aExp = extractFloatx80Exp( a ); 5619 aSign = extractFloatx80Sign( a ); 5620 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5621 shiftCount = 0x4037 - aExp; 5622 if ( shiftCount <= 0 ) shiftCount = 1; 5623 shift64RightJamming( aSig, shiftCount, &aSig ); 5624 return roundAndPackInt32(aSign, aSig, status); 5625 5626 } 5627 5628 /*---------------------------------------------------------------------------- 5629 | Returns the result of converting the extended double-precision floating- 5630 | point value `a' to the 32-bit two's complement integer format. The 5631 | conversion is performed according to the IEC/IEEE Standard for Binary 5632 | Floating-Point Arithmetic, except that the conversion is always rounded 5633 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5634 | Otherwise, if the conversion overflows, the largest integer with the same 5635 | sign as `a' is returned. 5636 *----------------------------------------------------------------------------*/ 5637 5638 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5639 { 5640 bool aSign; 5641 int32_t aExp, shiftCount; 5642 uint64_t aSig, savedASig; 5643 int32_t z; 5644 5645 if (floatx80_invalid_encoding(a)) { 5646 float_raise(float_flag_invalid, status); 5647 return 1 << 31; 5648 } 5649 aSig = extractFloatx80Frac( a ); 5650 aExp = extractFloatx80Exp( a ); 5651 aSign = extractFloatx80Sign( a ); 5652 if ( 0x401E < aExp ) { 5653 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5654 goto invalid; 5655 } 5656 else if ( aExp < 0x3FFF ) { 5657 if (aExp || aSig) { 5658 float_raise(float_flag_inexact, status); 5659 } 5660 return 0; 5661 } 5662 shiftCount = 0x403E - aExp; 5663 savedASig = aSig; 5664 aSig >>= shiftCount; 5665 z = aSig; 5666 if ( aSign ) z = - z; 5667 if ( ( z < 0 ) ^ aSign ) { 5668 invalid: 5669 float_raise(float_flag_invalid, status); 5670 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5671 } 5672 if ( ( aSig<<shiftCount ) != savedASig ) { 5673 float_raise(float_flag_inexact, status); 5674 } 5675 return z; 5676 5677 } 5678 5679 /*---------------------------------------------------------------------------- 5680 | Returns the result of converting the extended double-precision floating- 5681 | point value `a' to the 64-bit two's complement integer format. The 5682 | conversion is performed according to the IEC/IEEE Standard for Binary 5683 | Floating-Point Arithmetic---which means in particular that the conversion 5684 | is rounded according to the current rounding mode. If `a' is a NaN, 5685 | the largest positive integer is returned. Otherwise, if the conversion 5686 | overflows, the largest integer with the same sign as `a' is returned. 
5687 *----------------------------------------------------------------------------*/ 5688 5689 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5690 { 5691 bool aSign; 5692 int32_t aExp, shiftCount; 5693 uint64_t aSig, aSigExtra; 5694 5695 if (floatx80_invalid_encoding(a)) { 5696 float_raise(float_flag_invalid, status); 5697 return 1ULL << 63; 5698 } 5699 aSig = extractFloatx80Frac( a ); 5700 aExp = extractFloatx80Exp( a ); 5701 aSign = extractFloatx80Sign( a ); 5702 shiftCount = 0x403E - aExp; 5703 if ( shiftCount <= 0 ) { 5704 if ( shiftCount ) { 5705 float_raise(float_flag_invalid, status); 5706 if (!aSign || floatx80_is_any_nan(a)) { 5707 return INT64_MAX; 5708 } 5709 return INT64_MIN; 5710 } 5711 aSigExtra = 0; 5712 } 5713 else { 5714 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5715 } 5716 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5717 5718 } 5719 5720 /*---------------------------------------------------------------------------- 5721 | Returns the result of converting the extended double-precision floating- 5722 | point value `a' to the 64-bit two's complement integer format. The 5723 | conversion is performed according to the IEC/IEEE Standard for Binary 5724 | Floating-Point Arithmetic, except that the conversion is always rounded 5725 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5726 | Otherwise, if the conversion overflows, the largest integer with the same 5727 | sign as `a' is returned. 
5728 *----------------------------------------------------------------------------*/ 5729 5730 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5731 { 5732 bool aSign; 5733 int32_t aExp, shiftCount; 5734 uint64_t aSig; 5735 int64_t z; 5736 5737 if (floatx80_invalid_encoding(a)) { 5738 float_raise(float_flag_invalid, status); 5739 return 1ULL << 63; 5740 } 5741 aSig = extractFloatx80Frac( a ); 5742 aExp = extractFloatx80Exp( a ); 5743 aSign = extractFloatx80Sign( a ); 5744 shiftCount = aExp - 0x403E; 5745 if ( 0 <= shiftCount ) { 5746 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5747 if ( ( a.high != 0xC03E ) || aSig ) { 5748 float_raise(float_flag_invalid, status); 5749 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5750 return INT64_MAX; 5751 } 5752 } 5753 return INT64_MIN; 5754 } 5755 else if ( aExp < 0x3FFF ) { 5756 if (aExp | aSig) { 5757 float_raise(float_flag_inexact, status); 5758 } 5759 return 0; 5760 } 5761 z = aSig>>( - shiftCount ); 5762 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5763 float_raise(float_flag_inexact, status); 5764 } 5765 if ( aSign ) z = - z; 5766 return z; 5767 5768 } 5769 5770 /*---------------------------------------------------------------------------- 5771 | Returns the result of converting the extended double-precision floating- 5772 | point value `a' to the single-precision floating-point format. The 5773 | conversion is performed according to the IEC/IEEE Standard for Binary 5774 | Floating-Point Arithmetic. 
5775 *----------------------------------------------------------------------------*/ 5776 5777 float32 floatx80_to_float32(floatx80 a, float_status *status) 5778 { 5779 bool aSign; 5780 int32_t aExp; 5781 uint64_t aSig; 5782 5783 if (floatx80_invalid_encoding(a)) { 5784 float_raise(float_flag_invalid, status); 5785 return float32_default_nan(status); 5786 } 5787 aSig = extractFloatx80Frac( a ); 5788 aExp = extractFloatx80Exp( a ); 5789 aSign = extractFloatx80Sign( a ); 5790 if ( aExp == 0x7FFF ) { 5791 if ( (uint64_t) ( aSig<<1 ) ) { 5792 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5793 status); 5794 return float32_silence_nan(res, status); 5795 } 5796 return packFloat32( aSign, 0xFF, 0 ); 5797 } 5798 shift64RightJamming( aSig, 33, &aSig ); 5799 if ( aExp || aSig ) aExp -= 0x3F81; 5800 return roundAndPackFloat32(aSign, aExp, aSig, status); 5801 5802 } 5803 5804 /*---------------------------------------------------------------------------- 5805 | Returns the result of converting the extended double-precision floating- 5806 | point value `a' to the double-precision floating-point format. The 5807 | conversion is performed according to the IEC/IEEE Standard for Binary 5808 | Floating-Point Arithmetic. 
5809 *----------------------------------------------------------------------------*/ 5810 5811 float64 floatx80_to_float64(floatx80 a, float_status *status) 5812 { 5813 bool aSign; 5814 int32_t aExp; 5815 uint64_t aSig, zSig; 5816 5817 if (floatx80_invalid_encoding(a)) { 5818 float_raise(float_flag_invalid, status); 5819 return float64_default_nan(status); 5820 } 5821 aSig = extractFloatx80Frac( a ); 5822 aExp = extractFloatx80Exp( a ); 5823 aSign = extractFloatx80Sign( a ); 5824 if ( aExp == 0x7FFF ) { 5825 if ( (uint64_t) ( aSig<<1 ) ) { 5826 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5827 status); 5828 return float64_silence_nan(res, status); 5829 } 5830 return packFloat64( aSign, 0x7FF, 0 ); 5831 } 5832 shift64RightJamming( aSig, 1, &zSig ); 5833 if ( aExp || aSig ) aExp -= 0x3C01; 5834 return roundAndPackFloat64(aSign, aExp, zSig, status); 5835 5836 } 5837 5838 /*---------------------------------------------------------------------------- 5839 | Returns the result of converting the extended double-precision floating- 5840 | point value `a' to the quadruple-precision floating-point format. The 5841 | conversion is performed according to the IEC/IEEE Standard for Binary 5842 | Floating-Point Arithmetic. 
5843 *----------------------------------------------------------------------------*/ 5844 5845 float128 floatx80_to_float128(floatx80 a, float_status *status) 5846 { 5847 bool aSign; 5848 int aExp; 5849 uint64_t aSig, zSig0, zSig1; 5850 5851 if (floatx80_invalid_encoding(a)) { 5852 float_raise(float_flag_invalid, status); 5853 return float128_default_nan(status); 5854 } 5855 aSig = extractFloatx80Frac( a ); 5856 aExp = extractFloatx80Exp( a ); 5857 aSign = extractFloatx80Sign( a ); 5858 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5859 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5860 status); 5861 return float128_silence_nan(res, status); 5862 } 5863 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5864 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5865 5866 } 5867 5868 /*---------------------------------------------------------------------------- 5869 | Rounds the extended double-precision floating-point value `a' 5870 | to the precision provided by floatx80_rounding_precision and returns the 5871 | result as an extended double-precision floating-point value. 5872 | The operation is performed according to the IEC/IEEE Standard for Binary 5873 | Floating-Point Arithmetic. 5874 *----------------------------------------------------------------------------*/ 5875 5876 floatx80 floatx80_round(floatx80 a, float_status *status) 5877 { 5878 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5879 extractFloatx80Sign(a), 5880 extractFloatx80Exp(a), 5881 extractFloatx80Frac(a), 0, status); 5882 } 5883 5884 /*---------------------------------------------------------------------------- 5885 | Rounds the extended double-precision floating-point value `a' to an integer, 5886 | and returns the result as an extended quadruple-precision floating-point 5887 | value. The operation is performed according to the IEC/IEEE Standard for 5888 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* |a| >= 2^63: already an integer (or NaN, which propagates). */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: the result is 0 or +/-1, decided by the rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Rounds to 1 only for |a| strictly greater than 1/2. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* Rounds to 1 for |a| >= 1/2. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* 1 <= |a| < 2^63: clear the fraction bits below the units place,
     * adjusting per the rounding mode first.
     */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* On a tie (round bits now zero), clear the last bit -> even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* A carry out of the significand bumps the exponent (z.high holds
     * sign+exponent, and z.low == 0 here only via carry wraparound).
     */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
| negated before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to it. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* A subnormal b has effective exponent 1, not 0. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to it. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit set means the sum did not carry out; no renormalization. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the 64-bit significand: shift right one place and
     * restore the explicit integer bit, bumping the exponent.
     */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
6073 *----------------------------------------------------------------------------*/ 6074 6075 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 6076 float_status *status) 6077 { 6078 int32_t aExp, bExp, zExp; 6079 uint64_t aSig, bSig, zSig0, zSig1; 6080 int32_t expDiff; 6081 6082 aSig = extractFloatx80Frac( a ); 6083 aExp = extractFloatx80Exp( a ); 6084 bSig = extractFloatx80Frac( b ); 6085 bExp = extractFloatx80Exp( b ); 6086 expDiff = aExp - bExp; 6087 if ( 0 < expDiff ) goto aExpBigger; 6088 if ( expDiff < 0 ) goto bExpBigger; 6089 if ( aExp == 0x7FFF ) { 6090 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6091 return propagateFloatx80NaN(a, b, status); 6092 } 6093 float_raise(float_flag_invalid, status); 6094 return floatx80_default_nan(status); 6095 } 6096 if ( aExp == 0 ) { 6097 aExp = 1; 6098 bExp = 1; 6099 } 6100 zSig1 = 0; 6101 if ( bSig < aSig ) goto aBigger; 6102 if ( aSig < bSig ) goto bBigger; 6103 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 6104 bExpBigger: 6105 if ( bExp == 0x7FFF ) { 6106 if ((uint64_t)(bSig << 1)) { 6107 return propagateFloatx80NaN(a, b, status); 6108 } 6109 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 6110 floatx80_infinity_low); 6111 } 6112 if ( aExp == 0 ) ++expDiff; 6113 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6114 bBigger: 6115 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 6116 zExp = bExp; 6117 zSign ^= 1; 6118 goto normalizeRoundAndPack; 6119 aExpBigger: 6120 if ( aExp == 0x7FFF ) { 6121 if ((uint64_t)(aSig << 1)) { 6122 return propagateFloatx80NaN(a, b, status); 6123 } 6124 return a; 6125 } 6126 if ( bExp == 0 ) --expDiff; 6127 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 6128 aBigger: 6129 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 6130 zExp = aExp; 6131 normalizeRoundAndPack: 6132 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6133 zSign, zExp, zSig0, zSig1, status); 6134 } 6135 6136 
/*---------------------------------------------------------------------------- 6137 | Returns the result of adding the extended double-precision floating-point 6138 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6139 | Standard for Binary Floating-Point Arithmetic. 6140 *----------------------------------------------------------------------------*/ 6141 6142 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6143 { 6144 bool aSign, bSign; 6145 6146 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6147 float_raise(float_flag_invalid, status); 6148 return floatx80_default_nan(status); 6149 } 6150 aSign = extractFloatx80Sign( a ); 6151 bSign = extractFloatx80Sign( b ); 6152 if ( aSign == bSign ) { 6153 return addFloatx80Sigs(a, b, aSign, status); 6154 } 6155 else { 6156 return subFloatx80Sigs(a, b, aSign, status); 6157 } 6158 6159 } 6160 6161 /*---------------------------------------------------------------------------- 6162 | Returns the result of subtracting the extended double-precision floating- 6163 | point values `a' and `b'. The operation is performed according to the 6164 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* a - b: like signs subtract magnitudes, unlike signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'. The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* NaNs propagate; Inf * 0 is invalid; Inf * finite = Inf. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product of two normalized significands is in [1, 4): shift left
     * once if the top bit is clear so zSig0 is normalized.
     */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6252 *----------------------------------------------------------------------------*/ 6253 6254 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6255 { 6256 bool aSign, bSign, zSign; 6257 int32_t aExp, bExp, zExp; 6258 uint64_t aSig, bSig, zSig0, zSig1; 6259 uint64_t rem0, rem1, rem2, term0, term1, term2; 6260 6261 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6262 float_raise(float_flag_invalid, status); 6263 return floatx80_default_nan(status); 6264 } 6265 aSig = extractFloatx80Frac( a ); 6266 aExp = extractFloatx80Exp( a ); 6267 aSign = extractFloatx80Sign( a ); 6268 bSig = extractFloatx80Frac( b ); 6269 bExp = extractFloatx80Exp( b ); 6270 bSign = extractFloatx80Sign( b ); 6271 zSign = aSign ^ bSign; 6272 if ( aExp == 0x7FFF ) { 6273 if ((uint64_t)(aSig << 1)) { 6274 return propagateFloatx80NaN(a, b, status); 6275 } 6276 if ( bExp == 0x7FFF ) { 6277 if ((uint64_t)(bSig << 1)) { 6278 return propagateFloatx80NaN(a, b, status); 6279 } 6280 goto invalid; 6281 } 6282 return packFloatx80(zSign, floatx80_infinity_high, 6283 floatx80_infinity_low); 6284 } 6285 if ( bExp == 0x7FFF ) { 6286 if ((uint64_t)(bSig << 1)) { 6287 return propagateFloatx80NaN(a, b, status); 6288 } 6289 return packFloatx80( zSign, 0, 0 ); 6290 } 6291 if ( bExp == 0 ) { 6292 if ( bSig == 0 ) { 6293 if ( ( aExp | aSig ) == 0 ) { 6294 invalid: 6295 float_raise(float_flag_invalid, status); 6296 return floatx80_default_nan(status); 6297 } 6298 float_raise(float_flag_divbyzero, status); 6299 return packFloatx80(zSign, floatx80_infinity_high, 6300 floatx80_infinity_low); 6301 } 6302 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6303 } 6304 if ( aExp == 0 ) { 6305 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6306 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6307 } 6308 zExp = aExp - bExp + 0x3FFE; 6309 rem1 = 0; 6310 if ( bSig <= aSig ) { 6311 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6312 ++zExp; 6313 } 6314 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 6315 mul64To128( bSig, zSig0, &term0, &term1 ); 6316 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6317 while ( (int64_t) rem0 < 0 ) { 6318 --zSig0; 6319 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6320 } 6321 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6322 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6323 mul64To128( bSig, zSig1, &term1, &term2 ); 6324 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6325 while ( (int64_t) rem1 < 0 ) { 6326 --zSig1; 6327 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6328 } 6329 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6330 } 6331 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6332 zSign, zExp, zSig0, zSig1, status); 6333 } 6334 6335 /*---------------------------------------------------------------------------- 6336 | Returns the remainder of the extended double-precision floating-point value 6337 | `a' with respect to the corresponding value `b'. The operation is performed 6338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic, 6339 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating 6340 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of 6341 | the absolute value of the integer quotient. 
6342 *----------------------------------------------------------------------------*/ 6343 6344 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient, 6345 float_status *status) 6346 { 6347 bool aSign, zSign; 6348 int32_t aExp, bExp, expDiff, aExpOrig; 6349 uint64_t aSig0, aSig1, bSig; 6350 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6351 6352 *quotient = 0; 6353 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6354 float_raise(float_flag_invalid, status); 6355 return floatx80_default_nan(status); 6356 } 6357 aSig0 = extractFloatx80Frac( a ); 6358 aExpOrig = aExp = extractFloatx80Exp( a ); 6359 aSign = extractFloatx80Sign( a ); 6360 bSig = extractFloatx80Frac( b ); 6361 bExp = extractFloatx80Exp( b ); 6362 if ( aExp == 0x7FFF ) { 6363 if ( (uint64_t) ( aSig0<<1 ) 6364 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6365 return propagateFloatx80NaN(a, b, status); 6366 } 6367 goto invalid; 6368 } 6369 if ( bExp == 0x7FFF ) { 6370 if ((uint64_t)(bSig << 1)) { 6371 return propagateFloatx80NaN(a, b, status); 6372 } 6373 if (aExp == 0 && aSig0 >> 63) { 6374 /* 6375 * Pseudo-denormal argument must be returned in normalized 6376 * form. 6377 */ 6378 return packFloatx80(aSign, 1, aSig0); 6379 } 6380 return a; 6381 } 6382 if ( bExp == 0 ) { 6383 if ( bSig == 0 ) { 6384 invalid: 6385 float_raise(float_flag_invalid, status); 6386 return floatx80_default_nan(status); 6387 } 6388 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6389 } 6390 if ( aExp == 0 ) { 6391 if ( aSig0 == 0 ) return a; 6392 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6393 } 6394 zSign = aSign; 6395 expDiff = aExp - bExp; 6396 aSig1 = 0; 6397 if ( expDiff < 0 ) { 6398 if ( mod || expDiff < -1 ) { 6399 if (aExp == 1 && aExpOrig == 0) { 6400 /* 6401 * Pseudo-denormal argument must be returned in 6402 * normalized form. 
6403 */ 6404 return packFloatx80(aSign, aExp, aSig0); 6405 } 6406 return a; 6407 } 6408 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6409 expDiff = 0; 6410 } 6411 *quotient = q = ( bSig <= aSig0 ); 6412 if ( q ) aSig0 -= bSig; 6413 expDiff -= 64; 6414 while ( 0 < expDiff ) { 6415 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6416 q = ( 2 < q ) ? q - 2 : 0; 6417 mul64To128( bSig, q, &term0, &term1 ); 6418 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6419 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6420 expDiff -= 62; 6421 *quotient <<= 62; 6422 *quotient += q; 6423 } 6424 expDiff += 64; 6425 if ( 0 < expDiff ) { 6426 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6427 q = ( 2 < q ) ? q - 2 : 0; 6428 q >>= 64 - expDiff; 6429 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6430 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6431 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6432 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6433 ++q; 6434 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6435 } 6436 if (expDiff < 64) { 6437 *quotient <<= expDiff; 6438 } else { 6439 *quotient = 0; 6440 } 6441 *quotient += q; 6442 } 6443 else { 6444 term1 = 0; 6445 term0 = bSig; 6446 } 6447 if (!mod) { 6448 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6449 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6450 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6451 && ( q & 1 ) ) 6452 ) { 6453 aSig0 = alternateASig0; 6454 aSig1 = alternateASig1; 6455 zSign = ! zSign; 6456 ++*quotient; 6457 } 6458 } 6459 return 6460 normalizeRoundAndPackFloatx80( 6461 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6462 6463 } 6464 6465 /*---------------------------------------------------------------------------- 6466 | Returns the remainder of the extended double-precision floating-point value 6467 | `a' with respect to the corresponding value `b'. 
The operation is performed 6468 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6469 *----------------------------------------------------------------------------*/ 6470 6471 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6472 { 6473 uint64_t quotient; 6474 return floatx80_modrem(a, b, false, "ient, status); 6475 } 6476 6477 /*---------------------------------------------------------------------------- 6478 | Returns the remainder of the extended double-precision floating-point value 6479 | `a' with respect to the corresponding value `b', with the quotient truncated 6480 | toward zero. 6481 *----------------------------------------------------------------------------*/ 6482 6483 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6484 { 6485 uint64_t quotient; 6486 return floatx80_modrem(a, b, true, "ient, status); 6487 } 6488 6489 /*---------------------------------------------------------------------------- 6490 | Returns the square root of the extended double-precision floating-point 6491 | value `a'. The operation is performed according to the IEC/IEEE Standard 6492 | for Binary Floating-Point Arithmetic. 6493 *----------------------------------------------------------------------------*/ 6494 6495 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6496 { 6497 bool aSign; 6498 int32_t aExp, zExp; 6499 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6500 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6501 6502 if (floatx80_invalid_encoding(a)) { 6503 float_raise(float_flag_invalid, status); 6504 return floatx80_default_nan(status); 6505 } 6506 aSig0 = extractFloatx80Frac( a ); 6507 aExp = extractFloatx80Exp( a ); 6508 aSign = extractFloatx80Sign( a ); 6509 if ( aExp == 0x7FFF ) { 6510 if ((uint64_t)(aSig0 << 1)) { 6511 return propagateFloatx80NaN(a, a, status); 6512 } 6513 if ( ! 
aSign ) return a; 6514 goto invalid; 6515 } 6516 if ( aSign ) { 6517 if ( ( aExp | aSig0 ) == 0 ) return a; 6518 invalid: 6519 float_raise(float_flag_invalid, status); 6520 return floatx80_default_nan(status); 6521 } 6522 if ( aExp == 0 ) { 6523 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6524 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6525 } 6526 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6527 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6528 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6529 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6530 doubleZSig0 = zSig0<<1; 6531 mul64To128( zSig0, zSig0, &term0, &term1 ); 6532 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6533 while ( (int64_t) rem0 < 0 ) { 6534 --zSig0; 6535 doubleZSig0 -= 2; 6536 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6537 } 6538 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6539 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6540 if ( zSig1 == 0 ) zSig1 = 1; 6541 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6542 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6543 mul64To128( zSig1, zSig1, &term2, &term3 ); 6544 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6545 while ( (int64_t) rem1 < 0 ) { 6546 --zSig1; 6547 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6548 term3 |= 1; 6549 term2 |= doubleZSig0; 6550 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6551 } 6552 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6553 } 6554 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6555 zSig0 |= doubleZSig0; 6556 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6557 0, zExp, zSig0, zSig1, status); 6558 } 6559 6560 /*---------------------------------------------------------------------------- 6561 | Returns the result of converting the quadruple-precision floating-point 6562 | value `a' to the 32-bit two's complement integer format. 
The conversion 6563 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6564 | Arithmetic---which means in particular that the conversion is rounded 6565 | according to the current rounding mode. If `a' is a NaN, the largest 6566 | positive integer is returned. Otherwise, if the conversion overflows, the 6567 | largest integer with the same sign as `a' is returned. 6568 *----------------------------------------------------------------------------*/ 6569 6570 int32_t float128_to_int32(float128 a, float_status *status) 6571 { 6572 bool aSign; 6573 int32_t aExp, shiftCount; 6574 uint64_t aSig0, aSig1; 6575 6576 aSig1 = extractFloat128Frac1( a ); 6577 aSig0 = extractFloat128Frac0( a ); 6578 aExp = extractFloat128Exp( a ); 6579 aSign = extractFloat128Sign( a ); 6580 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6581 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6582 aSig0 |= ( aSig1 != 0 ); 6583 shiftCount = 0x4028 - aExp; 6584 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6585 return roundAndPackInt32(aSign, aSig0, status); 6586 6587 } 6588 6589 /*---------------------------------------------------------------------------- 6590 | Returns the result of converting the quadruple-precision floating-point 6591 | value `a' to the 32-bit two's complement integer format. The conversion 6592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6593 | Arithmetic, except that the conversion is always rounded toward zero. If 6594 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6595 | conversion overflows, the largest integer with the same sign as `a' is 6596 | returned. 
6597 *----------------------------------------------------------------------------*/ 6598 6599 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6600 { 6601 bool aSign; 6602 int32_t aExp, shiftCount; 6603 uint64_t aSig0, aSig1, savedASig; 6604 int32_t z; 6605 6606 aSig1 = extractFloat128Frac1( a ); 6607 aSig0 = extractFloat128Frac0( a ); 6608 aExp = extractFloat128Exp( a ); 6609 aSign = extractFloat128Sign( a ); 6610 aSig0 |= ( aSig1 != 0 ); 6611 if ( 0x401E < aExp ) { 6612 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6613 goto invalid; 6614 } 6615 else if ( aExp < 0x3FFF ) { 6616 if (aExp || aSig0) { 6617 float_raise(float_flag_inexact, status); 6618 } 6619 return 0; 6620 } 6621 aSig0 |= UINT64_C(0x0001000000000000); 6622 shiftCount = 0x402F - aExp; 6623 savedASig = aSig0; 6624 aSig0 >>= shiftCount; 6625 z = aSig0; 6626 if ( aSign ) z = - z; 6627 if ( ( z < 0 ) ^ aSign ) { 6628 invalid: 6629 float_raise(float_flag_invalid, status); 6630 return aSign ? INT32_MIN : INT32_MAX; 6631 } 6632 if ( ( aSig0<<shiftCount ) != savedASig ) { 6633 float_raise(float_flag_inexact, status); 6634 } 6635 return z; 6636 6637 } 6638 6639 /*---------------------------------------------------------------------------- 6640 | Returns the result of converting the quadruple-precision floating-point 6641 | value `a' to the 64-bit two's complement integer format. The conversion 6642 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6643 | Arithmetic---which means in particular that the conversion is rounded 6644 | according to the current rounding mode. If `a' is a NaN, the largest 6645 | positive integer is returned. Otherwise, if the conversion overflows, the 6646 | largest integer with the same sign as `a' is returned. 
6647 *----------------------------------------------------------------------------*/ 6648 6649 int64_t float128_to_int64(float128 a, float_status *status) 6650 { 6651 bool aSign; 6652 int32_t aExp, shiftCount; 6653 uint64_t aSig0, aSig1; 6654 6655 aSig1 = extractFloat128Frac1( a ); 6656 aSig0 = extractFloat128Frac0( a ); 6657 aExp = extractFloat128Exp( a ); 6658 aSign = extractFloat128Sign( a ); 6659 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6660 shiftCount = 0x402F - aExp; 6661 if ( shiftCount <= 0 ) { 6662 if ( 0x403E < aExp ) { 6663 float_raise(float_flag_invalid, status); 6664 if ( ! aSign 6665 || ( ( aExp == 0x7FFF ) 6666 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6667 ) 6668 ) { 6669 return INT64_MAX; 6670 } 6671 return INT64_MIN; 6672 } 6673 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6674 } 6675 else { 6676 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6677 } 6678 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6679 6680 } 6681 6682 /*---------------------------------------------------------------------------- 6683 | Returns the result of converting the quadruple-precision floating-point 6684 | value `a' to the 64-bit two's complement integer format. The conversion 6685 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6686 | Arithmetic, except that the conversion is always rounded toward zero. 6687 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6688 | the conversion overflows, the largest integer with the same sign as `a' is 6689 | returned. 
6690 *----------------------------------------------------------------------------*/ 6691 6692 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6693 { 6694 bool aSign; 6695 int32_t aExp, shiftCount; 6696 uint64_t aSig0, aSig1; 6697 int64_t z; 6698 6699 aSig1 = extractFloat128Frac1( a ); 6700 aSig0 = extractFloat128Frac0( a ); 6701 aExp = extractFloat128Exp( a ); 6702 aSign = extractFloat128Sign( a ); 6703 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6704 shiftCount = aExp - 0x402F; 6705 if ( 0 < shiftCount ) { 6706 if ( 0x403E <= aExp ) { 6707 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6708 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6709 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6710 if (aSig1) { 6711 float_raise(float_flag_inexact, status); 6712 } 6713 } 6714 else { 6715 float_raise(float_flag_invalid, status); 6716 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6717 return INT64_MAX; 6718 } 6719 } 6720 return INT64_MIN; 6721 } 6722 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6723 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6724 float_raise(float_flag_inexact, status); 6725 } 6726 } 6727 else { 6728 if ( aExp < 0x3FFF ) { 6729 if ( aExp | aSig0 | aSig1 ) { 6730 float_raise(float_flag_inexact, status); 6731 } 6732 return 0; 6733 } 6734 z = aSig0>>( - shiftCount ); 6735 if ( aSig1 6736 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6737 float_raise(float_flag_inexact, status); 6738 } 6739 } 6740 if ( aSign ) z = - z; 6741 return z; 6742 6743 } 6744 6745 /*---------------------------------------------------------------------------- 6746 | Returns the result of converting the quadruple-precision floating-point value 6747 | `a' to the 64-bit unsigned integer format. 
The conversion is 6748 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6749 | Arithmetic---which means in particular that the conversion is rounded 6750 | according to the current rounding mode. If `a' is a NaN, the largest 6751 | positive integer is returned. If the conversion overflows, the 6752 | largest unsigned integer is returned. If 'a' is negative, the value is 6753 | rounded and zero is returned; negative values that do not round to zero 6754 | will raise the inexact exception. 6755 *----------------------------------------------------------------------------*/ 6756 6757 uint64_t float128_to_uint64(float128 a, float_status *status) 6758 { 6759 bool aSign; 6760 int aExp; 6761 int shiftCount; 6762 uint64_t aSig0, aSig1; 6763 6764 aSig0 = extractFloat128Frac0(a); 6765 aSig1 = extractFloat128Frac1(a); 6766 aExp = extractFloat128Exp(a); 6767 aSign = extractFloat128Sign(a); 6768 if (aSign && (aExp > 0x3FFE)) { 6769 float_raise(float_flag_invalid, status); 6770 if (float128_is_any_nan(a)) { 6771 return UINT64_MAX; 6772 } else { 6773 return 0; 6774 } 6775 } 6776 if (aExp) { 6777 aSig0 |= UINT64_C(0x0001000000000000); 6778 } 6779 shiftCount = 0x402F - aExp; 6780 if (shiftCount <= 0) { 6781 if (0x403E < aExp) { 6782 float_raise(float_flag_invalid, status); 6783 return UINT64_MAX; 6784 } 6785 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6786 } else { 6787 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6788 } 6789 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6790 } 6791 6792 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6793 { 6794 uint64_t v; 6795 signed char current_rounding_mode = status->float_rounding_mode; 6796 6797 set_float_rounding_mode(float_round_to_zero, status); 6798 v = float128_to_uint64(a, status); 6799 set_float_rounding_mode(current_rounding_mode, status); 6800 6801 return v; 6802 } 6803 6804 
/*---------------------------------------------------------------------------- 6805 | Returns the result of converting the quadruple-precision floating-point 6806 | value `a' to the 32-bit unsigned integer format. The conversion 6807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6808 | Arithmetic except that the conversion is always rounded toward zero. 6809 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6810 | if the conversion overflows, the largest unsigned integer is returned. 6811 | If 'a' is negative, the value is rounded and zero is returned; negative 6812 | values that do not round to zero will raise the inexact exception. 6813 *----------------------------------------------------------------------------*/ 6814 6815 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6816 { 6817 uint64_t v; 6818 uint32_t res; 6819 int old_exc_flags = get_float_exception_flags(status); 6820 6821 v = float128_to_uint64_round_to_zero(a, status); 6822 if (v > 0xffffffff) { 6823 res = 0xffffffff; 6824 } else { 6825 return v; 6826 } 6827 set_float_exception_flags(old_exc_flags, status); 6828 float_raise(float_flag_invalid, status); 6829 return res; 6830 } 6831 6832 /*---------------------------------------------------------------------------- 6833 | Returns the result of converting the quadruple-precision floating-point value 6834 | `a' to the 32-bit unsigned integer format. The conversion is 6835 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6836 | Arithmetic---which means in particular that the conversion is rounded 6837 | according to the current rounding mode. If `a' is a NaN, the largest 6838 | positive integer is returned. If the conversion overflows, the 6839 | largest unsigned integer is returned. If 'a' is negative, the value is 6840 | rounded and zero is returned; negative values that do not round to zero 6841 | will raise the inexact exception. 
6842 *----------------------------------------------------------------------------*/ 6843 6844 uint32_t float128_to_uint32(float128 a, float_status *status) 6845 { 6846 uint64_t v; 6847 uint32_t res; 6848 int old_exc_flags = get_float_exception_flags(status); 6849 6850 v = float128_to_uint64(a, status); 6851 if (v > 0xffffffff) { 6852 res = 0xffffffff; 6853 } else { 6854 return v; 6855 } 6856 set_float_exception_flags(old_exc_flags, status); 6857 float_raise(float_flag_invalid, status); 6858 return res; 6859 } 6860 6861 /*---------------------------------------------------------------------------- 6862 | Returns the result of converting the quadruple-precision floating-point 6863 | value `a' to the single-precision floating-point format. The conversion 6864 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6865 | Arithmetic. 6866 *----------------------------------------------------------------------------*/ 6867 6868 float32 float128_to_float32(float128 a, float_status *status) 6869 { 6870 bool aSign; 6871 int32_t aExp; 6872 uint64_t aSig0, aSig1; 6873 uint32_t zSig; 6874 6875 aSig1 = extractFloat128Frac1( a ); 6876 aSig0 = extractFloat128Frac0( a ); 6877 aExp = extractFloat128Exp( a ); 6878 aSign = extractFloat128Sign( a ); 6879 if ( aExp == 0x7FFF ) { 6880 if ( aSig0 | aSig1 ) { 6881 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6882 } 6883 return packFloat32( aSign, 0xFF, 0 ); 6884 } 6885 aSig0 |= ( aSig1 != 0 ); 6886 shift64RightJamming( aSig0, 18, &aSig0 ); 6887 zSig = aSig0; 6888 if ( aExp || zSig ) { 6889 zSig |= 0x40000000; 6890 aExp -= 0x3F81; 6891 } 6892 return roundAndPackFloat32(aSign, aExp, zSig, status); 6893 6894 } 6895 6896 /*---------------------------------------------------------------------------- 6897 | Returns the result of converting the quadruple-precision floating-point 6898 | value `a' to the double-precision floating-point format. 
The conversion 6899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6900 | Arithmetic. 6901 *----------------------------------------------------------------------------*/ 6902 6903 float64 float128_to_float64(float128 a, float_status *status) 6904 { 6905 bool aSign; 6906 int32_t aExp; 6907 uint64_t aSig0, aSig1; 6908 6909 aSig1 = extractFloat128Frac1( a ); 6910 aSig0 = extractFloat128Frac0( a ); 6911 aExp = extractFloat128Exp( a ); 6912 aSign = extractFloat128Sign( a ); 6913 if ( aExp == 0x7FFF ) { 6914 if ( aSig0 | aSig1 ) { 6915 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6916 } 6917 return packFloat64( aSign, 0x7FF, 0 ); 6918 } 6919 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6920 aSig0 |= ( aSig1 != 0 ); 6921 if ( aExp || aSig0 ) { 6922 aSig0 |= UINT64_C(0x4000000000000000); 6923 aExp -= 0x3C01; 6924 } 6925 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6926 6927 } 6928 6929 /*---------------------------------------------------------------------------- 6930 | Returns the result of converting the quadruple-precision floating-point 6931 | value `a' to the extended double-precision floating-point format. The 6932 | conversion is performed according to the IEC/IEEE Standard for Binary 6933 | Floating-Point Arithmetic. 
6934 *----------------------------------------------------------------------------*/ 6935 6936 floatx80 float128_to_floatx80(float128 a, float_status *status) 6937 { 6938 bool aSign; 6939 int32_t aExp; 6940 uint64_t aSig0, aSig1; 6941 6942 aSig1 = extractFloat128Frac1( a ); 6943 aSig0 = extractFloat128Frac0( a ); 6944 aExp = extractFloat128Exp( a ); 6945 aSign = extractFloat128Sign( a ); 6946 if ( aExp == 0x7FFF ) { 6947 if ( aSig0 | aSig1 ) { 6948 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6949 status); 6950 return floatx80_silence_nan(res, status); 6951 } 6952 return packFloatx80(aSign, floatx80_infinity_high, 6953 floatx80_infinity_low); 6954 } 6955 if ( aExp == 0 ) { 6956 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6957 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6958 } 6959 else { 6960 aSig0 |= UINT64_C(0x0001000000000000); 6961 } 6962 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6963 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6964 6965 } 6966 6967 /*---------------------------------------------------------------------------- 6968 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6969 | returns the result as a quadruple-precision floating-point value. The 6970 | operation is performed according to the IEC/IEEE Standard for Binary 6971 | Floating-Point Arithmetic. 
6972 *----------------------------------------------------------------------------*/ 6973 6974 float128 float128_round_to_int(float128 a, float_status *status) 6975 { 6976 bool aSign; 6977 int32_t aExp; 6978 uint64_t lastBitMask, roundBitsMask; 6979 float128 z; 6980 6981 aExp = extractFloat128Exp( a ); 6982 if ( 0x402F <= aExp ) { 6983 if ( 0x406F <= aExp ) { 6984 if ( ( aExp == 0x7FFF ) 6985 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6986 ) { 6987 return propagateFloat128NaN(a, a, status); 6988 } 6989 return a; 6990 } 6991 lastBitMask = 1; 6992 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6993 roundBitsMask = lastBitMask - 1; 6994 z = a; 6995 switch (status->float_rounding_mode) { 6996 case float_round_nearest_even: 6997 if ( lastBitMask ) { 6998 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6999 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 7000 } 7001 else { 7002 if ( (int64_t) z.low < 0 ) { 7003 ++z.high; 7004 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 7005 } 7006 } 7007 break; 7008 case float_round_ties_away: 7009 if (lastBitMask) { 7010 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 7011 } else { 7012 if ((int64_t) z.low < 0) { 7013 ++z.high; 7014 } 7015 } 7016 break; 7017 case float_round_to_zero: 7018 break; 7019 case float_round_up: 7020 if (!extractFloat128Sign(z)) { 7021 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7022 } 7023 break; 7024 case float_round_down: 7025 if (extractFloat128Sign(z)) { 7026 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 7027 } 7028 break; 7029 case float_round_to_odd: 7030 /* 7031 * Note that if lastBitMask == 0, the last bit is the lsb 7032 * of high, and roundBitsMask == -1. 7033 */ 7034 if ((lastBitMask ? 
z.low & lastBitMask : z.high & 1) == 0) {
                /* Kept bit is even: add roundBitsMask (one below the ulp),
                 * carrying into the high word, to force it odd. */
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the fraction bits below the rounding point. */
        z.low &= ~ roundBitsMask;
    }
    else {
        /* Rounding point lies in (or above) the high 64 significand bits. */
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is +/-0 or +/-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* 0.5 < |a| < 1 rounds to 1; exactly 0.5 ties to even (0). */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* 0.5 <= |a| < 1 rounds away from zero, to +/-1. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                /* Any nonzero |a| < 1 rounds to the odd integer +/-1. */
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Rounding point within the high word: z.low is cleared and only
         * z.high is rounded; a.low contributes as a sticky bit. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exactly halfway (round bits and the whole low word are zero):
             * clear the kept bit to round to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Raise inexact iff rounding changed the value. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf is an invalid operation. */
            goto invalid;
        }
        /* Inf / finite = Inf. */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf = 0. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0 / 0 is invalid; any other finite / 0 raises div-by-zero. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Make the hidden bit explicit and left-align both significands; then
     * ensure aSig < bSig so the quotient fits below the integer bit. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High 64 quotient bits; correct the (possible) overestimate from
     * estimateDiv128To64 until the partial remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Next 64 quotient bits; refine only when the estimate is close enough
     * to a rounding boundary for its error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold any leftover remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; rem(Inf, b) is an invalid operation. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(finite a, Inf) = a, exactly. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(a, 0) is an invalid operation. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| much smaller than |b|: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Close the exponent gap 61 bits per iteration.  The quotient digit is
     * deliberately under-estimated (q - 4) so the partial remainder never
     * goes negative inside the loop. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: scale the last quotient chunk to the
         * remaining exponent gap and subtract q*b once more. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, remembering the last
     * non-negative candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = (negative + last non-negative) candidate sum; its sign
     * selects whichever candidate is nearer zero, ties going to even q —
     * i.e. the IEEE round-to-nearest remainder. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit seed from estimateSqrt32, refined to a 64-bit root estimate;
     * the odd/even exponent adjusts the significand alignment. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    /* Correct any overestimate until the remainder a - z*z is
     * non-negative. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits; refine only when close to a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Nonzero remainder becomes the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/* Shared comparison for extended double-precision values; `is_quiet'
 * selects whether quiet NaNs suppress the invalid exception. */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Any NaN operand: unordered; invalid unless quiet compare with only
     * quiet NaNs. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case: +0 == -0 */
            return float_relation_equal;
        } else {
            /* Opposite signs: the non-negative operand is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude comparison, direction flipped for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

/* Shared comparison for quadruple-precision values; `is_quiet' selects
 * whether quiet NaNs suppress the invalid exception. */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Any NaN operand: unordered; invalid unless quiet compare with only
     * quiet NaNs. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case: +0 == -0 */
            return float_relation_equal;
        } else {
            /* Opposite signs: the non-negative operand is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude comparison, direction flipped for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

FloatRelation
float128_compare_quiet(float128 a, float128 b,
                       float_status *status)
{
    return float128_compare_internal(a, b, 1, status);
}

/* Returns `a' scaled by 2**n (extended double precision), rounded per
 * the current floatx80 rounding precision and mode. */
floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* NaN propagates; Inf is unchanged by scaling. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        /* Subnormal: bump the encoded exponent to its effective value. */
        aExp++;
    }

    /* Clamp n so aExp + n stays far inside int32_t range; values beyond
     * +/-0x10000 overflow/underflow to the same result anyway. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

/* Returns `a' scaled by 2**n (quadruple precision), rounded per the
 * current rounding mode. */
float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; Inf is unchanged by scaling. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        /* Make the hidden integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;
    } else {
        /* Subnormal: bump the encoded exponent to its effective value. */
        aExp++;
    }

    /* Clamp n so aExp + n stays far inside int32_t range. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* NOTE(review): the extra -1 presumably compensates for the explicit
     * integer bit set above, matching what normalizeRoundAndPackFloat128
     * expects — confirm against that helper's definition. */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}

/* Runs before main() (GCC constructor attribute): probes the host's
 * fma() once and, if it returns a wrong result, disables the hardfloat
 * fast path by setting force_soft_fma. */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}