/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

/*
===============================================================================
This C source file is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2a.

Written by John R. Hauser.  This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704.  Funding was partially provided by the
National Science Foundation under grant MIP-9311980.  The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================
*/

/* BSD licensing:
 * Copyright (c) 2006, Fabrice Bellard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

/* softfloat (and in particular the code in softfloat-specialize.h) is
 * target-dependent and needs the TARGET_* macros.
 */
#include "qemu/osdep.h"
#include <math.h>
#include "qemu/bitops.h"
#include "fpu/softfloat.h"

/* We only need stdlib for abort() */

/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations. (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
#include "fpu/softfloat-macros.h"

/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating the guest FP exception flags at
 * reasonable speed is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */

#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
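
/*
 * Illustrative sketch (not part of the original file): with
 * flush_inputs_to_zero set, the generated helpers squash denormal inputs
 * to a correctly-signed zero and record the input_denormal flag, e.g.:
 *
 *     float32 a = make_float32(0x80000001);    // negative denormal
 *     float32 b = make_float32(0x3f800000);    // 1.0f
 *     float_status st = { .flush_inputs_to_zero = true };
 *
 *     float32_input_flush2(&a, &b, &st);
 *     // now a == -0.0f, b is untouched, and st.float_exception_flags
 *     // has float_flag_input_denormal set.
 */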
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions. Each combination of number of inputs and
 * float size gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}
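
/*
 * Illustrative precondition check (not in the original): hardfloat can only
 * engage once the guest has already accumulated the inexact flag and is
 * rounding to nearest-even, e.g.:
 *
 *     float_status st = {
 *         .float_rounding_mode = float_round_nearest_even,
 *         .float_exception_flags = float_flag_inexact,
 *     };
 *     assert(can_use_fpu(&st));    // hard path allowed
 *
 *     st.float_exception_flags = 0;
 *     assert(!can_use_fpu(&st));   // must fall back to soft-fp
 */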
/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
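
/*
 * A minimal sketch (hypothetical helper names, not part of the original
 * file) of how these generators are wired up by the real callers further
 * down in this file:
 *
 *     static float hard_f32_example(float a, float b) { return a + b; }
 *
 *     float32 float32_example(float32 a, float32 b, float_status *s)
 *     {
 *         return float32_gen2(a, b, s, hard_f32_example, soft_f32_add,
 *                             f32_is_zon2, f32_addsubmul_post);
 *     }
 *
 * The `pre` hook rejects operands the host FPU cannot handle cheaply
 * (denormal/inf/NaN); the `post` hook runs only when the hard result is
 * tiny (magnitude <= FLT_MIN/DBL_MIN) and decides whether soft-fp must
 * redo the operation to raise underflow correctly.
 */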
/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}
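
/*
 * Illustrative use of the class masks (mirrors how the parts code consumes
 * them): testing whether either operand of a two-input operation is any
 * kind of NaN with a single branch:
 *
 *     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
 *     if (unlikely(ab_mask & float_cmask_anynan)) {
 *         ... NaN propagation ...
 *     }
 */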
/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = (-F - 1) & 63,                                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};
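
/*
 * Worked expansion (for reference, not in the original): FLOAT_PARAMS(8, 23)
 * for float32 yields
 *
 *     exp_bias       = 127
 *     exp_max        = 255
 *     frac_shift     = (-23 - 1) & 63 = 40
 *     frac_lsb       = 1ull << 40
 *     frac_lsbm1     = 1ull << 39
 *     round_mask     = 0x000000ffffffffffull
 *     roundeven_mask = 0x000001ffffffffffull
 *
 * i.e. the 23 fraction bits are shifted up to bits 62..40, directly below
 * DECOMPOSED_IMPLICIT_BIT (bit 63), and everything below bit 40 is
 * rounding residue.
 */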
/* Unpack a float to parts, but do not canonicalize. */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize. */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}
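
/*
 * Worked example (illustrative): unpacking float32 1.0f, raw 0x3f800000:
 *
 *     FloatParts64 p;
 *     float32_unpack_raw(&p, make_float32(0x3f800000));
 *     // p.sign == 0
 *     // p.exp  == 0x7f   (still biased; canonicalization unbiases it)
 *     // p.frac == 0      (implicit bit not yet made explicit)
 *
 * float32_pack_raw() is the exact inverse of this transformation.
 */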
/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)
static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)

static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_div(A, B, S) \
    PARTS_GENERIC_64_128(div, A)(A, B, S)

static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,
                                        int scale, int frac_size);
static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,
                                         int scale, int frac_size);

#define parts_round_to_int_normal(A, R, C, F) \
    PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)

static void parts64_round_to_int(FloatParts64 *a, FloatRoundMode rm,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt);
static void parts128_round_to_int(FloatParts128 *a, FloatRoundMode r,
                                  int scale, float_status *s,
                                  const FloatFmt *fmt);

#define parts_round_to_int(A, R, C, S, F) \
    PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)

static int64_t parts64_float_to_sint(FloatParts64 *p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s);
static int64_t parts128_float_to_sint(FloatParts128 *p, FloatRoundMode rmode,
                                      int scale, int64_t min, int64_t max,
                                      float_status *s);

#define parts_float_to_sint(P, R, Z, MN, MX, S) \
    PARTS_GENERIC_64_128(float_to_sint, P)(P, R, Z, MN, MX, S)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)
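
/*
 * Note on dispatch (illustrative): QEMU_GENERIC selects the implementation
 * from the pointer type of the first argument, so the parts_* and frac_*
 * wrappers let softfloat-parts.c.inc be written once per algorithm:
 *
 *     FloatParts64 r64, a64, b64;
 *     FloatParts128 r128, a128, b128;
 *
 *     frac_add(&r64, &a64, &b64);     // expands to frac64_add(...)
 *     frac_add(&r128, &a128, &b128);  // expands to frac128_add(...)
 */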
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
{
    uint64_t n1, n0, r, q;
    bool ret;

    /*
     * We want a 2*N / N-bit division to produce exactly an N-bit
     * result, so that we do not lose any precision and so that we
     * do not have to renormalize afterward.  If A.frac < B.frac,
     * then division would produce an (N-1)-bit result; shift A left
     * by one to produce an N-bit result, and return true to
     * decrement the exponent to match.
     *
     * The udiv_qrnnd algorithm that we're using requires normalization,
     * i.e. the msb of the denominator must be set, which is already true.
     */
    ret = a->frac < b->frac;
    if (ret) {
        n0 = a->frac;
        n1 = 0;
    } else {
        n0 = a->frac >> 1;
        n1 = a->frac << 63;
    }
    q = udiv_qrnnd(&r, n0, n1, b->frac);

    /* Set lsb if there is a remainder, to set inexact. */
    a->frac = q | (r != 0);

    return ret;
}
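
/*
 * Worked example (illustrative): dividing 1.0 by 1.5 at the fraction level,
 * with a->frac = 0x8000000000000000 and b->frac = 0xc000000000000000:
 * A.frac < B.frac, so A becomes the high numerator word directly (one extra
 * left shift) and the function returns true to drop the exponent by one.
 * udiv_qrnnd yields q = 0xaaaaaaaaaaaaaaaa with a non-zero remainder, so the
 * sticky lsb is set and a->frac becomes 0xaaaaaaaaaaaaaaab: the normalized
 * fraction of 2/3, with the msb still set as required.
 */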
static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t q0, q1, a0, a1, b0, b1;
    uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
    bool ret = false;

    a0 = a->frac_hi, a1 = a->frac_lo;
    b0 = b->frac_hi, b1 = b->frac_lo;

    ret = lt128(a0, a1, b0, b1);
    if (!ret) {
        a1 = shr_double(a0, a1, 1);
        a0 = a0 >> 1;
    }

    /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
    q0 = estimateDiv128To64(a0, a1, b0);

    /*
     * Estimate is high because B1 was not included (unless B1 == 0).
     * Reduce quotient and increase remainder until remainder is non-negative.
     * This loop will execute 0 to 2 times.
     */
    mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
    sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
    while (r0 != 0) {
        q0--;
        add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
    }

    /* Repeat using the remainder, producing a second word of quotient. */
    q1 = estimateDiv128To64(r1, r2, b0);
    mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
    sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
    while (r1 != 0) {
        q1--;
        add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
    }

    /* Any remainder indicates inexact; set sticky bit. */
    q1 |= (r2 | r3) != 0;

    a->frac_hi = q0;
    a->frac_lo = q1;
    return ret;
}

#define frac_div(A, B)  FRAC_GENERIC_64_128(div, A)(A, B)

static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128_256(neg, A)(A)

static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128_256(normalize, A)(A)

static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}
#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128_256(shrjam, A)(A, C)
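
/*
 * Worked example (illustrative): frac64_shrjam(a, 4) with
 * a->frac == 0x8000000000000001 produces 0x0800000000000001: the four bits
 * shifted out are non-zero, so they are ORed into the sticky lsb and
 * rounding can still see that the discarded part was non-zero.
 */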
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)

static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)

#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef N
#undef W
#undef partsN
#undef FloatPartsN
#undef FloatPartsW

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}
/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}
float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}
bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}
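
/*
 * Illustrative flag composition (not in the original): the muladd flags
 * compose the usual fused variants from one primitive, e.g. a fused
 * multiply-subtract a*b - c is
 *
 *     float32_muladd(a, b, c, float_muladd_negate_c, s);
 *
 * and -(a*b) + c is
 *
 *     float32_muladd(a, b, c, float_muladd_negate_product, s);
 */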
bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Division
 */

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}
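/*
 * Descriptive note: the *_div_pre predicates gate the hardfloat fast
 * path -- the dividend must be zero or normal and the divisor strictly
 * normal, so NaN/inf inputs, division by zero and denormals all take
 * the soft-fp route.  The *_div_post predicates are consulted when the
 * hard result looks like a possible underflow; a zero dividend is the
 * one case where a zero quotient is known to be exact, so only a
 * non-zero dividend forces recomputation in soft-fp.
 */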
float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_div(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * Usually this only needs to take care of raising invalid exceptions
 * and handling the conversion on NaNs.
 */

/*
 * The Arm alternative half-precision format (AHP) reuses the all-ones
 * exponent for normal values and so has no Inf or NaN encodings.
 */
static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
{
    switch (a->cls) {
    case float_class_qnan:
    case float_class_snan:
        /*
         * There is no NaN in the destination format. Raise Invalid
         * and return a zero with the sign of the input NaN.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_zero;
        break;

    case float_class_inf:
        /*
         * There is no Inf in the destination format. Raise Invalid
         * and return the maximum normal with the correct sign.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_normal;
        a->exp = float16_params_ahp.exp_max;
        a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift,
                                  float16_params_ahp.frac_size + 1);
        break;

    case float_class_normal:
    case float_class_zero:
        break;

    default:
        g_assert_not_reached();
    }
}

static void parts64_float_to_float(FloatParts64 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

static void parts128_float_to_float(FloatParts128 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

#define parts_float_to_float(P, S) \
    PARTS_GENERIC_64_128(float_to_float, P)(P, S)

static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
                                        float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;

    if (a->cls == float_class_normal) {
        frac_truncjam(a, b);
    } else if (is_nan(a->cls)) {
        /* Discard the low bits of the NaN. */
        a->frac = b->frac_hi;
        parts_return_nan(a, s);
    }
}

static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
                                       float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;
    frac_widen(a, b);

    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    FloatParts64 p;
    const FloatFmt *fmt;

    float32_unpack_canonical(&p, a, s);
    if (ieee) {
        parts_float_to_float(&p, s);
        fmt = &float16_params;
    } else {
        parts_float_to_ahp(&p, s);
        fmt = &float16_params_ahp;
    }
    return float16a_round_pack_canonical(&p, s, fmt);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results. */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        return soft_float32_to_float64(a, s);
    }
}
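/*
 * Illustrative example: every float32 value is exactly representable as
 * a float64, so for a normal input the fast path above is a plain host
 * conversion; e.g. 1.5f (0x3fc00000) widens to 0x3ff8000000000000 with
 * no flags raised.
 */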
float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    FloatParts64 p;
    const FloatFmt *fmt;

    float64_unpack_canonical(&p, a, s);
    if (ieee) {
        parts_float_to_float(&p, s);
        fmt = &float16_params;
    } else {
        parts_float_to_ahp(&p, s);
        fmt = &float16_params_ahp;
    }
    return float16a_round_pack_canonical(&p, s, fmt);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return bfloat16_round_pack_canonical(&p, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return bfloat16_round_pack_canonical(&p, s);
}

float32 float128_to_float32(float128 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float128_unpack_canonical(&p128, a, s);
    parts_float_to_float_narrow(&p64, &p128, s);
    return float32_round_pack_canonical(&p64, s);
}

float64 float128_to_float64(float128 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float128_unpack_canonical(&p128, a, s);
    parts_float_to_float_narrow(&p64, &p128, s);
    return float64_round_pack_canonical(&p64, s);
}

float128 float32_to_float128(float32 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float32_unpack_canonical(&p64, a, s);
    parts_float_to_float_widen(&p128, &p64, s);
    return float128_round_pack_canonical(&p128, s);
}

float128 float64_to_float128(float64 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float64_unpack_canonical(&p64, a, s);
    parts_float_to_float_widen(&p128, &p64, s);
    return float128_round_pack_canonical(&p128, s);
}

/*
 * Round to integral value
 */

float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float16_params);
    return float16_round_pack_canonical(&p, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float32_params);
    return float32_round_pack_canonical(&p, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float64_params);
    return float64_round_pack_canonical(&p, s);
}

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &bfloat16_params);
    return bfloat16_round_pack_canonical(&p, s);
}

float128 float128_round_to_int(float128 a, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float128_params);
    return float128_round_pack_canonical(&p, s);
}
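/*
 * Illustrative example: float64_round_to_int() honours the dynamic
 * rounding mode in *s, so 2.5 rounds to 2.0 under
 * float_round_nearest_even but to 3.0 under float_round_up; a value
 * that is changed by rounding raises float_flag_inexact, which callers
 * emulating a non-exact flavour of the operation typically suppress.
 */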
/*
 * Floating-point to signed integer conversions
 */

int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

static int32_t float128_to_int32_scalbn(float128 a, FloatRoundMode rmode,
                                        int scale, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

static int64_t float128_to_int64_scalbn(float128 a, FloatRoundMode rmode,
                                        int scale, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}
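/*
 * Illustrative example: the scalbn variants convert a * 2^scale, so
 * converting 1.5 (make_float32(0x3fc00000)) with scale = 8 and
 * float_round_to_zero yields 1.5 * 256 = 384 exactly, with no flags
 * raised.
 */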
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float128_to_int32(float128 a, float_status *s)
{
    return float128_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float128_to_int64(float128 a, float_status *s)
{
    return float128_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float128_to_int32_round_to_zero(float128 a, float_status *s)
{
    return float128_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float128_to_int64_round_to_zero(float128 a, float_status *s)
{
    return float128_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}
/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * `a' is negative, the result is rounded and zero is returned;
 * negative values that do not round to zero will raise the inexact
 * exception flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 p, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        flags = float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        flags = float_flag_invalid;
        r = p.sign ? 0 : max;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
        /* TODO: 62 = N - 2, frac_size for rounding */
        if (parts_round_to_int_normal(&p, rmode, scale, 62)) {
            flags = float_flag_inexact;
            if (p.cls == float_class_zero) {
                r = 0;
                break;
            }
        }

        if (p.sign) {
            flags = float_flag_invalid;
            r = 0;
        } else if (p.exp > DECOMPOSED_BINARY_POINT) {
            flags = float_flag_invalid;
            r = max;
        } else {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
            if (r > max) {
                flags = float_flag_invalid;
                r = max;
            }
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}
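/*
 * Illustrative example: a negative input that rounds to zero is not an
 * error, so converting -0.25 to uint32 with default rounding returns 0
 * and raises only float_flag_inexact, whereas converting -1.0 returns
 * 0 and raises float_flag_invalid.
 */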
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}
uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}
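/*
 * Illustrative example: for a = 6 the code above computes f = 6,
 * shift = clz64(6) = 61, exp = DECOMPOSED_BINARY_POINT - 61 + scale and
 * frac = 6 << 61 = 0xc000000000000000, i.e. the canonical fraction for
 * 1.5 * 2^2 = 6 when scale = 0.
 */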
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}
float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
 */
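/*
 * Illustrative example: with one quiet-NaN operand, float32_min(qnan,
 * one, s) propagates a NaN (pick-NaN semantics), while
 * float32_minnum(qnan, one, s) returns 1.0 as IEEE 754-2008 minNum()
 * requires. A signalling-NaN operand raises float_flag_invalid in
 * either case.
 */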
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            return a.sign ^ ismin ? b : a;
        }
    }
}

#define MINMAX(sz, name, ismin, isieee, ismag)                          \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

#define BF16_MINMAX(name, ismin, isieee, ismag)                         \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX
/* Floating point compare */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b,
                                    bool is_quiet, float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}

#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}
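/*
 * Descriptive note: f32_compare() above uses the ISO C quiet comparison
 * macros (isgreaterequal, isgreater, isless), which never raise
 * floating-point exceptions, so only the unordered case must fall back
 * to soft-fp to get the quiet-vs-signalling flag behaviour right;
 * f64_compare() below mirrors this.
 */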
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}
/* Multiply A by 2 raised to the power N. */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s,
                               const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
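/*
 * Illustrative invariant for the loop above: after each iteration
 * s_frac tracks 2 * r_frac (suitably scaled) and a_frac holds the
 * shifted remainder, so accepting a bit q requires
 * t_frac = 2*r + q <= remainder, which is exactly the restoring binary
 * square-root step ((r + q)^2 = r^2 + 2*r*q + q^2). Any non-zero
 * remainder at the end becomes the sticky bit.
 */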
float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}
/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
*----------------------------------------------------------------------------*/

float16 float16_silence_nan(float16 a, float_status *status)
{
    FloatParts64 p;

    float16_unpack_raw(&p, a);
    p.frac <<= float16_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_silence_nan(float32 a, float_status *status)
{
    FloatParts64 p;

    float32_unpack_raw(&p, a);
    p.frac <<= float32_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_silence_nan(float64 a, float_status *status)
{
    FloatParts64 p;

    float64_unpack_raw(&p, a);
    p.frac <<= float64_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
{
    FloatParts64 p;

    bfloat16_unpack_raw(&p, a);
    p.frac <<= bfloat16_params.frac_shift;
    parts_silence_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

float128 float128_silence_nan(float128 a, float_status *status)
{
    FloatParts128 p;

    float128_unpack_raw(&p, a);
    frac_shl(&p, float128_params.frac_shift);
    parts_silence_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| If `a' is denormal and we are in flush-to-zero mode then set the
| input-denormal exception and return zero. Otherwise just return the value.
*----------------------------------------------------------------------------*/
3941 *----------------------------------------------------------------------------*/ 3942 3943 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3944 { 3945 if (p.exp == 0 && p.frac != 0) { 3946 float_raise(float_flag_input_denormal, status); 3947 return true; 3948 } 3949 3950 return false; 3951 } 3952 3953 float16 float16_squash_input_denormal(float16 a, float_status *status) 3954 { 3955 if (status->flush_inputs_to_zero) { 3956 FloatParts64 p; 3957 3958 float16_unpack_raw(&p, a); 3959 if (parts_squash_denormal(p, status)) { 3960 return float16_set_sign(float16_zero, p.sign); 3961 } 3962 } 3963 return a; 3964 } 3965 3966 float32 float32_squash_input_denormal(float32 a, float_status *status) 3967 { 3968 if (status->flush_inputs_to_zero) { 3969 FloatParts64 p; 3970 3971 float32_unpack_raw(&p, a); 3972 if (parts_squash_denormal(p, status)) { 3973 return float32_set_sign(float32_zero, p.sign); 3974 } 3975 } 3976 return a; 3977 } 3978 3979 float64 float64_squash_input_denormal(float64 a, float_status *status) 3980 { 3981 if (status->flush_inputs_to_zero) { 3982 FloatParts64 p; 3983 3984 float64_unpack_raw(&p, a); 3985 if (parts_squash_denormal(p, status)) { 3986 return float64_set_sign(float64_zero, p.sign); 3987 } 3988 } 3989 return a; 3990 } 3991 3992 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3993 { 3994 if (status->flush_inputs_to_zero) { 3995 FloatParts64 p; 3996 3997 bfloat16_unpack_raw(&p, a); 3998 if (parts_squash_denormal(p, status)) { 3999 return bfloat16_set_sign(bfloat16_zero, p.sign); 4000 } 4001 } 4002 return a; 4003 } 4004 4005 /*---------------------------------------------------------------------------- 4006 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 4007 | and 7, and returns the properly rounded 32-bit integer corresponding to the 4008 | input. If `zSign' is 1, the input is negated before being converted to an 4009 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 4010 | is simply rounded to an integer, with the inexact exception raised if the 4011 | input cannot be represented exactly as an integer. However, if the fixed- 4012 | point input is too large, the invalid exception is raised and the largest 4013 | positive or negative integer is returned. 4014 *----------------------------------------------------------------------------*/ 4015 4016 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 4017 float_status *status) 4018 { 4019 int8_t roundingMode; 4020 bool roundNearestEven; 4021 int8_t roundIncrement, roundBits; 4022 int32_t z; 4023 4024 roundingMode = status->float_rounding_mode; 4025 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4026 switch (roundingMode) { 4027 case float_round_nearest_even: 4028 case float_round_ties_away: 4029 roundIncrement = 0x40; 4030 break; 4031 case float_round_to_zero: 4032 roundIncrement = 0; 4033 break; 4034 case float_round_up: 4035 roundIncrement = zSign ? 0 : 0x7f; 4036 break; 4037 case float_round_down: 4038 roundIncrement = zSign ? 0x7f : 0; 4039 break; 4040 case float_round_to_odd: 4041 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 4042 break; 4043 default: 4044 abort(); 4045 } 4046 roundBits = absZ & 0x7F; 4047 absZ = ( absZ + roundIncrement )>>7; 4048 if (!(roundBits ^ 0x40) && roundNearestEven) { 4049 absZ &= ~1; 4050 } 4051 z = absZ; 4052 if ( zSign ) z = - z; 4053 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 4054 float_raise(float_flag_invalid, status); 4055 return zSign ? 
INT32_MIN : INT32_MAX; 4056 } 4057 if (roundBits) { 4058 float_raise(float_flag_inexact, status); 4059 } 4060 return z; 4061 4062 } 4063 4064 /*---------------------------------------------------------------------------- 4065 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4066 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4067 | and returns the properly rounded 64-bit integer corresponding to the input. 4068 | If `zSign' is 1, the input is negated before being converted to an integer. 4069 | Ordinarily, the fixed-point input is simply rounded to an integer, with 4070 | the inexact exception raised if the input cannot be represented exactly as 4071 | an integer. However, if the fixed-point input is too large, the invalid 4072 | exception is raised and the largest positive or negative integer is 4073 | returned. 4074 *----------------------------------------------------------------------------*/ 4075 4076 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 4077 float_status *status) 4078 { 4079 int8_t roundingMode; 4080 bool roundNearestEven, increment; 4081 int64_t z; 4082 4083 roundingMode = status->float_rounding_mode; 4084 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4085 switch (roundingMode) { 4086 case float_round_nearest_even: 4087 case float_round_ties_away: 4088 increment = ((int64_t) absZ1 < 0); 4089 break; 4090 case float_round_to_zero: 4091 increment = 0; 4092 break; 4093 case float_round_up: 4094 increment = !zSign && absZ1; 4095 break; 4096 case float_round_down: 4097 increment = zSign && absZ1; 4098 break; 4099 case float_round_to_odd: 4100 increment = !(absZ0 & 1) && absZ1; 4101 break; 4102 default: 4103 abort(); 4104 } 4105 if ( increment ) { 4106 ++absZ0; 4107 if ( absZ0 == 0 ) goto overflow; 4108 if (!(absZ1 << 1) && roundNearestEven) { 4109 absZ0 &= ~1; 4110 } 4111 } 4112 z = absZ0; 4113 if ( zSign ) z = - z; 4114 if ( z && ( ( z < 0 ) ^ zSign ) ) { 4115 overflow: 4116 float_raise(float_flag_invalid, status); 4117 return zSign ? INT64_MIN : INT64_MAX; 4118 } 4119 if (absZ1) { 4120 float_raise(float_flag_inexact, status); 4121 } 4122 return z; 4123 4124 } 4125 4126 /*---------------------------------------------------------------------------- 4127 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4128 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4129 | and returns the properly rounded 64-bit unsigned integer corresponding to the 4130 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 4131 | with the inexact exception raised if the input cannot be represented exactly 4132 | as an integer. However, if the fixed-point input is too large, the invalid 4133 | exception is raised and the largest unsigned integer is returned. 
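|
| A negative input whose rounded magnitude is nonzero also raises the invalid
| exception and returns zero; a negative input that rounds to zero returns
| zero with only the inexact flag set.  Note that although the result is
| conceptually unsigned, it is passed back through an int64_t and
| reinterpreted by the callers.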
4134 *----------------------------------------------------------------------------*/ 4135 4136 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 4137 uint64_t absZ1, float_status *status) 4138 { 4139 int8_t roundingMode; 4140 bool roundNearestEven, increment; 4141 4142 roundingMode = status->float_rounding_mode; 4143 roundNearestEven = (roundingMode == float_round_nearest_even); 4144 switch (roundingMode) { 4145 case float_round_nearest_even: 4146 case float_round_ties_away: 4147 increment = ((int64_t)absZ1 < 0); 4148 break; 4149 case float_round_to_zero: 4150 increment = 0; 4151 break; 4152 case float_round_up: 4153 increment = !zSign && absZ1; 4154 break; 4155 case float_round_down: 4156 increment = zSign && absZ1; 4157 break; 4158 case float_round_to_odd: 4159 increment = !(absZ0 & 1) && absZ1; 4160 break; 4161 default: 4162 abort(); 4163 } 4164 if (increment) { 4165 ++absZ0; 4166 if (absZ0 == 0) { 4167 float_raise(float_flag_invalid, status); 4168 return UINT64_MAX; 4169 } 4170 if (!(absZ1 << 1) && roundNearestEven) { 4171 absZ0 &= ~1; 4172 } 4173 } 4174 4175 if (zSign && absZ0) { 4176 float_raise(float_flag_invalid, status); 4177 return 0; 4178 } 4179 4180 if (absZ1) { 4181 float_raise(float_flag_inexact, status); 4182 } 4183 return absZ0; 4184 } 4185 4186 /*---------------------------------------------------------------------------- 4187 | Normalizes the subnormal single-precision floating-point value represented 4188 | by the denormalized significand `aSig'. The normalized exponent and 4189 | significand are stored at the locations pointed to by `zExpPtr' and 4190 | `zSigPtr', respectively. 4191 *----------------------------------------------------------------------------*/ 4192 4193 static void 4194 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4195 { 4196 int8_t shiftCount; 4197 4198 shiftCount = clz32(aSig) - 8; 4199 *zSigPtr = aSig<<shiftCount; 4200 *zExpPtr = 1 - shiftCount; 4201 4202 } 4203 4204 /*---------------------------------------------------------------------------- 4205 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4206 | and significand `zSig', and returns the proper single-precision floating- 4207 | point value corresponding to the abstract input. Ordinarily, the abstract 4208 | value is simply rounded and packed into the single-precision format, with 4209 | the inexact exception raised if the abstract input cannot be represented 4210 | exactly. However, if the abstract value is too large, the overflow and 4211 | inexact exceptions are raised and an infinity or maximal finite value is 4212 | returned. If the abstract value is too small, the input value is rounded to 4213 | a subnormal number, and the underflow and inexact exceptions are raised if 4214 | the abstract input cannot be represented exactly as a subnormal single- 4215 | precision floating-point number. 4216 | The input significand `zSig' has its binary point between bits 30 4217 | and 29, which is 7 bits to the left of the usual location. This shifted 4218 | significand must be normalized or smaller. If `zSig' is not normalized, 4219 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4220 | and it must not require rounding. In the usual case that `zSig' is 4221 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4222 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4223 | Binary Floating-Point Arithmetic. 
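|
| For example, in round-to-nearest-even mode the input zSig = 0x40000040 lies
| exactly halfway between two representable results (roundBits == 0x40):
| adding the increment and shifting gives ( 0x40000040 + 0x40 )>>7 =
| 0x800001, and the tie is then broken by clearing the least significant bit,
| so the even value 0x800000 is packed.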
4224 *----------------------------------------------------------------------------*/ 4225 4226 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4227 float_status *status) 4228 { 4229 int8_t roundingMode; 4230 bool roundNearestEven; 4231 int8_t roundIncrement, roundBits; 4232 bool isTiny; 4233 4234 roundingMode = status->float_rounding_mode; 4235 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4236 switch (roundingMode) { 4237 case float_round_nearest_even: 4238 case float_round_ties_away: 4239 roundIncrement = 0x40; 4240 break; 4241 case float_round_to_zero: 4242 roundIncrement = 0; 4243 break; 4244 case float_round_up: 4245 roundIncrement = zSign ? 0 : 0x7f; 4246 break; 4247 case float_round_down: 4248 roundIncrement = zSign ? 0x7f : 0; 4249 break; 4250 case float_round_to_odd: 4251 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 4252 break; 4253 default: 4254 abort(); 4255 break; 4256 } 4257 roundBits = zSig & 0x7F; 4258 if ( 0xFD <= (uint16_t) zExp ) { 4259 if ( ( 0xFD < zExp ) 4260 || ( ( zExp == 0xFD ) 4261 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 4262 ) { 4263 bool overflow_to_inf = roundingMode != float_round_to_odd && 4264 roundIncrement != 0; 4265 float_raise(float_flag_overflow | float_flag_inexact, status); 4266 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 4267 } 4268 if ( zExp < 0 ) { 4269 if (status->flush_to_zero) { 4270 float_raise(float_flag_output_denormal, status); 4271 return packFloat32(zSign, 0, 0); 4272 } 4273 isTiny = status->tininess_before_rounding 4274 || (zExp < -1) 4275 || (zSig + roundIncrement < 0x80000000); 4276 shift32RightJamming( zSig, - zExp, &zSig ); 4277 zExp = 0; 4278 roundBits = zSig & 0x7F; 4279 if (isTiny && roundBits) { 4280 float_raise(float_flag_underflow, status); 4281 } 4282 if (roundingMode == float_round_to_odd) { 4283 /* 4284 * For round-to-odd case, the roundIncrement depends on 4285 * zSig which just changed. 4286 */ 4287 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 4288 } 4289 } 4290 } 4291 if (roundBits) { 4292 float_raise(float_flag_inexact, status); 4293 } 4294 zSig = ( zSig + roundIncrement )>>7; 4295 if (!(roundBits ^ 0x40) && roundNearestEven) { 4296 zSig &= ~1; 4297 } 4298 if ( zSig == 0 ) zExp = 0; 4299 return packFloat32( zSign, zExp, zSig ); 4300 4301 } 4302 4303 /*---------------------------------------------------------------------------- 4304 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4305 | and significand `zSig', and returns the proper single-precision floating- 4306 | point value corresponding to the abstract input. This routine is just like 4307 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 4308 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4309 | floating-point exponent. 4310 *----------------------------------------------------------------------------*/ 4311 4312 static float32 4313 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4314 float_status *status) 4315 { 4316 int8_t shiftCount; 4317 4318 shiftCount = clz32(zSig) - 1; 4319 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4320 status); 4321 4322 } 4323 4324 /*---------------------------------------------------------------------------- 4325 | Normalizes the subnormal double-precision floating-point value represented 4326 | by the denormalized significand `aSig'. 
The normalized exponent and 4327 | significand are stored at the locations pointed to by `zExpPtr' and 4328 | `zSigPtr', respectively. 4329 *----------------------------------------------------------------------------*/ 4330 4331 static void 4332 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4333 { 4334 int8_t shiftCount; 4335 4336 shiftCount = clz64(aSig) - 11; 4337 *zSigPtr = aSig<<shiftCount; 4338 *zExpPtr = 1 - shiftCount; 4339 4340 } 4341 4342 /*---------------------------------------------------------------------------- 4343 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4344 | double-precision floating-point value, returning the result. After being 4345 | shifted into the proper positions, the three fields are simply added 4346 | together to form the result. This means that any integer portion of `zSig' 4347 | will be added into the exponent. Since a properly normalized significand 4348 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4349 | than the desired result exponent whenever `zSig' is a complete, normalized 4350 | significand. 4351 *----------------------------------------------------------------------------*/ 4352 4353 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4354 { 4355 4356 return make_float64( 4357 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4358 4359 } 4360 4361 /*---------------------------------------------------------------------------- 4362 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4363 | and significand `zSig', and returns the proper double-precision floating- 4364 | point value corresponding to the abstract input. Ordinarily, the abstract 4365 | value is simply rounded and packed into the double-precision format, with 4366 | the inexact exception raised if the abstract input cannot be represented 4367 | exactly. However, if the abstract value is too large, the overflow and 4368 | inexact exceptions are raised and an infinity or maximal finite value is 4369 | returned. If the abstract value is too small, the input value is rounded to 4370 | a subnormal number, and the underflow and inexact exceptions are raised if 4371 | the abstract input cannot be represented exactly as a subnormal double- 4372 | precision floating-point number. 4373 | The input significand `zSig' has its binary point between bits 62 4374 | and 61, which is 10 bits to the left of the usual location. This shifted 4375 | significand must be normalized or smaller. If `zSig' is not normalized, 4376 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4377 | and it must not require rounding. In the usual case that `zSig' is 4378 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4379 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4380 | Binary Floating-Point Arithmetic. 
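|
| Round-to-odd is implemented by jamming: when bit 10 of `zSig' (the bit that
| becomes the result's least significant bit) is already 1, the value is
| simply truncated; when it is 0, the increment 0x3FF carries into bit 10
| exactly when the discarded `roundBits' are nonzero, so every inexact result
| is forced to be odd.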
4381 *----------------------------------------------------------------------------*/ 4382 4383 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4384 float_status *status) 4385 { 4386 int8_t roundingMode; 4387 bool roundNearestEven; 4388 int roundIncrement, roundBits; 4389 bool isTiny; 4390 4391 roundingMode = status->float_rounding_mode; 4392 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4393 switch (roundingMode) { 4394 case float_round_nearest_even: 4395 case float_round_ties_away: 4396 roundIncrement = 0x200; 4397 break; 4398 case float_round_to_zero: 4399 roundIncrement = 0; 4400 break; 4401 case float_round_up: 4402 roundIncrement = zSign ? 0 : 0x3ff; 4403 break; 4404 case float_round_down: 4405 roundIncrement = zSign ? 0x3ff : 0; 4406 break; 4407 case float_round_to_odd: 4408 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4409 break; 4410 default: 4411 abort(); 4412 } 4413 roundBits = zSig & 0x3FF; 4414 if ( 0x7FD <= (uint16_t) zExp ) { 4415 if ( ( 0x7FD < zExp ) 4416 || ( ( zExp == 0x7FD ) 4417 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 4418 ) { 4419 bool overflow_to_inf = roundingMode != float_round_to_odd && 4420 roundIncrement != 0; 4421 float_raise(float_flag_overflow | float_flag_inexact, status); 4422 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 4423 } 4424 if ( zExp < 0 ) { 4425 if (status->flush_to_zero) { 4426 float_raise(float_flag_output_denormal, status); 4427 return packFloat64(zSign, 0, 0); 4428 } 4429 isTiny = status->tininess_before_rounding 4430 || (zExp < -1) 4431 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 4432 shift64RightJamming( zSig, - zExp, &zSig ); 4433 zExp = 0; 4434 roundBits = zSig & 0x3FF; 4435 if (isTiny && roundBits) { 4436 float_raise(float_flag_underflow, status); 4437 } 4438 if (roundingMode == float_round_to_odd) { 4439 /* 4440 * For round-to-odd case, the roundIncrement depends on 4441 * zSig which just changed. 4442 */ 4443 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4444 } 4445 } 4446 } 4447 if (roundBits) { 4448 float_raise(float_flag_inexact, status); 4449 } 4450 zSig = ( zSig + roundIncrement )>>10; 4451 if (!(roundBits ^ 0x200) && roundNearestEven) { 4452 zSig &= ~1; 4453 } 4454 if ( zSig == 0 ) zExp = 0; 4455 return packFloat64( zSign, zExp, zSig ); 4456 4457 } 4458 4459 /*---------------------------------------------------------------------------- 4460 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4461 | and significand `zSig', and returns the proper double-precision floating- 4462 | point value corresponding to the abstract input. This routine is just like 4463 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 4464 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4465 | floating-point exponent. 4466 *----------------------------------------------------------------------------*/ 4467 4468 static float64 4469 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4470 float_status *status) 4471 { 4472 int8_t shiftCount; 4473 4474 shiftCount = clz64(zSig) - 1; 4475 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 4476 status); 4477 4478 } 4479 4480 /*---------------------------------------------------------------------------- 4481 | Normalizes the subnormal extended double-precision floating-point value 4482 | represented by the denormalized significand `aSig'. 
The normalized exponent 4483 | and significand are stored at the locations pointed to by `zExpPtr' and 4484 | `zSigPtr', respectively. 4485 *----------------------------------------------------------------------------*/ 4486 4487 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4488 uint64_t *zSigPtr) 4489 { 4490 int8_t shiftCount; 4491 4492 shiftCount = clz64(aSig); 4493 *zSigPtr = aSig<<shiftCount; 4494 *zExpPtr = 1 - shiftCount; 4495 } 4496 4497 /*---------------------------------------------------------------------------- 4498 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4499 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4500 | and returns the proper extended double-precision floating-point value 4501 | corresponding to the abstract input. Ordinarily, the abstract value is 4502 | rounded and packed into the extended double-precision format, with the 4503 | inexact exception raised if the abstract input cannot be represented 4504 | exactly. However, if the abstract value is too large, the overflow and 4505 | inexact exceptions are raised and an infinity or maximal finite value is 4506 | returned. If the abstract value is too small, the input value is rounded to 4507 | a subnormal number, and the underflow and inexact exceptions are raised if 4508 | the abstract input cannot be represented exactly as a subnormal extended 4509 | double-precision floating-point number. 4510 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4511 | number of bits as single or double precision, respectively. Otherwise, the 4512 | result is rounded to the full precision of the extended double-precision 4513 | format. 4514 | The input significand must be normalized or smaller. If the input 4515 | significand is not normalized, `zExp' must be 0; in that case, the result 4516 | returned is a subnormal number, and it must not require rounding. The 4517 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4518 | Floating-Point Arithmetic. 4519 *----------------------------------------------------------------------------*/ 4520 4521 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 4522 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 4523 float_status *status) 4524 { 4525 int8_t roundingMode; 4526 bool roundNearestEven, increment, isTiny; 4527 int64_t roundIncrement, roundMask, roundBits; 4528 4529 roundingMode = status->float_rounding_mode; 4530 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4531 if ( roundingPrecision == 80 ) goto precision80; 4532 if ( roundingPrecision == 64 ) { 4533 roundIncrement = UINT64_C(0x0000000000000400); 4534 roundMask = UINT64_C(0x00000000000007FF); 4535 } 4536 else if ( roundingPrecision == 32 ) { 4537 roundIncrement = UINT64_C(0x0000008000000000); 4538 roundMask = UINT64_C(0x000000FFFFFFFFFF); 4539 } 4540 else { 4541 goto precision80; 4542 } 4543 zSig0 |= ( zSig1 != 0 ); 4544 switch (roundingMode) { 4545 case float_round_nearest_even: 4546 case float_round_ties_away: 4547 break; 4548 case float_round_to_zero: 4549 roundIncrement = 0; 4550 break; 4551 case float_round_up: 4552 roundIncrement = zSign ? 0 : roundMask; 4553 break; 4554 case float_round_down: 4555 roundIncrement = zSign ? 
roundMask : 0; 4556 break; 4557 default: 4558 abort(); 4559 } 4560 roundBits = zSig0 & roundMask; 4561 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4562 if ( ( 0x7FFE < zExp ) 4563 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 4564 ) { 4565 goto overflow; 4566 } 4567 if ( zExp <= 0 ) { 4568 if (status->flush_to_zero) { 4569 float_raise(float_flag_output_denormal, status); 4570 return packFloatx80(zSign, 0, 0); 4571 } 4572 isTiny = status->tininess_before_rounding 4573 || (zExp < 0 ) 4574 || (zSig0 <= zSig0 + roundIncrement); 4575 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 4576 zExp = 0; 4577 roundBits = zSig0 & roundMask; 4578 if (isTiny && roundBits) { 4579 float_raise(float_flag_underflow, status); 4580 } 4581 if (roundBits) { 4582 float_raise(float_flag_inexact, status); 4583 } 4584 zSig0 += roundIncrement; 4585 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4586 roundIncrement = roundMask + 1; 4587 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4588 roundMask |= roundIncrement; 4589 } 4590 zSig0 &= ~ roundMask; 4591 return packFloatx80( zSign, zExp, zSig0 ); 4592 } 4593 } 4594 if (roundBits) { 4595 float_raise(float_flag_inexact, status); 4596 } 4597 zSig0 += roundIncrement; 4598 if ( zSig0 < roundIncrement ) { 4599 ++zExp; 4600 zSig0 = UINT64_C(0x8000000000000000); 4601 } 4602 roundIncrement = roundMask + 1; 4603 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4604 roundMask |= roundIncrement; 4605 } 4606 zSig0 &= ~ roundMask; 4607 if ( zSig0 == 0 ) zExp = 0; 4608 return packFloatx80( zSign, zExp, zSig0 ); 4609 precision80: 4610 switch (roundingMode) { 4611 case float_round_nearest_even: 4612 case float_round_ties_away: 4613 increment = ((int64_t)zSig1 < 0); 4614 break; 4615 case float_round_to_zero: 4616 increment = 0; 4617 break; 4618 case float_round_up: 4619 increment = !zSign && zSig1; 4620 break; 4621 case float_round_down: 4622 increment = zSign && zSig1; 4623 break; 4624 default: 4625 abort(); 4626 } 4627 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4628 if ( ( 0x7FFE < zExp ) 4629 || ( ( zExp == 0x7FFE ) 4630 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 4631 && increment 4632 ) 4633 ) { 4634 roundMask = 0; 4635 overflow: 4636 float_raise(float_flag_overflow | float_flag_inexact, status); 4637 if ( ( roundingMode == float_round_to_zero ) 4638 || ( zSign && ( roundingMode == float_round_up ) ) 4639 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4640 ) { 4641 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 4642 } 4643 return packFloatx80(zSign, 4644 floatx80_infinity_high, 4645 floatx80_infinity_low); 4646 } 4647 if ( zExp <= 0 ) { 4648 isTiny = status->tininess_before_rounding 4649 || (zExp < 0) 4650 || !increment 4651 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 4652 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 4653 zExp = 0; 4654 if (isTiny && zSig1) { 4655 float_raise(float_flag_underflow, status); 4656 } 4657 if (zSig1) { 4658 float_raise(float_flag_inexact, status); 4659 } 4660 switch (roundingMode) { 4661 case float_round_nearest_even: 4662 case float_round_ties_away: 4663 increment = ((int64_t)zSig1 < 0); 4664 break; 4665 case float_round_to_zero: 4666 increment = 0; 4667 break; 4668 case float_round_up: 4669 increment = !zSign && zSig1; 4670 break; 4671 case float_round_down: 4672 increment = zSign && zSig1; 4673 break; 4674 default: 4675 abort(); 4676 } 4677 if ( increment ) { 4678 ++zSig0; 4679 if (!(zSig1 << 1) && roundNearestEven) { 4680 zSig0 &= ~1; 4681 } 4682 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4683 } 4684 return packFloatx80( zSign, zExp, zSig0 ); 4685 } 4686 } 4687 if (zSig1) { 4688 float_raise(float_flag_inexact, status); 4689 } 4690 if ( increment ) { 4691 ++zSig0; 4692 if ( zSig0 == 0 ) { 4693 ++zExp; 4694 zSig0 = UINT64_C(0x8000000000000000); 4695 } 4696 else { 4697 if (!(zSig1 << 1) && roundNearestEven) { 4698 zSig0 &= ~1; 4699 } 4700 } 4701 } 4702 else { 4703 if ( zSig0 == 0 ) zExp = 0; 4704 } 4705 return packFloatx80( zSign, zExp, zSig0 ); 4706 4707 } 4708 4709 /*---------------------------------------------------------------------------- 4710 | Takes an abstract floating-point value having sign `zSign', exponent 4711 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4712 | and returns the proper extended double-precision floating-point value 4713 | corresponding to the abstract input. This routine is just like 4714 | `roundAndPackFloatx80' except that the input significand does not have to be 4715 | normalized. 4716 *----------------------------------------------------------------------------*/ 4717 4718 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4719 bool zSign, int32_t zExp, 4720 uint64_t zSig0, uint64_t zSig1, 4721 float_status *status) 4722 { 4723 int8_t shiftCount; 4724 4725 if ( zSig0 == 0 ) { 4726 zSig0 = zSig1; 4727 zSig1 = 0; 4728 zExp -= 64; 4729 } 4730 shiftCount = clz64(zSig0); 4731 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4732 zExp -= shiftCount; 4733 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4734 zSig0, zSig1, status); 4735 4736 } 4737 4738 /*---------------------------------------------------------------------------- 4739 | Returns the least-significant 64 fraction bits of the quadruple-precision 4740 | floating-point value `a'. 4741 *----------------------------------------------------------------------------*/ 4742 4743 static inline uint64_t extractFloat128Frac1( float128 a ) 4744 { 4745 4746 return a.low; 4747 4748 } 4749 4750 /*---------------------------------------------------------------------------- 4751 | Returns the most-significant 48 fraction bits of the quadruple-precision 4752 | floating-point value `a'. 
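|
| (In this representation the sign occupies bit 63 of `a.high', the 15-bit
| exponent occupies bits 62 through 48 of `a.high', and the 112-bit fraction
| is the low 48 bits of `a.high' concatenated with all 64 bits of `a.low'.)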
4753 *----------------------------------------------------------------------------*/ 4754 4755 static inline uint64_t extractFloat128Frac0( float128 a ) 4756 { 4757 4758 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4759 4760 } 4761 4762 /*---------------------------------------------------------------------------- 4763 | Returns the exponent bits of the quadruple-precision floating-point value 4764 | `a'. 4765 *----------------------------------------------------------------------------*/ 4766 4767 static inline int32_t extractFloat128Exp( float128 a ) 4768 { 4769 4770 return ( a.high>>48 ) & 0x7FFF; 4771 4772 } 4773 4774 /*---------------------------------------------------------------------------- 4775 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4776 *----------------------------------------------------------------------------*/ 4777 4778 static inline bool extractFloat128Sign(float128 a) 4779 { 4780 return a.high >> 63; 4781 } 4782 4783 /*---------------------------------------------------------------------------- 4784 | Normalizes the subnormal quadruple-precision floating-point value 4785 | represented by the denormalized significand formed by the concatenation of 4786 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4787 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4788 | significand are stored at the location pointed to by `zSig0Ptr', and the 4789 | least significant 64 bits of the normalized significand are stored at the 4790 | location pointed to by `zSig1Ptr'. 4791 *----------------------------------------------------------------------------*/ 4792 4793 static void 4794 normalizeFloat128Subnormal( 4795 uint64_t aSig0, 4796 uint64_t aSig1, 4797 int32_t *zExpPtr, 4798 uint64_t *zSig0Ptr, 4799 uint64_t *zSig1Ptr 4800 ) 4801 { 4802 int8_t shiftCount; 4803 4804 if ( aSig0 == 0 ) { 4805 shiftCount = clz64(aSig1) - 15; 4806 if ( shiftCount < 0 ) { 4807 *zSig0Ptr = aSig1>>( - shiftCount ); 4808 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4809 } 4810 else { 4811 *zSig0Ptr = aSig1<<shiftCount; 4812 *zSig1Ptr = 0; 4813 } 4814 *zExpPtr = - shiftCount - 63; 4815 } 4816 else { 4817 shiftCount = clz64(aSig0) - 15; 4818 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4819 *zExpPtr = 1 - shiftCount; 4820 } 4821 4822 } 4823 4824 /*---------------------------------------------------------------------------- 4825 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4826 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4827 | floating-point value, returning the result. After being shifted into the 4828 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4829 | added together to form the most significant 32 bits of the result. This 4830 | means that any integer portion of `zSig0' will be added into the exponent. 4831 | Since a properly normalized significand will have an integer portion equal 4832 | to 1, the `zExp' input should be 1 less than the desired result exponent 4833 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4834 | significand. 
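|
| For example, +1.0 has the normalized significand zSig0 =
| 0x0001000000000000, zSig1 = 0, and is packed as
| packFloat128( 0, 0x3FFE, UINT64_C(0x0001000000000000), 0 ); the integer bit
| carries into the exponent field, giving a.high = 0x3FFF000000000000, the
| quadruple-precision encoding of 1.0.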
4835 *----------------------------------------------------------------------------*/ 4836 4837 static inline float128 4838 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4839 { 4840 float128 z; 4841 4842 z.low = zSig1; 4843 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4844 return z; 4845 } 4846 4847 /*---------------------------------------------------------------------------- 4848 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4849 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4850 | and `zSig2', and returns the proper quadruple-precision floating-point value 4851 | corresponding to the abstract input. Ordinarily, the abstract value is 4852 | simply rounded and packed into the quadruple-precision format, with the 4853 | inexact exception raised if the abstract input cannot be represented 4854 | exactly. However, if the abstract value is too large, the overflow and 4855 | inexact exceptions are raised and an infinity or maximal finite value is 4856 | returned. If the abstract value is too small, the input value is rounded to 4857 | a subnormal number, and the underflow and inexact exceptions are raised if 4858 | the abstract input cannot be represented exactly as a subnormal quadruple- 4859 | precision floating-point number. 4860 | The input significand must be normalized or smaller. If the input 4861 | significand is not normalized, `zExp' must be 0; in that case, the result 4862 | returned is a subnormal number, and it must not require rounding. In the 4863 | usual case that the input significand is normalized, `zExp' must be 1 less 4864 | than the ``true'' floating-point exponent. The handling of underflow and 4865 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4866 *----------------------------------------------------------------------------*/ 4867 4868 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4869 uint64_t zSig0, uint64_t zSig1, 4870 uint64_t zSig2, float_status *status) 4871 { 4872 int8_t roundingMode; 4873 bool roundNearestEven, increment, isTiny; 4874 4875 roundingMode = status->float_rounding_mode; 4876 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4877 switch (roundingMode) { 4878 case float_round_nearest_even: 4879 case float_round_ties_away: 4880 increment = ((int64_t)zSig2 < 0); 4881 break; 4882 case float_round_to_zero: 4883 increment = 0; 4884 break; 4885 case float_round_up: 4886 increment = !zSign && zSig2; 4887 break; 4888 case float_round_down: 4889 increment = zSign && zSig2; 4890 break; 4891 case float_round_to_odd: 4892 increment = !(zSig1 & 0x1) && zSig2; 4893 break; 4894 default: 4895 abort(); 4896 } 4897 if ( 0x7FFD <= (uint32_t) zExp ) { 4898 if ( ( 0x7FFD < zExp ) 4899 || ( ( zExp == 0x7FFD ) 4900 && eq128( 4901 UINT64_C(0x0001FFFFFFFFFFFF), 4902 UINT64_C(0xFFFFFFFFFFFFFFFF), 4903 zSig0, 4904 zSig1 4905 ) 4906 && increment 4907 ) 4908 ) { 4909 float_raise(float_flag_overflow | float_flag_inexact, status); 4910 if ( ( roundingMode == float_round_to_zero ) 4911 || ( zSign && ( roundingMode == float_round_up ) ) 4912 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4913 || (roundingMode == float_round_to_odd) 4914 ) { 4915 return 4916 packFloat128( 4917 zSign, 4918 0x7FFE, 4919 UINT64_C(0x0000FFFFFFFFFFFF), 4920 UINT64_C(0xFFFFFFFFFFFFFFFF) 4921 ); 4922 } 4923 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4924 } 4925 if ( zExp < 0 ) { 4926 if (status->flush_to_zero) { 4927 float_raise(float_flag_output_denormal, status); 4928 return packFloat128(zSign, 0, 0, 0); 4929 } 4930 isTiny = status->tininess_before_rounding 4931 || (zExp < -1) 4932 || !increment 4933 || lt128(zSig0, zSig1, 4934 UINT64_C(0x0001FFFFFFFFFFFF), 4935 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4936 shift128ExtraRightJamming( 4937 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4938 zExp = 0; 4939 if (isTiny && zSig2) { 4940 float_raise(float_flag_underflow, status); 4941 } 4942 switch (roundingMode) { 4943 case float_round_nearest_even: 4944 case float_round_ties_away: 4945 increment = ((int64_t)zSig2 < 0); 4946 break; 4947 case float_round_to_zero: 4948 increment = 0; 4949 break; 4950 case float_round_up: 4951 increment = !zSign && zSig2; 4952 break; 4953 case float_round_down: 4954 increment = zSign && zSig2; 4955 break; 4956 case float_round_to_odd: 4957 increment = !(zSig1 & 0x1) && zSig2; 4958 break; 4959 default: 4960 abort(); 4961 } 4962 } 4963 } 4964 if (zSig2) { 4965 float_raise(float_flag_inexact, status); 4966 } 4967 if ( increment ) { 4968 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4969 if ((zSig2 + zSig2 == 0) && roundNearestEven) { 4970 zSig1 &= ~1; 4971 } 4972 } 4973 else { 4974 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4975 } 4976 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4977 4978 } 4979 4980 /*---------------------------------------------------------------------------- 4981 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4982 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4983 | returns the proper quadruple-precision floating-point value corresponding 4984 | to the abstract input. This routine is just like `roundAndPackFloat128' 4985 | except that the input significand has fewer bits and does not have to be 4986 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4987 | point exponent. 4988 *----------------------------------------------------------------------------*/ 4989 4990 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4991 uint64_t zSig0, uint64_t zSig1, 4992 float_status *status) 4993 { 4994 int8_t shiftCount; 4995 uint64_t zSig2; 4996 4997 if ( zSig0 == 0 ) { 4998 zSig0 = zSig1; 4999 zSig1 = 0; 5000 zExp -= 64; 5001 } 5002 shiftCount = clz64(zSig0) - 15; 5003 if ( 0 <= shiftCount ) { 5004 zSig2 = 0; 5005 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5006 } 5007 else { 5008 shift128ExtraRightJamming( 5009 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 5010 } 5011 zExp -= shiftCount; 5012 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 5013 5014 } 5015 5016 5017 /*---------------------------------------------------------------------------- 5018 | Returns the result of converting the 32-bit two's complement integer `a' 5019 | to the extended double-precision floating-point format. The conversion 5020 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5021 | Arithmetic. 
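|
| For example, a = 1 gives clz32(1) = 31, hence shiftCount = 63; the result
| is packed with exponent 0x403E - 63 = 0x3FFF and significand
| 0x8000000000000000, the extended-precision encoding of 1.0 (note the
| explicit integer bit).  Conversion from a 32-bit integer is always exact.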
5022 *----------------------------------------------------------------------------*/ 5023 5024 floatx80 int32_to_floatx80(int32_t a, float_status *status) 5025 { 5026 bool zSign; 5027 uint32_t absA; 5028 int8_t shiftCount; 5029 uint64_t zSig; 5030 5031 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5032 zSign = ( a < 0 ); 5033 absA = zSign ? - a : a; 5034 shiftCount = clz32(absA) + 32; 5035 zSig = absA; 5036 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 5037 5038 } 5039 5040 /*---------------------------------------------------------------------------- 5041 | Returns the result of converting the 32-bit two's complement integer `a' to 5042 | the quadruple-precision floating-point format. The conversion is performed 5043 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5044 *----------------------------------------------------------------------------*/ 5045 5046 float128 int32_to_float128(int32_t a, float_status *status) 5047 { 5048 bool zSign; 5049 uint32_t absA; 5050 int8_t shiftCount; 5051 uint64_t zSig0; 5052 5053 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5054 zSign = ( a < 0 ); 5055 absA = zSign ? - a : a; 5056 shiftCount = clz32(absA) + 17; 5057 zSig0 = absA; 5058 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5059 5060 } 5061 5062 /*---------------------------------------------------------------------------- 5063 | Returns the result of converting the 64-bit two's complement integer `a' 5064 | to the extended double-precision floating-point format. The conversion 5065 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5066 | Arithmetic. 5067 *----------------------------------------------------------------------------*/ 5068 5069 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5070 { 5071 bool zSign; 5072 uint64_t absA; 5073 int8_t shiftCount; 5074 5075 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5076 zSign = ( a < 0 ); 5077 absA = zSign ? - a : a; 5078 shiftCount = clz64(absA); 5079 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5080 5081 } 5082 5083 /*---------------------------------------------------------------------------- 5084 | Returns the result of converting the 64-bit two's complement integer `a' to 5085 | the quadruple-precision floating-point format. The conversion is performed 5086 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5087 *----------------------------------------------------------------------------*/ 5088 5089 float128 int64_to_float128(int64_t a, float_status *status) 5090 { 5091 bool zSign; 5092 uint64_t absA; 5093 int8_t shiftCount; 5094 int32_t zExp; 5095 uint64_t zSig0, zSig1; 5096 5097 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5098 zSign = ( a < 0 ); 5099 absA = zSign ? - a : a; 5100 shiftCount = clz64(absA) + 49; 5101 zExp = 0x406E - shiftCount; 5102 if ( 64 <= shiftCount ) { 5103 zSig1 = 0; 5104 zSig0 = absA; 5105 shiftCount -= 64; 5106 } 5107 else { 5108 zSig1 = absA; 5109 zSig0 = 0; 5110 } 5111 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5112 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5113 5114 } 5115 5116 /*---------------------------------------------------------------------------- 5117 | Returns the result of converting the 64-bit unsigned integer `a' 5118 | to the quadruple-precision floating-point format. The conversion is performed 5119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
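|
| Because any 64-bit integer fits in the 113-bit quadruple-precision
| significand, this conversion is always exact and raises no exception flags.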
5120 *----------------------------------------------------------------------------*/ 5121 5122 float128 uint64_to_float128(uint64_t a, float_status *status) 5123 { 5124 if (a == 0) { 5125 return float128_zero; 5126 } 5127 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 5128 } 5129 5130 /*---------------------------------------------------------------------------- 5131 | Returns the result of converting the single-precision floating-point value 5132 | `a' to the extended double-precision floating-point format. The conversion 5133 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5134 | Arithmetic. 5135 *----------------------------------------------------------------------------*/ 5136 5137 floatx80 float32_to_floatx80(float32 a, float_status *status) 5138 { 5139 bool aSign; 5140 int aExp; 5141 uint32_t aSig; 5142 5143 a = float32_squash_input_denormal(a, status); 5144 aSig = extractFloat32Frac( a ); 5145 aExp = extractFloat32Exp( a ); 5146 aSign = extractFloat32Sign( a ); 5147 if ( aExp == 0xFF ) { 5148 if (aSig) { 5149 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 5150 status); 5151 return floatx80_silence_nan(res, status); 5152 } 5153 return packFloatx80(aSign, 5154 floatx80_infinity_high, 5155 floatx80_infinity_low); 5156 } 5157 if ( aExp == 0 ) { 5158 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5159 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5160 } 5161 aSig |= 0x00800000; 5162 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 5163 5164 } 5165 5166 /*---------------------------------------------------------------------------- 5167 | Returns the remainder of the single-precision floating-point value `a' 5168 | with respect to the corresponding value `b'. The operation is performed 5169 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
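|
| The IEC/IEEE remainder is a - n*b, where n is the integer nearest the exact
| quotient a/b (ties choosing even n), so the result can be negative even for
| positive operands: float32_rem(7.0, 2.0) = -1.0 (n = 4), where a truncating
| modulus would give 1.0.  The remainder is always exactly representable, so
| the operation never raises the inexact exception.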
5170 *----------------------------------------------------------------------------*/
5171 
5172 float32 float32_rem(float32 a, float32 b, float_status *status)
5173 {
5174     bool aSign, zSign;
5175     int aExp, bExp, expDiff;
5176     uint32_t aSig, bSig;
5177     uint32_t q;
5178     uint64_t aSig64, bSig64, q64;
5179     uint32_t alternateASig;
5180     int32_t sigMean;
5181     a = float32_squash_input_denormal(a, status);
5182     b = float32_squash_input_denormal(b, status);
5183 
5184     aSig = extractFloat32Frac( a );
5185     aExp = extractFloat32Exp( a );
5186     aSign = extractFloat32Sign( a );
5187     bSig = extractFloat32Frac( b );
5188     bExp = extractFloat32Exp( b );
5189     if ( aExp == 0xFF ) {
5190         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
5191             return propagateFloat32NaN(a, b, status);
5192         }
5193         float_raise(float_flag_invalid, status);
5194         return float32_default_nan(status);
5195     }
5196     if ( bExp == 0xFF ) {
5197         if (bSig) {
5198             return propagateFloat32NaN(a, b, status);
5199         }
5200         return a;
5201     }
5202     if ( bExp == 0 ) {
5203         if ( bSig == 0 ) {
5204             float_raise(float_flag_invalid, status);
5205             return float32_default_nan(status);
5206         }
5207         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5208     }
5209     if ( aExp == 0 ) {
5210         if ( aSig == 0 ) return a;
5211         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5212     }
5213     expDiff = aExp - bExp;
5214     aSig |= 0x00800000;
5215     bSig |= 0x00800000;
5216     if ( expDiff < 32 ) {
5217         aSig <<= 8;
5218         bSig <<= 8;
5219         if ( expDiff < 0 ) {
5220             if ( expDiff < -1 ) return a;
5221             aSig >>= 1;
5222         }
5223         q = ( bSig <= aSig );
5224         if ( q ) aSig -= bSig;
5225         if ( 0 < expDiff ) {
5226             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5227             q >>= 32 - expDiff;
5228             bSig >>= 2;
5229             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5230         }
5231         else {
5232             aSig >>= 2;
5233             bSig >>= 2;
5234         }
5235     }
5236     else {
5237         if ( bSig <= aSig ) aSig -= bSig;
5238         aSig64 = ( (uint64_t) aSig )<<40;
5239         bSig64 = ( (uint64_t) bSig )<<40;
5240         expDiff -= 64;
5241         while ( 0 < expDiff ) {
5242             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5243             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5244             aSig64 = - ( ( bSig * q64 )<<38 );
5245             expDiff -= 62;
5246         }
5247         expDiff += 64;
5248         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5249         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5250         q = q64>>( 64 - expDiff );
5251         bSig <<= 6;
5252         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5253     }
5254     do {
5255         alternateASig = aSig;
5256         ++q;
5257         aSig -= bSig;
5258     } while ( 0 <= (int32_t) aSig );
5259     sigMean = aSig + alternateASig;
5260     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5261         aSig = alternateASig;
5262     }
5263     zSign = ( (int32_t) aSig < 0 );
5264     if ( zSign ) aSig = - aSig;
5265     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5266 }
5267 
5268 
5269 
5270 /*----------------------------------------------------------------------------
5271 | Returns the binary exponential of the single-precision floating-point value
5272 | `a'. The operation is performed according to the IEC/IEEE Standard for
5273 | Binary Floating-Point Arithmetic.
5274 |
5275 | Uses the following identities:
5276 |
5277 | 1. -------------------------------------------------------------------------
5278 |      x    x*ln(2)
5279 |     2  = e
5280 |
5281 | 2. -------------------------------------------------------------------------
5282 |                      2     3     4     5           n
5283 |      x        x     x     x     x     x           x
5284 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5285 |               1!    2!    3!    4!    5!          n!
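|
| For example, a = 1.0 gives x = ln(2) ~= 0.6931; summing the series through
| the x**15/15! term in double precision reproduces 2**1 with a truncation
| error of about x**16/16! ~= 1.4e-16, far below single-precision resolution,
| so the value converted back to single precision is exactly 2.0.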
5286 *----------------------------------------------------------------------------*/ 5287 5288 static const float64 float32_exp2_coefficients[15] = 5289 { 5290 const_float64( 0x3ff0000000000000ll ), /* 1 */ 5291 const_float64( 0x3fe0000000000000ll ), /* 2 */ 5292 const_float64( 0x3fc5555555555555ll ), /* 3 */ 5293 const_float64( 0x3fa5555555555555ll ), /* 4 */ 5294 const_float64( 0x3f81111111111111ll ), /* 5 */ 5295 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 5296 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 5297 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 5298 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 5299 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 5300 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 5301 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 5302 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 5303 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 5304 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 5305 }; 5306 5307 float32 float32_exp2(float32 a, float_status *status) 5308 { 5309 bool aSign; 5310 int aExp; 5311 uint32_t aSig; 5312 float64 r, x, xn; 5313 int i; 5314 a = float32_squash_input_denormal(a, status); 5315 5316 aSig = extractFloat32Frac( a ); 5317 aExp = extractFloat32Exp( a ); 5318 aSign = extractFloat32Sign( a ); 5319 5320 if ( aExp == 0xFF) { 5321 if (aSig) { 5322 return propagateFloat32NaN(a, float32_zero, status); 5323 } 5324 return (aSign) ? float32_zero : a; 5325 } 5326 if (aExp == 0) { 5327 if (aSig == 0) return float32_one; 5328 } 5329 5330 float_raise(float_flag_inexact, status); 5331 5332 /* ******************************* */ 5333 /* using float64 for approximation */ 5334 /* ******************************* */ 5335 x = float32_to_float64(a, status); 5336 x = float64_mul(x, float64_ln2, status); 5337 5338 xn = x; 5339 r = float64_one; 5340 for (i = 0 ; i < 15 ; i++) { 5341 float64 f; 5342 5343 f = float64_mul(xn, float32_exp2_coefficients[i], status); 5344 r = float64_add(r, f, status); 5345 5346 xn = float64_mul(xn, x, status); 5347 } 5348 5349 return float64_to_float32(r, status); 5350 } 5351 5352 /*---------------------------------------------------------------------------- 5353 | Returns the binary log of the single-precision floating-point value `a'. 5354 | The operation is performed according to the IEC/IEEE Standard for Binary 5355 | Floating-Point Arithmetic. 
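|
| The 23 fraction bits of the result are generated one at a time by repeated
| squaring of the significand m, kept in the range 1 <= m < 2 as a fixed-
| point value with 23 fraction bits: the next result bit of log2(m) is 1
| exactly when m*m >= 2, in which case m*m is halved to bring it back into
| range.  The integer part comes from the unbiased exponent, and the final
| value is negated when the logarithm is negative.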
5356 *----------------------------------------------------------------------------*/ 5357 float32 float32_log2(float32 a, float_status *status) 5358 { 5359 bool aSign, zSign; 5360 int aExp; 5361 uint32_t aSig, zSig, i; 5362 5363 a = float32_squash_input_denormal(a, status); 5364 aSig = extractFloat32Frac( a ); 5365 aExp = extractFloat32Exp( a ); 5366 aSign = extractFloat32Sign( a ); 5367 5368 if ( aExp == 0 ) { 5369 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 5370 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5371 } 5372 if ( aSign ) { 5373 float_raise(float_flag_invalid, status); 5374 return float32_default_nan(status); 5375 } 5376 if ( aExp == 0xFF ) { 5377 if (aSig) { 5378 return propagateFloat32NaN(a, float32_zero, status); 5379 } 5380 return a; 5381 } 5382 5383 aExp -= 0x7F; 5384 aSig |= 0x00800000; 5385 zSign = aExp < 0; 5386 zSig = aExp << 23; 5387 5388 for (i = 1 << 22; i > 0; i >>= 1) { 5389 aSig = ( (uint64_t)aSig * aSig ) >> 23; 5390 if ( aSig & 0x01000000 ) { 5391 aSig >>= 1; 5392 zSig |= i; 5393 } 5394 } 5395 5396 if ( zSign ) 5397 zSig = -zSig; 5398 5399 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 5400 } 5401 5402 /*---------------------------------------------------------------------------- 5403 | Returns the result of converting the double-precision floating-point value 5404 | `a' to the extended double-precision floating-point format. The conversion 5405 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5406 | Arithmetic. 5407 *----------------------------------------------------------------------------*/ 5408 5409 floatx80 float64_to_floatx80(float64 a, float_status *status) 5410 { 5411 bool aSign; 5412 int aExp; 5413 uint64_t aSig; 5414 5415 a = float64_squash_input_denormal(a, status); 5416 aSig = extractFloat64Frac( a ); 5417 aExp = extractFloat64Exp( a ); 5418 aSign = extractFloat64Sign( a ); 5419 if ( aExp == 0x7FF ) { 5420 if (aSig) { 5421 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 5422 status); 5423 return floatx80_silence_nan(res, status); 5424 } 5425 return packFloatx80(aSign, 5426 floatx80_infinity_high, 5427 floatx80_infinity_low); 5428 } 5429 if ( aExp == 0 ) { 5430 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5431 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5432 } 5433 return 5434 packFloatx80( 5435 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5436 5437 } 5438 5439 /*---------------------------------------------------------------------------- 5440 | Returns the remainder of the double-precision floating-point value `a' 5441 | with respect to the corresponding value `b'. The operation is performed 5442 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
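|
| The semantics match those described for float32_rem above: the result is
| exact (the inexact exception is never raised), its magnitude is at most
| half that of `b', and it may be negative even when both operands are
| positive.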
5443 *----------------------------------------------------------------------------*/ 5444 5445 float64 float64_rem(float64 a, float64 b, float_status *status) 5446 { 5447 bool aSign, zSign; 5448 int aExp, bExp, expDiff; 5449 uint64_t aSig, bSig; 5450 uint64_t q, alternateASig; 5451 int64_t sigMean; 5452 5453 a = float64_squash_input_denormal(a, status); 5454 b = float64_squash_input_denormal(b, status); 5455 aSig = extractFloat64Frac( a ); 5456 aExp = extractFloat64Exp( a ); 5457 aSign = extractFloat64Sign( a ); 5458 bSig = extractFloat64Frac( b ); 5459 bExp = extractFloat64Exp( b ); 5460 if ( aExp == 0x7FF ) { 5461 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5462 return propagateFloat64NaN(a, b, status); 5463 } 5464 float_raise(float_flag_invalid, status); 5465 return float64_default_nan(status); 5466 } 5467 if ( bExp == 0x7FF ) { 5468 if (bSig) { 5469 return propagateFloat64NaN(a, b, status); 5470 } 5471 return a; 5472 } 5473 if ( bExp == 0 ) { 5474 if ( bSig == 0 ) { 5475 float_raise(float_flag_invalid, status); 5476 return float64_default_nan(status); 5477 } 5478 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5479 } 5480 if ( aExp == 0 ) { 5481 if ( aSig == 0 ) return a; 5482 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5483 } 5484 expDiff = aExp - bExp; 5485 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5486 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5487 if ( expDiff < 0 ) { 5488 if ( expDiff < -1 ) return a; 5489 aSig >>= 1; 5490 } 5491 q = ( bSig <= aSig ); 5492 if ( q ) aSig -= bSig; 5493 expDiff -= 64; 5494 while ( 0 < expDiff ) { 5495 q = estimateDiv128To64( aSig, 0, bSig ); 5496 q = ( 2 < q ) ? q - 2 : 0; 5497 aSig = - ( ( bSig>>2 ) * q ); 5498 expDiff -= 62; 5499 } 5500 expDiff += 64; 5501 if ( 0 < expDiff ) { 5502 q = estimateDiv128To64( aSig, 0, bSig ); 5503 q = ( 2 < q ) ? q - 2 : 0; 5504 q >>= 64 - expDiff; 5505 bSig >>= 2; 5506 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5507 } 5508 else { 5509 aSig >>= 2; 5510 bSig >>= 2; 5511 } 5512 do { 5513 alternateASig = aSig; 5514 ++q; 5515 aSig -= bSig; 5516 } while ( 0 <= (int64_t) aSig ); 5517 sigMean = aSig + alternateASig; 5518 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5519 aSig = alternateASig; 5520 } 5521 zSign = ( (int64_t) aSig < 0 ); 5522 if ( zSign ) aSig = - aSig; 5523 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5524 5525 } 5526 5527 /*---------------------------------------------------------------------------- 5528 | Returns the binary log of the double-precision floating-point value `a'. 5529 | The operation is performed according to the IEC/IEEE Standard for Binary 5530 | Floating-Point Arithmetic. 
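|
| This uses the same bit-by-bit squaring algorithm as float32_log2 above,
| producing the 52 fraction bits of the result with mul64To128() supplying
| the 128-bit intermediate squares.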
5531 *----------------------------------------------------------------------------*/ 5532 float64 float64_log2(float64 a, float_status *status) 5533 { 5534 bool aSign, zSign; 5535 int aExp; 5536 uint64_t aSig, aSig0, aSig1, zSig, i; 5537 a = float64_squash_input_denormal(a, status); 5538 5539 aSig = extractFloat64Frac( a ); 5540 aExp = extractFloat64Exp( a ); 5541 aSign = extractFloat64Sign( a ); 5542 5543 if ( aExp == 0 ) { 5544 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5545 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5546 } 5547 if ( aSign ) { 5548 float_raise(float_flag_invalid, status); 5549 return float64_default_nan(status); 5550 } 5551 if ( aExp == 0x7FF ) { 5552 if (aSig) { 5553 return propagateFloat64NaN(a, float64_zero, status); 5554 } 5555 return a; 5556 } 5557 5558 aExp -= 0x3FF; 5559 aSig |= UINT64_C(0x0010000000000000); 5560 zSign = aExp < 0; 5561 zSig = (uint64_t)aExp << 52; 5562 for (i = 1LL << 51; i > 0; i >>= 1) { 5563 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5564 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5565 if ( aSig & UINT64_C(0x0020000000000000) ) { 5566 aSig >>= 1; 5567 zSig |= i; 5568 } 5569 } 5570 5571 if ( zSign ) 5572 zSig = -zSig; 5573 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5574 } 5575 5576 /*---------------------------------------------------------------------------- 5577 | Returns the result of converting the extended double-precision floating- 5578 | point value `a' to the 32-bit two's complement integer format. The 5579 | conversion is performed according to the IEC/IEEE Standard for Binary 5580 | Floating-Point Arithmetic---which means in particular that the conversion 5581 | is rounded according to the current rounding mode. If `a' is a NaN, the 5582 | largest positive integer is returned. Otherwise, if the conversion 5583 | overflows, the largest integer with the same sign as `a' is returned. 5584 *----------------------------------------------------------------------------*/ 5585 5586 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5587 { 5588 bool aSign; 5589 int32_t aExp, shiftCount; 5590 uint64_t aSig; 5591 5592 if (floatx80_invalid_encoding(a)) { 5593 float_raise(float_flag_invalid, status); 5594 return 1 << 31; 5595 } 5596 aSig = extractFloatx80Frac( a ); 5597 aExp = extractFloatx80Exp( a ); 5598 aSign = extractFloatx80Sign( a ); 5599 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5600 shiftCount = 0x4037 - aExp; 5601 if ( shiftCount <= 0 ) shiftCount = 1; 5602 shift64RightJamming( aSig, shiftCount, &aSig ); 5603 return roundAndPackInt32(aSign, aSig, status); 5604 5605 } 5606 5607 /*---------------------------------------------------------------------------- 5608 | Returns the result of converting the extended double-precision floating- 5609 | point value `a' to the 32-bit two's complement integer format. The 5610 | conversion is performed according to the IEC/IEEE Standard for Binary 5611 | Floating-Point Arithmetic, except that the conversion is always rounded 5612 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5613 | Otherwise, if the conversion overflows, the largest integer with the same 5614 | sign as `a' is returned. 
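|
| As with floatx80_to_int32 above, operands rejected by
| floatx80_invalid_encoding() (e.g. encodings whose explicit integer bit is
| clear but whose exponent is nonzero) raise the invalid exception and return
| INT32_MIN before any conversion is attempted.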
5615 *----------------------------------------------------------------------------*/ 5616 5617 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5618 { 5619 bool aSign; 5620 int32_t aExp, shiftCount; 5621 uint64_t aSig, savedASig; 5622 int32_t z; 5623 5624 if (floatx80_invalid_encoding(a)) { 5625 float_raise(float_flag_invalid, status); 5626 return 1 << 31; 5627 } 5628 aSig = extractFloatx80Frac( a ); 5629 aExp = extractFloatx80Exp( a ); 5630 aSign = extractFloatx80Sign( a ); 5631 if ( 0x401E < aExp ) { 5632 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5633 goto invalid; 5634 } 5635 else if ( aExp < 0x3FFF ) { 5636 if (aExp || aSig) { 5637 float_raise(float_flag_inexact, status); 5638 } 5639 return 0; 5640 } 5641 shiftCount = 0x403E - aExp; 5642 savedASig = aSig; 5643 aSig >>= shiftCount; 5644 z = aSig; 5645 if ( aSign ) z = - z; 5646 if ( ( z < 0 ) ^ aSign ) { 5647 invalid: 5648 float_raise(float_flag_invalid, status); 5649 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5650 } 5651 if ( ( aSig<<shiftCount ) != savedASig ) { 5652 float_raise(float_flag_inexact, status); 5653 } 5654 return z; 5655 5656 } 5657 5658 /*---------------------------------------------------------------------------- 5659 | Returns the result of converting the extended double-precision floating- 5660 | point value `a' to the 64-bit two's complement integer format. The 5661 | conversion is performed according to the IEC/IEEE Standard for Binary 5662 | Floating-Point Arithmetic---which means in particular that the conversion 5663 | is rounded according to the current rounding mode. If `a' is a NaN, 5664 | the largest positive integer is returned. Otherwise, if the conversion 5665 | overflows, the largest integer with the same sign as `a' is returned. 5666 *----------------------------------------------------------------------------*/ 5667 5668 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5669 { 5670 bool aSign; 5671 int32_t aExp, shiftCount; 5672 uint64_t aSig, aSigExtra; 5673 5674 if (floatx80_invalid_encoding(a)) { 5675 float_raise(float_flag_invalid, status); 5676 return 1ULL << 63; 5677 } 5678 aSig = extractFloatx80Frac( a ); 5679 aExp = extractFloatx80Exp( a ); 5680 aSign = extractFloatx80Sign( a ); 5681 shiftCount = 0x403E - aExp; 5682 if ( shiftCount <= 0 ) { 5683 if ( shiftCount ) { 5684 float_raise(float_flag_invalid, status); 5685 if (!aSign || floatx80_is_any_nan(a)) { 5686 return INT64_MAX; 5687 } 5688 return INT64_MIN; 5689 } 5690 aSigExtra = 0; 5691 } 5692 else { 5693 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5694 } 5695 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5696 5697 } 5698 5699 /*---------------------------------------------------------------------------- 5700 | Returns the result of converting the extended double-precision floating- 5701 | point value `a' to the 64-bit two's complement integer format. The 5702 | conversion is performed according to the IEC/IEEE Standard for Binary 5703 | Floating-Point Arithmetic, except that the conversion is always rounded 5704 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5705 | Otherwise, if the conversion overflows, the largest integer with the same 5706 | sign as `a' is returned. 
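|
| For example (a sketch; assumes int64_to_floatx80() and INT64_MIN from
| <stdint.h>):
|
|     float_status st = { 0 };
|     floatx80 x = int64_to_floatx80(INT64_MIN, &st);
|     int64_t n = floatx80_to_int64_round_to_zero(x, &st);
|
| -2^63 is exactly representable, so n == INT64_MIN with no flags raised;
| anything more negative would instead raise float_flag_invalid.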
5707 *----------------------------------------------------------------------------*/ 5708 5709 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5710 { 5711 bool aSign; 5712 int32_t aExp, shiftCount; 5713 uint64_t aSig; 5714 int64_t z; 5715 5716 if (floatx80_invalid_encoding(a)) { 5717 float_raise(float_flag_invalid, status); 5718 return 1ULL << 63; 5719 } 5720 aSig = extractFloatx80Frac( a ); 5721 aExp = extractFloatx80Exp( a ); 5722 aSign = extractFloatx80Sign( a ); 5723 shiftCount = aExp - 0x403E; 5724 if ( 0 <= shiftCount ) { 5725 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5726 if ( ( a.high != 0xC03E ) || aSig ) { 5727 float_raise(float_flag_invalid, status); 5728 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5729 return INT64_MAX; 5730 } 5731 } 5732 return INT64_MIN; 5733 } 5734 else if ( aExp < 0x3FFF ) { 5735 if (aExp | aSig) { 5736 float_raise(float_flag_inexact, status); 5737 } 5738 return 0; 5739 } 5740 z = aSig>>( - shiftCount ); 5741 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5742 float_raise(float_flag_inexact, status); 5743 } 5744 if ( aSign ) z = - z; 5745 return z; 5746 5747 } 5748 5749 /*---------------------------------------------------------------------------- 5750 | Returns the result of converting the extended double-precision floating- 5751 | point value `a' to the single-precision floating-point format. The 5752 | conversion is performed according to the IEC/IEEE Standard for Binary 5753 | Floating-Point Arithmetic. 5754 *----------------------------------------------------------------------------*/ 5755 5756 float32 floatx80_to_float32(floatx80 a, float_status *status) 5757 { 5758 bool aSign; 5759 int32_t aExp; 5760 uint64_t aSig; 5761 5762 if (floatx80_invalid_encoding(a)) { 5763 float_raise(float_flag_invalid, status); 5764 return float32_default_nan(status); 5765 } 5766 aSig = extractFloatx80Frac( a ); 5767 aExp = extractFloatx80Exp( a ); 5768 aSign = extractFloatx80Sign( a ); 5769 if ( aExp == 0x7FFF ) { 5770 if ( (uint64_t) ( aSig<<1 ) ) { 5771 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5772 status); 5773 return float32_silence_nan(res, status); 5774 } 5775 return packFloat32( aSign, 0xFF, 0 ); 5776 } 5777 shift64RightJamming( aSig, 33, &aSig ); 5778 if ( aExp || aSig ) aExp -= 0x3F81; 5779 return roundAndPackFloat32(aSign, aExp, aSig, status); 5780 5781 } 5782 5783 /*---------------------------------------------------------------------------- 5784 | Returns the result of converting the extended double-precision floating- 5785 | point value `a' to the double-precision floating-point format. The 5786 | conversion is performed according to the IEC/IEEE Standard for Binary 5787 | Floating-Point Arithmetic. 
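|
| The 64-bit floatx80 significand can hold a double's 53-bit significand
| exactly, so widening a double and narrowing it back is lossless. A sketch
| (hypothetical harness; zero-initialized float_status assumed):
|
|     float_status st = { 0 };
|     float64 d = make_float64(0x3FF8000000000000ULL);
|     float64 r = floatx80_to_float64(float64_to_floatx80(d, &st), &st);
|
| Here d is 1.5; float64_val(r) == float64_val(d) and no flags are raised.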
5788 *----------------------------------------------------------------------------*/
5789 
5790 float64 floatx80_to_float64(floatx80 a, float_status *status)
5791 {
5792     bool aSign;
5793     int32_t aExp;
5794     uint64_t aSig, zSig;
5795 
5796     if (floatx80_invalid_encoding(a)) {
5797         float_raise(float_flag_invalid, status);
5798         return float64_default_nan(status);
5799     }
5800     aSig = extractFloatx80Frac( a );
5801     aExp = extractFloatx80Exp( a );
5802     aSign = extractFloatx80Sign( a );
5803     if ( aExp == 0x7FFF ) {
5804         if ( (uint64_t) ( aSig<<1 ) ) {
5805             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5806                                              status);
5807             return float64_silence_nan(res, status);
5808         }
5809         return packFloat64( aSign, 0x7FF, 0 );
5810     }
5811     shift64RightJamming( aSig, 1, &zSig );
5812     if ( aExp || aSig ) aExp -= 0x3C01;
5813     return roundAndPackFloat64(aSign, aExp, zSig, status);
5814 
5815 }
5816 
5817 /*----------------------------------------------------------------------------
5818 | Returns the result of converting the extended double-precision floating-
5819 | point value `a' to the quadruple-precision floating-point format.  The
5820 | conversion is performed according to the IEC/IEEE Standard for Binary
5821 | Floating-Point Arithmetic.
5822 *----------------------------------------------------------------------------*/
5823 
5824 float128 floatx80_to_float128(floatx80 a, float_status *status)
5825 {
5826     bool aSign;
5827     int aExp;
5828     uint64_t aSig, zSig0, zSig1;
5829 
5830     if (floatx80_invalid_encoding(a)) {
5831         float_raise(float_flag_invalid, status);
5832         return float128_default_nan(status);
5833     }
5834     aSig = extractFloatx80Frac( a );
5835     aExp = extractFloatx80Exp( a );
5836     aSign = extractFloatx80Sign( a );
5837     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5838         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5839                                            status);
5840         return float128_silence_nan(res, status);
5841     }
5842     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5843     return packFloat128( aSign, aExp, zSig0, zSig1 );
5844 
5845 }
5846 
5847 /*----------------------------------------------------------------------------
5848 | Rounds the extended double-precision floating-point value `a'
5849 | to the precision provided by floatx80_rounding_precision and returns the
5850 | result as an extended double-precision floating-point value.
5851 | The operation is performed according to the IEC/IEEE Standard for Binary
5852 | Floating-Point Arithmetic.
5853 *----------------------------------------------------------------------------*/
5854 
5855 floatx80 floatx80_round(floatx80 a, float_status *status)
5856 {
5857     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5858                                 extractFloatx80Sign(a),
5859                                 extractFloatx80Exp(a),
5860                                 extractFloatx80Frac(a), 0, status);
5861 }
5862 
5863 /*----------------------------------------------------------------------------
5864 | Rounds the extended double-precision floating-point value `a' to an integer,
5865 | and returns the result as an extended double-precision floating-point
5866 | value.  The operation is performed according to the IEC/IEEE Standard for
5867 | Binary Floating-Point Arithmetic.
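|
| A sketch of the rounding-mode dependence (assumes set_float_rounding_mode()
| as used elsewhere in this file, and a zero-initialized float_status):
|
|     float_status st = { 0 };
|     floatx80 half = float64_to_floatx80(make_float64(0x3FE0000000000000ULL),
|                                         &st);
|     set_float_rounding_mode(float_round_nearest_even, &st);
|     floatx80 z0 = floatx80_round_to_int(half, &st);
|     set_float_rounding_mode(float_round_up, &st);
|     floatx80 z1 = floatx80_round_to_int(half, &st);
|
| The input is 0.5: z0 is +0 (the tie goes to even), z1 is 1.0, and both
| calls raise float_flag_inexact.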
5868 *----------------------------------------------------------------------------*/ 5869 5870 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5871 { 5872 bool aSign; 5873 int32_t aExp; 5874 uint64_t lastBitMask, roundBitsMask; 5875 floatx80 z; 5876 5877 if (floatx80_invalid_encoding(a)) { 5878 float_raise(float_flag_invalid, status); 5879 return floatx80_default_nan(status); 5880 } 5881 aExp = extractFloatx80Exp( a ); 5882 if ( 0x403E <= aExp ) { 5883 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5884 return propagateFloatx80NaN(a, a, status); 5885 } 5886 return a; 5887 } 5888 if ( aExp < 0x3FFF ) { 5889 if ( ( aExp == 0 ) 5890 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5891 return a; 5892 } 5893 float_raise(float_flag_inexact, status); 5894 aSign = extractFloatx80Sign( a ); 5895 switch (status->float_rounding_mode) { 5896 case float_round_nearest_even: 5897 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5898 ) { 5899 return 5900 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5901 } 5902 break; 5903 case float_round_ties_away: 5904 if (aExp == 0x3FFE) { 5905 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5906 } 5907 break; 5908 case float_round_down: 5909 return 5910 aSign ? 5911 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5912 : packFloatx80( 0, 0, 0 ); 5913 case float_round_up: 5914 return 5915 aSign ? packFloatx80( 1, 0, 0 ) 5916 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5917 5918 case float_round_to_zero: 5919 break; 5920 default: 5921 g_assert_not_reached(); 5922 } 5923 return packFloatx80( aSign, 0, 0 ); 5924 } 5925 lastBitMask = 1; 5926 lastBitMask <<= 0x403E - aExp; 5927 roundBitsMask = lastBitMask - 1; 5928 z = a; 5929 switch (status->float_rounding_mode) { 5930 case float_round_nearest_even: 5931 z.low += lastBitMask>>1; 5932 if ((z.low & roundBitsMask) == 0) { 5933 z.low &= ~lastBitMask; 5934 } 5935 break; 5936 case float_round_ties_away: 5937 z.low += lastBitMask >> 1; 5938 break; 5939 case float_round_to_zero: 5940 break; 5941 case float_round_up: 5942 if (!extractFloatx80Sign(z)) { 5943 z.low += roundBitsMask; 5944 } 5945 break; 5946 case float_round_down: 5947 if (extractFloatx80Sign(z)) { 5948 z.low += roundBitsMask; 5949 } 5950 break; 5951 default: 5952 abort(); 5953 } 5954 z.low &= ~ roundBitsMask; 5955 if ( z.low == 0 ) { 5956 ++z.high; 5957 z.low = UINT64_C(0x8000000000000000); 5958 } 5959 if (z.low != a.low) { 5960 float_raise(float_flag_inexact, status); 5961 } 5962 return z; 5963 5964 } 5965 5966 /*---------------------------------------------------------------------------- 5967 | Returns the result of adding the absolute values of the extended double- 5968 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5969 | negated before being returned. `zSign' is ignored if the result is a NaN. 5970 | The addition is performed according to the IEC/IEEE Standard for Binary 5971 | Floating-Point Arithmetic. 
5972 *----------------------------------------------------------------------------*/ 5973 5974 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5975 float_status *status) 5976 { 5977 int32_t aExp, bExp, zExp; 5978 uint64_t aSig, bSig, zSig0, zSig1; 5979 int32_t expDiff; 5980 5981 aSig = extractFloatx80Frac( a ); 5982 aExp = extractFloatx80Exp( a ); 5983 bSig = extractFloatx80Frac( b ); 5984 bExp = extractFloatx80Exp( b ); 5985 expDiff = aExp - bExp; 5986 if ( 0 < expDiff ) { 5987 if ( aExp == 0x7FFF ) { 5988 if ((uint64_t)(aSig << 1)) { 5989 return propagateFloatx80NaN(a, b, status); 5990 } 5991 return a; 5992 } 5993 if ( bExp == 0 ) --expDiff; 5994 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5995 zExp = aExp; 5996 } 5997 else if ( expDiff < 0 ) { 5998 if ( bExp == 0x7FFF ) { 5999 if ((uint64_t)(bSig << 1)) { 6000 return propagateFloatx80NaN(a, b, status); 6001 } 6002 return packFloatx80(zSign, 6003 floatx80_infinity_high, 6004 floatx80_infinity_low); 6005 } 6006 if ( aExp == 0 ) ++expDiff; 6007 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6008 zExp = bExp; 6009 } 6010 else { 6011 if ( aExp == 0x7FFF ) { 6012 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6013 return propagateFloatx80NaN(a, b, status); 6014 } 6015 return a; 6016 } 6017 zSig1 = 0; 6018 zSig0 = aSig + bSig; 6019 if ( aExp == 0 ) { 6020 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) { 6021 /* At least one of the values is a pseudo-denormal, 6022 * and there is a carry out of the result. */ 6023 zExp = 1; 6024 goto shiftRight1; 6025 } 6026 if (zSig0 == 0) { 6027 return packFloatx80(zSign, 0, 0); 6028 } 6029 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 6030 goto roundAndPack; 6031 } 6032 zExp = aExp; 6033 goto shiftRight1; 6034 } 6035 zSig0 = aSig + bSig; 6036 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 6037 shiftRight1: 6038 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6039 zSig0 |= UINT64_C(0x8000000000000000); 6040 ++zExp; 6041 roundAndPack: 6042 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6043 zSign, zExp, zSig0, zSig1, status); 6044 } 6045 6046 /*---------------------------------------------------------------------------- 6047 | Returns the result of subtracting the absolute values of the extended 6048 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 6049 | difference is negated before being returned. `zSign' is ignored if the 6050 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6051 | Standard for Binary Floating-Point Arithmetic. 
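|
| Note that an exactly zero difference is +0 in all rounding modes except
| roundTowardNegative, which yields -0 (see the packFloatx80() call below
| that tests for float_round_down). A sketch via the public wrapper,
| assuming int32_to_floatx80():
|
|     float_status st = { 0 };
|     set_float_rounding_mode(float_round_down, &st);
|     floatx80 z = floatx80_sub(int32_to_floatx80(1, &st),
|                               int32_to_floatx80(1, &st), &st);
|
| Here extractFloatx80Sign(z) is 1, i.e. z is -0.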
6052 *----------------------------------------------------------------------------*/ 6053 6054 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 6055 float_status *status) 6056 { 6057 int32_t aExp, bExp, zExp; 6058 uint64_t aSig, bSig, zSig0, zSig1; 6059 int32_t expDiff; 6060 6061 aSig = extractFloatx80Frac( a ); 6062 aExp = extractFloatx80Exp( a ); 6063 bSig = extractFloatx80Frac( b ); 6064 bExp = extractFloatx80Exp( b ); 6065 expDiff = aExp - bExp; 6066 if ( 0 < expDiff ) goto aExpBigger; 6067 if ( expDiff < 0 ) goto bExpBigger; 6068 if ( aExp == 0x7FFF ) { 6069 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6070 return propagateFloatx80NaN(a, b, status); 6071 } 6072 float_raise(float_flag_invalid, status); 6073 return floatx80_default_nan(status); 6074 } 6075 if ( aExp == 0 ) { 6076 aExp = 1; 6077 bExp = 1; 6078 } 6079 zSig1 = 0; 6080 if ( bSig < aSig ) goto aBigger; 6081 if ( aSig < bSig ) goto bBigger; 6082 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 6083 bExpBigger: 6084 if ( bExp == 0x7FFF ) { 6085 if ((uint64_t)(bSig << 1)) { 6086 return propagateFloatx80NaN(a, b, status); 6087 } 6088 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 6089 floatx80_infinity_low); 6090 } 6091 if ( aExp == 0 ) ++expDiff; 6092 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6093 bBigger: 6094 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 6095 zExp = bExp; 6096 zSign ^= 1; 6097 goto normalizeRoundAndPack; 6098 aExpBigger: 6099 if ( aExp == 0x7FFF ) { 6100 if ((uint64_t)(aSig << 1)) { 6101 return propagateFloatx80NaN(a, b, status); 6102 } 6103 return a; 6104 } 6105 if ( bExp == 0 ) --expDiff; 6106 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 6107 aBigger: 6108 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 6109 zExp = aExp; 6110 normalizeRoundAndPack: 6111 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6112 zSign, zExp, zSig0, zSig1, status); 6113 } 6114 6115 /*---------------------------------------------------------------------------- 6116 | Returns the result of adding the extended double-precision floating-point 6117 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6118 | Standard for Binary Floating-Point Arithmetic. 6119 *----------------------------------------------------------------------------*/ 6120 6121 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6122 { 6123 bool aSign, bSign; 6124 6125 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6126 float_raise(float_flag_invalid, status); 6127 return floatx80_default_nan(status); 6128 } 6129 aSign = extractFloatx80Sign( a ); 6130 bSign = extractFloatx80Sign( b ); 6131 if ( aSign == bSign ) { 6132 return addFloatx80Sigs(a, b, aSign, status); 6133 } 6134 else { 6135 return subFloatx80Sigs(a, b, aSign, status); 6136 } 6137 6138 } 6139 6140 /*---------------------------------------------------------------------------- 6141 | Returns the result of subtracting the extended double-precision floating- 6142 | point values `a' and `b'. The operation is performed according to the 6143 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
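|
| Like floatx80_add() above, this routine dispatches on the operand signs:
| equal signs subtract the magnitudes (subFloatx80Sigs), differing signs add
| them (addFloatx80Sigs). For example:
|
|     float_status st = { 0 };
|     floatx80 d = floatx80_sub(int32_to_floatx80(5, &st),
|                               int32_to_floatx80(3, &st), &st);
|
| Both operands are positive, so the magnitudes are subtracted and d is 2.0.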
6144 *----------------------------------------------------------------------------*/ 6145 6146 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 6147 { 6148 bool aSign, bSign; 6149 6150 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6151 float_raise(float_flag_invalid, status); 6152 return floatx80_default_nan(status); 6153 } 6154 aSign = extractFloatx80Sign( a ); 6155 bSign = extractFloatx80Sign( b ); 6156 if ( aSign == bSign ) { 6157 return subFloatx80Sigs(a, b, aSign, status); 6158 } 6159 else { 6160 return addFloatx80Sigs(a, b, aSign, status); 6161 } 6162 6163 } 6164 6165 /*---------------------------------------------------------------------------- 6166 | Returns the result of multiplying the extended double-precision floating- 6167 | point values `a' and `b'. The operation is performed according to the 6168 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6169 *----------------------------------------------------------------------------*/ 6170 6171 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 6172 { 6173 bool aSign, bSign, zSign; 6174 int32_t aExp, bExp, zExp; 6175 uint64_t aSig, bSig, zSig0, zSig1; 6176 6177 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6178 float_raise(float_flag_invalid, status); 6179 return floatx80_default_nan(status); 6180 } 6181 aSig = extractFloatx80Frac( a ); 6182 aExp = extractFloatx80Exp( a ); 6183 aSign = extractFloatx80Sign( a ); 6184 bSig = extractFloatx80Frac( b ); 6185 bExp = extractFloatx80Exp( b ); 6186 bSign = extractFloatx80Sign( b ); 6187 zSign = aSign ^ bSign; 6188 if ( aExp == 0x7FFF ) { 6189 if ( (uint64_t) ( aSig<<1 ) 6190 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6191 return propagateFloatx80NaN(a, b, status); 6192 } 6193 if ( ( bExp | bSig ) == 0 ) goto invalid; 6194 return packFloatx80(zSign, floatx80_infinity_high, 6195 floatx80_infinity_low); 6196 } 6197 if ( bExp == 0x7FFF ) { 6198 if ((uint64_t)(bSig << 1)) { 6199 return propagateFloatx80NaN(a, b, status); 6200 } 6201 if ( ( aExp | aSig ) == 0 ) { 6202 invalid: 6203 float_raise(float_flag_invalid, status); 6204 return floatx80_default_nan(status); 6205 } 6206 return packFloatx80(zSign, floatx80_infinity_high, 6207 floatx80_infinity_low); 6208 } 6209 if ( aExp == 0 ) { 6210 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6211 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6212 } 6213 if ( bExp == 0 ) { 6214 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6215 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6216 } 6217 zExp = aExp + bExp - 0x3FFE; 6218 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6219 if ( 0 < (int64_t) zSig0 ) { 6220 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6221 --zExp; 6222 } 6223 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6224 zSign, zExp, zSig0, zSig1, status); 6225 } 6226 6227 /*---------------------------------------------------------------------------- 6228 | Returns the result of dividing the extended double-precision floating-point 6229 | value `a' by the corresponding value `b'. The operation is performed 6230 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
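|
| Division of a nonzero finite value by zero raises float_flag_divbyzero and
| returns a correctly signed infinity; 0/0 raises float_flag_invalid and
| returns the default NaN. A sketch:
|
|     float_status st = { 0 };
|     floatx80 q = floatx80_div(int32_to_floatx80(1, &st),
|                               int32_to_floatx80(0, &st), &st);
|
| Here q is +inf and get_float_exception_flags(&st) has divbyzero set.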
6231 *----------------------------------------------------------------------------*/ 6232 6233 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6234 { 6235 bool aSign, bSign, zSign; 6236 int32_t aExp, bExp, zExp; 6237 uint64_t aSig, bSig, zSig0, zSig1; 6238 uint64_t rem0, rem1, rem2, term0, term1, term2; 6239 6240 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6241 float_raise(float_flag_invalid, status); 6242 return floatx80_default_nan(status); 6243 } 6244 aSig = extractFloatx80Frac( a ); 6245 aExp = extractFloatx80Exp( a ); 6246 aSign = extractFloatx80Sign( a ); 6247 bSig = extractFloatx80Frac( b ); 6248 bExp = extractFloatx80Exp( b ); 6249 bSign = extractFloatx80Sign( b ); 6250 zSign = aSign ^ bSign; 6251 if ( aExp == 0x7FFF ) { 6252 if ((uint64_t)(aSig << 1)) { 6253 return propagateFloatx80NaN(a, b, status); 6254 } 6255 if ( bExp == 0x7FFF ) { 6256 if ((uint64_t)(bSig << 1)) { 6257 return propagateFloatx80NaN(a, b, status); 6258 } 6259 goto invalid; 6260 } 6261 return packFloatx80(zSign, floatx80_infinity_high, 6262 floatx80_infinity_low); 6263 } 6264 if ( bExp == 0x7FFF ) { 6265 if ((uint64_t)(bSig << 1)) { 6266 return propagateFloatx80NaN(a, b, status); 6267 } 6268 return packFloatx80( zSign, 0, 0 ); 6269 } 6270 if ( bExp == 0 ) { 6271 if ( bSig == 0 ) { 6272 if ( ( aExp | aSig ) == 0 ) { 6273 invalid: 6274 float_raise(float_flag_invalid, status); 6275 return floatx80_default_nan(status); 6276 } 6277 float_raise(float_flag_divbyzero, status); 6278 return packFloatx80(zSign, floatx80_infinity_high, 6279 floatx80_infinity_low); 6280 } 6281 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6282 } 6283 if ( aExp == 0 ) { 6284 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6285 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6286 } 6287 zExp = aExp - bExp + 0x3FFE; 6288 rem1 = 0; 6289 if ( bSig <= aSig ) { 6290 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6291 ++zExp; 6292 } 6293 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 6294 mul64To128( bSig, zSig0, &term0, &term1 ); 6295 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6296 while ( (int64_t) rem0 < 0 ) { 6297 --zSig0; 6298 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6299 } 6300 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6301 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6302 mul64To128( bSig, zSig1, &term1, &term2 ); 6303 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6304 while ( (int64_t) rem1 < 0 ) { 6305 --zSig1; 6306 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6307 } 6308 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6309 } 6310 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6311 zSign, zExp, zSig0, zSig1, status); 6312 } 6313 6314 /*---------------------------------------------------------------------------- 6315 | Returns the remainder of the extended double-precision floating-point value 6316 | `a' with respect to the corresponding value `b'. The operation is performed 6317 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic, 6318 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating 6319 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of 6320 | the absolute value of the integer quotient. 
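|
| The two variants differ in how the implicit quotient is rounded: to
| nearest for the IEEE remainder, toward zero for mod. A sketch
| (hypothetical values; zero-initialized float_status assumed):
|
|     float_status st = { 0 };
|     uint64_t quot;
|     floatx80 a = int32_to_floatx80(5, &st);
|     floatx80 b = int32_to_floatx80(3, &st);
|     floatx80 r = floatx80_modrem(a, b, false, &quot, &st);
|     floatx80 m = floatx80_modrem(a, b, true, &quot, &st);
|
| With mod=false the quotient 5/3 rounds to nearest (2), so r is -1.0 and
| *quotient is 2; with mod=true it truncates to 1, so m is 2.0.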
6321 *----------------------------------------------------------------------------*/ 6322 6323 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient, 6324 float_status *status) 6325 { 6326 bool aSign, zSign; 6327 int32_t aExp, bExp, expDiff, aExpOrig; 6328 uint64_t aSig0, aSig1, bSig; 6329 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6330 6331 *quotient = 0; 6332 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6333 float_raise(float_flag_invalid, status); 6334 return floatx80_default_nan(status); 6335 } 6336 aSig0 = extractFloatx80Frac( a ); 6337 aExpOrig = aExp = extractFloatx80Exp( a ); 6338 aSign = extractFloatx80Sign( a ); 6339 bSig = extractFloatx80Frac( b ); 6340 bExp = extractFloatx80Exp( b ); 6341 if ( aExp == 0x7FFF ) { 6342 if ( (uint64_t) ( aSig0<<1 ) 6343 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6344 return propagateFloatx80NaN(a, b, status); 6345 } 6346 goto invalid; 6347 } 6348 if ( bExp == 0x7FFF ) { 6349 if ((uint64_t)(bSig << 1)) { 6350 return propagateFloatx80NaN(a, b, status); 6351 } 6352 if (aExp == 0 && aSig0 >> 63) { 6353 /* 6354 * Pseudo-denormal argument must be returned in normalized 6355 * form. 6356 */ 6357 return packFloatx80(aSign, 1, aSig0); 6358 } 6359 return a; 6360 } 6361 if ( bExp == 0 ) { 6362 if ( bSig == 0 ) { 6363 invalid: 6364 float_raise(float_flag_invalid, status); 6365 return floatx80_default_nan(status); 6366 } 6367 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6368 } 6369 if ( aExp == 0 ) { 6370 if ( aSig0 == 0 ) return a; 6371 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6372 } 6373 zSign = aSign; 6374 expDiff = aExp - bExp; 6375 aSig1 = 0; 6376 if ( expDiff < 0 ) { 6377 if ( mod || expDiff < -1 ) { 6378 if (aExp == 1 && aExpOrig == 0) { 6379 /* 6380 * Pseudo-denormal argument must be returned in 6381 * normalized form. 6382 */ 6383 return packFloatx80(aSign, aExp, aSig0); 6384 } 6385 return a; 6386 } 6387 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6388 expDiff = 0; 6389 } 6390 *quotient = q = ( bSig <= aSig0 ); 6391 if ( q ) aSig0 -= bSig; 6392 expDiff -= 64; 6393 while ( 0 < expDiff ) { 6394 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6395 q = ( 2 < q ) ? q - 2 : 0; 6396 mul64To128( bSig, q, &term0, &term1 ); 6397 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6398 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6399 expDiff -= 62; 6400 *quotient <<= 62; 6401 *quotient += q; 6402 } 6403 expDiff += 64; 6404 if ( 0 < expDiff ) { 6405 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6406 q = ( 2 < q ) ? q - 2 : 0; 6407 q >>= 64 - expDiff; 6408 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6409 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6410 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6411 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6412 ++q; 6413 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6414 } 6415 if (expDiff < 64) { 6416 *quotient <<= expDiff; 6417 } else { 6418 *quotient = 0; 6419 } 6420 *quotient += q; 6421 } 6422 else { 6423 term1 = 0; 6424 term0 = bSig; 6425 } 6426 if (!mod) { 6427 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6428 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6429 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6430 && ( q & 1 ) ) 6431 ) { 6432 aSig0 = alternateASig0; 6433 aSig1 = alternateASig1; 6434 zSign = ! 
zSign;
6435             ++*quotient;
6436         }
6437     }
6438     return
6439         normalizeRoundAndPackFloatx80(
6440             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6441 
6442 }
6443 
6444 /*----------------------------------------------------------------------------
6445 | Returns the remainder of the extended double-precision floating-point value
6446 | `a' with respect to the corresponding value `b'.  The operation is performed
6447 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6448 *----------------------------------------------------------------------------*/
6449 
6450 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6451 {
6452     uint64_t quotient;
6453     return floatx80_modrem(a, b, false, &quotient, status);
6454 }
6455 
6456 /*----------------------------------------------------------------------------
6457 | Returns the remainder of the extended double-precision floating-point value
6458 | `a' with respect to the corresponding value `b', with the quotient truncated
6459 | toward zero.
6460 *----------------------------------------------------------------------------*/
6461 
6462 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6463 {
6464     uint64_t quotient;
6465     return floatx80_modrem(a, b, true, &quotient, status);
6466 }
6467 
6468 /*----------------------------------------------------------------------------
6469 | Returns the square root of the extended double-precision floating-point
6470 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6471 | for Binary Floating-Point Arithmetic.
6472 *----------------------------------------------------------------------------*/
6473 
6474 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6475 {
6476     bool aSign;
6477     int32_t aExp, zExp;
6478     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6479     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6480 
6481     if (floatx80_invalid_encoding(a)) {
6482         float_raise(float_flag_invalid, status);
6483         return floatx80_default_nan(status);
6484     }
6485     aSig0 = extractFloatx80Frac( a );
6486     aExp = extractFloatx80Exp( a );
6487     aSign = extractFloatx80Sign( a );
6488     if ( aExp == 0x7FFF ) {
6489         if ((uint64_t)(aSig0 << 1)) {
6490             return propagateFloatx80NaN(a, a, status);
6491         }
6492         if ( !
aSign ) return a; 6493 goto invalid; 6494 } 6495 if ( aSign ) { 6496 if ( ( aExp | aSig0 ) == 0 ) return a; 6497 invalid: 6498 float_raise(float_flag_invalid, status); 6499 return floatx80_default_nan(status); 6500 } 6501 if ( aExp == 0 ) { 6502 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6503 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6504 } 6505 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6506 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6507 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6508 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6509 doubleZSig0 = zSig0<<1; 6510 mul64To128( zSig0, zSig0, &term0, &term1 ); 6511 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6512 while ( (int64_t) rem0 < 0 ) { 6513 --zSig0; 6514 doubleZSig0 -= 2; 6515 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6516 } 6517 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6518 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6519 if ( zSig1 == 0 ) zSig1 = 1; 6520 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6521 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6522 mul64To128( zSig1, zSig1, &term2, &term3 ); 6523 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6524 while ( (int64_t) rem1 < 0 ) { 6525 --zSig1; 6526 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6527 term3 |= 1; 6528 term2 |= doubleZSig0; 6529 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6530 } 6531 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6532 } 6533 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6534 zSig0 |= doubleZSig0; 6535 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6536 0, zExp, zSig0, zSig1, status); 6537 } 6538 6539 /*---------------------------------------------------------------------------- 6540 | Returns the result of converting the quadruple-precision floating-point value 6541 | `a' to the 64-bit unsigned integer format. The conversion is 6542 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6543 | Arithmetic---which means in particular that the conversion is rounded 6544 | according to the current rounding mode. If `a' is a NaN, the largest 6545 | positive integer is returned. If the conversion overflows, the 6546 | largest unsigned integer is returned. If 'a' is negative, the value is 6547 | rounded and zero is returned; negative values that do not round to zero 6548 | will raise the inexact exception. 
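|
| For example, -0.25 under the default round-to-nearest-even mode (a sketch,
| assuming int64_to_float128() and float128_div()):
|
|     float_status st = { 0 };
|     float128 x = float128_div(int64_to_float128(-1, &st),
|                               int64_to_float128(4, &st), &st);
|     uint64_t v = float128_to_uint64(x, &st);
|
| x rounds to zero, so v == 0 with only float_flag_inexact raised; an
| operand of -1.0 or below would instead raise float_flag_invalid.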
6549 *----------------------------------------------------------------------------*/ 6550 6551 uint64_t float128_to_uint64(float128 a, float_status *status) 6552 { 6553 bool aSign; 6554 int aExp; 6555 int shiftCount; 6556 uint64_t aSig0, aSig1; 6557 6558 aSig0 = extractFloat128Frac0(a); 6559 aSig1 = extractFloat128Frac1(a); 6560 aExp = extractFloat128Exp(a); 6561 aSign = extractFloat128Sign(a); 6562 if (aSign && (aExp > 0x3FFE)) { 6563 float_raise(float_flag_invalid, status); 6564 if (float128_is_any_nan(a)) { 6565 return UINT64_MAX; 6566 } else { 6567 return 0; 6568 } 6569 } 6570 if (aExp) { 6571 aSig0 |= UINT64_C(0x0001000000000000); 6572 } 6573 shiftCount = 0x402F - aExp; 6574 if (shiftCount <= 0) { 6575 if (0x403E < aExp) { 6576 float_raise(float_flag_invalid, status); 6577 return UINT64_MAX; 6578 } 6579 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6580 } else { 6581 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6582 } 6583 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6584 } 6585 6586 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6587 { 6588 uint64_t v; 6589 signed char current_rounding_mode = status->float_rounding_mode; 6590 6591 set_float_rounding_mode(float_round_to_zero, status); 6592 v = float128_to_uint64(a, status); 6593 set_float_rounding_mode(current_rounding_mode, status); 6594 6595 return v; 6596 } 6597 6598 /*---------------------------------------------------------------------------- 6599 | Returns the result of converting the quadruple-precision floating-point 6600 | value `a' to the 32-bit unsigned integer format. The conversion 6601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6602 | Arithmetic except that the conversion is always rounded toward zero. 6603 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6604 | if the conversion overflows, the largest unsigned integer is returned. 6605 | If 'a' is negative, the value is rounded and zero is returned; negative 6606 | values that do not round to zero will raise the inexact exception. 6607 *----------------------------------------------------------------------------*/ 6608 6609 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6610 { 6611 uint64_t v; 6612 uint32_t res; 6613 int old_exc_flags = get_float_exception_flags(status); 6614 6615 v = float128_to_uint64_round_to_zero(a, status); 6616 if (v > 0xffffffff) { 6617 res = 0xffffffff; 6618 } else { 6619 return v; 6620 } 6621 set_float_exception_flags(old_exc_flags, status); 6622 float_raise(float_flag_invalid, status); 6623 return res; 6624 } 6625 6626 /*---------------------------------------------------------------------------- 6627 | Returns the result of converting the quadruple-precision floating-point value 6628 | `a' to the 32-bit unsigned integer format. The conversion is 6629 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6630 | Arithmetic---which means in particular that the conversion is rounded 6631 | according to the current rounding mode. If `a' is a NaN, the largest 6632 | positive integer is returned. If the conversion overflows, the 6633 | largest unsigned integer is returned. If 'a' is negative, the value is 6634 | rounded and zero is returned; negative values that do not round to zero 6635 | will raise the inexact exception. 
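|
| A sketch of the saturating overflow case (assumes uint64_to_float128()):
|
|     float_status st = { 0 };
|     float128 x = uint64_to_float128(1ULL << 32, &st);
|     uint32_t v = float128_to_uint32(x, &st);
|
| 2^32 does not fit in 32 bits, so v == 0xffffffff and float_flag_invalid
| is raised (any flags from the intermediate wide conversion are discarded
| first).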
6636 *----------------------------------------------------------------------------*/ 6637 6638 uint32_t float128_to_uint32(float128 a, float_status *status) 6639 { 6640 uint64_t v; 6641 uint32_t res; 6642 int old_exc_flags = get_float_exception_flags(status); 6643 6644 v = float128_to_uint64(a, status); 6645 if (v > 0xffffffff) { 6646 res = 0xffffffff; 6647 } else { 6648 return v; 6649 } 6650 set_float_exception_flags(old_exc_flags, status); 6651 float_raise(float_flag_invalid, status); 6652 return res; 6653 } 6654 6655 /*---------------------------------------------------------------------------- 6656 | Returns the result of converting the quadruple-precision floating-point 6657 | value `a' to the extended double-precision floating-point format. The 6658 | conversion is performed according to the IEC/IEEE Standard for Binary 6659 | Floating-Point Arithmetic. 6660 *----------------------------------------------------------------------------*/ 6661 6662 floatx80 float128_to_floatx80(float128 a, float_status *status) 6663 { 6664 bool aSign; 6665 int32_t aExp; 6666 uint64_t aSig0, aSig1; 6667 6668 aSig1 = extractFloat128Frac1( a ); 6669 aSig0 = extractFloat128Frac0( a ); 6670 aExp = extractFloat128Exp( a ); 6671 aSign = extractFloat128Sign( a ); 6672 if ( aExp == 0x7FFF ) { 6673 if ( aSig0 | aSig1 ) { 6674 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6675 status); 6676 return floatx80_silence_nan(res, status); 6677 } 6678 return packFloatx80(aSign, floatx80_infinity_high, 6679 floatx80_infinity_low); 6680 } 6681 if ( aExp == 0 ) { 6682 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6683 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6684 } 6685 else { 6686 aSig0 |= UINT64_C(0x0001000000000000); 6687 } 6688 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6689 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6690 6691 } 6692 6693 /*---------------------------------------------------------------------------- 6694 | Returns the remainder of the quadruple-precision floating-point value `a' 6695 | with respect to the corresponding value `b'. The operation is performed 6696 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
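|
| The IEEE remainder is always exact and satisfies |result| <= |b|/2. For
| example (a sketch, assuming int64_to_float128()):
|
|     float_status st = { 0 };
|     float128 r = float128_rem(int64_to_float128(5, &st),
|                               int64_to_float128(3, &st), &st);
|
| As with floatx80_rem() above, the quotient rounds to nearest (2), so
| r is -1.0 and no exception flags are raised.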
6697 *----------------------------------------------------------------------------*/ 6698 6699 float128 float128_rem(float128 a, float128 b, float_status *status) 6700 { 6701 bool aSign, zSign; 6702 int32_t aExp, bExp, expDiff; 6703 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6704 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6705 int64_t sigMean0; 6706 6707 aSig1 = extractFloat128Frac1( a ); 6708 aSig0 = extractFloat128Frac0( a ); 6709 aExp = extractFloat128Exp( a ); 6710 aSign = extractFloat128Sign( a ); 6711 bSig1 = extractFloat128Frac1( b ); 6712 bSig0 = extractFloat128Frac0( b ); 6713 bExp = extractFloat128Exp( b ); 6714 if ( aExp == 0x7FFF ) { 6715 if ( ( aSig0 | aSig1 ) 6716 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6717 return propagateFloat128NaN(a, b, status); 6718 } 6719 goto invalid; 6720 } 6721 if ( bExp == 0x7FFF ) { 6722 if (bSig0 | bSig1) { 6723 return propagateFloat128NaN(a, b, status); 6724 } 6725 return a; 6726 } 6727 if ( bExp == 0 ) { 6728 if ( ( bSig0 | bSig1 ) == 0 ) { 6729 invalid: 6730 float_raise(float_flag_invalid, status); 6731 return float128_default_nan(status); 6732 } 6733 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6734 } 6735 if ( aExp == 0 ) { 6736 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6737 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6738 } 6739 expDiff = aExp - bExp; 6740 if ( expDiff < -1 ) return a; 6741 shortShift128Left( 6742 aSig0 | UINT64_C(0x0001000000000000), 6743 aSig1, 6744 15 - ( expDiff < 0 ), 6745 &aSig0, 6746 &aSig1 6747 ); 6748 shortShift128Left( 6749 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 6750 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6751 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6752 expDiff -= 64; 6753 while ( 0 < expDiff ) { 6754 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6755 q = ( 4 < q ) ? q - 4 : 0; 6756 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6757 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6758 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6759 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6760 expDiff -= 61; 6761 } 6762 if ( -64 < expDiff ) { 6763 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6764 q = ( 4 < q ) ? 
q - 4 : 0; 6765 q >>= - expDiff; 6766 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6767 expDiff += 52; 6768 if ( expDiff < 0 ) { 6769 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6770 } 6771 else { 6772 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6773 } 6774 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6775 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6776 } 6777 else { 6778 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6779 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6780 } 6781 do { 6782 alternateASig0 = aSig0; 6783 alternateASig1 = aSig1; 6784 ++q; 6785 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6786 } while ( 0 <= (int64_t) aSig0 ); 6787 add128( 6788 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6789 if ( ( sigMean0 < 0 ) 6790 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6791 aSig0 = alternateASig0; 6792 aSig1 = alternateASig1; 6793 } 6794 zSign = ( (int64_t) aSig0 < 0 ); 6795 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6796 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6797 status); 6798 } 6799 6800 /*---------------------------------------------------------------------------- 6801 | Returns the square root of the quadruple-precision floating-point value `a'. 6802 | The operation is performed according to the IEC/IEEE Standard for Binary 6803 | Floating-Point Arithmetic. 6804 *----------------------------------------------------------------------------*/ 6805 6806 float128 float128_sqrt(float128 a, float_status *status) 6807 { 6808 bool aSign; 6809 int32_t aExp, zExp; 6810 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6811 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6812 6813 aSig1 = extractFloat128Frac1( a ); 6814 aSig0 = extractFloat128Frac0( a ); 6815 aExp = extractFloat128Exp( a ); 6816 aSign = extractFloat128Sign( a ); 6817 if ( aExp == 0x7FFF ) { 6818 if (aSig0 | aSig1) { 6819 return propagateFloat128NaN(a, a, status); 6820 } 6821 if ( ! 
aSign ) return a; 6822 goto invalid; 6823 } 6824 if ( aSign ) { 6825 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6826 invalid: 6827 float_raise(float_flag_invalid, status); 6828 return float128_default_nan(status); 6829 } 6830 if ( aExp == 0 ) { 6831 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6832 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6833 } 6834 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6835 aSig0 |= UINT64_C(0x0001000000000000); 6836 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6837 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6838 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6839 doubleZSig0 = zSig0<<1; 6840 mul64To128( zSig0, zSig0, &term0, &term1 ); 6841 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6842 while ( (int64_t) rem0 < 0 ) { 6843 --zSig0; 6844 doubleZSig0 -= 2; 6845 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6846 } 6847 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6848 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6849 if ( zSig1 == 0 ) zSig1 = 1; 6850 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6851 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6852 mul64To128( zSig1, zSig1, &term2, &term3 ); 6853 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6854 while ( (int64_t) rem1 < 0 ) { 6855 --zSig1; 6856 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6857 term3 |= 1; 6858 term2 |= doubleZSig0; 6859 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6860 } 6861 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6862 } 6863 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6864 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6865 6866 } 6867 6868 static inline FloatRelation 6869 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 6870 float_status *status) 6871 { 6872 bool aSign, bSign; 6873 6874 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6875 float_raise(float_flag_invalid, status); 6876 return float_relation_unordered; 6877 } 6878 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6879 ( extractFloatx80Frac( a )<<1 ) ) || 6880 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6881 ( extractFloatx80Frac( b )<<1 ) )) { 6882 if (!is_quiet || 6883 floatx80_is_signaling_nan(a, status) || 6884 floatx80_is_signaling_nan(b, status)) { 6885 float_raise(float_flag_invalid, status); 6886 } 6887 return float_relation_unordered; 6888 } 6889 aSign = extractFloatx80Sign( a ); 6890 bSign = extractFloatx80Sign( b ); 6891 if ( aSign != bSign ) { 6892 6893 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6894 ( ( a.low | b.low ) == 0 ) ) { 6895 /* zero case */ 6896 return float_relation_equal; 6897 } else { 6898 return 1 - (2 * aSign); 6899 } 6900 } else { 6901 /* Normalize pseudo-denormals before comparison. 
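           A pseudo-denormal has a zero biased exponent field but its
           explicit integer bit set; incrementing the exponent field to 1
           produces the canonical encoding of the same value, so the plain
           magnitude comparison below remains correct.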
*/ 6902 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 6903 ++a.high; 6904 } 6905 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 6906 ++b.high; 6907 } 6908 if (a.low == b.low && a.high == b.high) { 6909 return float_relation_equal; 6910 } else { 6911 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6912 } 6913 } 6914 } 6915 6916 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6917 { 6918 return floatx80_compare_internal(a, b, 0, status); 6919 } 6920 6921 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 6922 float_status *status) 6923 { 6924 return floatx80_compare_internal(a, b, 1, status); 6925 } 6926 6927 static inline FloatRelation 6928 float128_compare_internal(float128 a, float128 b, bool is_quiet, 6929 float_status *status) 6930 { 6931 bool aSign, bSign; 6932 6933 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6934 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6935 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6936 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6937 if (!is_quiet || 6938 float128_is_signaling_nan(a, status) || 6939 float128_is_signaling_nan(b, status)) { 6940 float_raise(float_flag_invalid, status); 6941 } 6942 return float_relation_unordered; 6943 } 6944 aSign = extractFloat128Sign( a ); 6945 bSign = extractFloat128Sign( b ); 6946 if ( aSign != bSign ) { 6947 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6948 /* zero case */ 6949 return float_relation_equal; 6950 } else { 6951 return 1 - (2 * aSign); 6952 } 6953 } else { 6954 if (a.low == b.low && a.high == b.high) { 6955 return float_relation_equal; 6956 } else { 6957 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6958 } 6959 } 6960 } 6961 6962 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 6963 { 6964 return float128_compare_internal(a, b, 0, status); 6965 } 6966 6967 FloatRelation float128_compare_quiet(float128 a, float128 b, 6968 float_status *status) 6969 { 6970 return float128_compare_internal(a, b, 1, status); 6971 } 6972 6973 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 6974 { 6975 bool aSign; 6976 int32_t aExp; 6977 uint64_t aSig; 6978 6979 if (floatx80_invalid_encoding(a)) { 6980 float_raise(float_flag_invalid, status); 6981 return floatx80_default_nan(status); 6982 } 6983 aSig = extractFloatx80Frac( a ); 6984 aExp = extractFloatx80Exp( a ); 6985 aSign = extractFloatx80Sign( a ); 6986 6987 if ( aExp == 0x7FFF ) { 6988 if ( aSig<<1 ) { 6989 return propagateFloatx80NaN(a, a, status); 6990 } 6991 return a; 6992 } 6993 6994 if (aExp == 0) { 6995 if (aSig == 0) { 6996 return a; 6997 } 6998 aExp++; 6999 } 7000 7001 if (n > 0x10000) { 7002 n = 0x10000; 7003 } else if (n < -0x10000) { 7004 n = -0x10000; 7005 } 7006 7007 aExp += n; 7008 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7009 aSign, aExp, aSig, 0, status); 7010 } 7011 7012 float128 float128_scalbn(float128 a, int n, float_status *status) 7013 { 7014 bool aSign; 7015 int32_t aExp; 7016 uint64_t aSig0, aSig1; 7017 7018 aSig1 = extractFloat128Frac1( a ); 7019 aSig0 = extractFloat128Frac0( a ); 7020 aExp = extractFloat128Exp( a ); 7021 aSign = extractFloat128Sign( a ); 7022 if ( aExp == 0x7FFF ) { 7023 if ( aSig0 | aSig1 ) { 7024 return propagateFloat128NaN(a, a, status); 7025 } 7026 return a; 7027 } 7028 if (aExp != 0) { 7029 aSig0 |= UINT64_C(0x0001000000000000); 7030 } else if (aSig0 == 0 && aSig1 == 0) { 
7031 return a; 7032 } else { 7033 aExp++; 7034 } 7035 7036 if (n > 0x10000) { 7037 n = 0x10000; 7038 } else if (n < -0x10000) { 7039 n = -0x10000; 7040 } 7041 7042 aExp += n - 1; 7043 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7044 , status); 7045 7046 } 7047 7048 static void __attribute__((constructor)) softfloat_init(void) 7049 { 7050 union_float64 ua, ub, uc, ur; 7051 7052 if (QEMU_NO_HARDFLOAT) { 7053 return; 7054 } 7055 /* 7056 * Test that the host's FMA is not obviously broken. For example, 7057 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7058 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7059 */ 7060 ua.s = 0x0020000000000001ULL; 7061 ub.s = 0x3ca0000000000000ULL; 7062 uc.s = 0x0020000000000000ULL; 7063 ur.h = fma(ua.h, ub.h, uc.h); 7064 if (ur.s != 0x0020000000000001ULL) { 7065 force_soft_fma = true; 7066 } 7067 } 7068
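
/*----------------------------------------------------------------------------
| The probe in softfloat_init() can be reproduced stand-alone. A hypothetical
| harness (not part of QEMU; assumes <math.h>, <stdint.h>, <stdio.h> and an
| IEEE-754 binary64 double):
|
|     union { uint64_t s; double h; } ua, ub, uc, ur;
|     ua.s = 0x0020000000000001ULL;
|     ub.s = 0x3ca0000000000000ULL;
|     uc.s = 0x0020000000000000ULL;
|     ur.h = fma(ua.h, ub.h, uc.h);
|     printf("%016llx\n", (unsigned long long)ur.s);
|
| A correct fma() prints 0020000000000001; hosts affected by the glibc bug
| referenced above produce a different result, in which case QEMU falls back
| to the soft-float FMA.
*----------------------------------------------------------------------------*/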