1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Flush a denormal input operand to a same-signed zero and raise the
 * input-denormal flag.  The "__nocheck" variant assumes the caller has
 * already verified that flush_inputs_to_zero is enabled.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, honouring the flush_inputs_to_zero setting. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* As GEN_INPUT_FLUSH1, but for both operands of a 2-input operation. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* As GEN_INPUT_FLUSH1, but for the three operands of a 3-input operation. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Hardfloat is usable only when the inexact flag is already set (so the
 * fast path cannot lose a flag update) and the rounding mode is
 * round-to-nearest-even.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* View of a value as both its softfloat bit pattern (s) and a host float (h). */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Pre/post condition checks used by the float{32,64}_gen2 templates below. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat ("soft") and host-FPU ("hard") implementations of an op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input hardfloat template: compute with the host FPU when the
 * "pre" check accepts the (possibly flushed) operands, deferring to the
 * softfloat implementation whenever flag emulation could go wrong.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* An infinite result is recorded as overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result no larger than the smallest normal: redo in softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        /* An infinite result is recorded as overflow. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* Result no larger than the smallest normal: redo in softfloat. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Map a FloatClass to a bit so that sets of classes can be tested at once. */
#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                   \
    .exp_size = E,                                           \
    .exp_bias = ((1 << E) - 1) >> 1,                         \
    .exp_max = (1 << E) - 1,                                 \
    .frac_size = F,                                          \
    .frac_shift = (-F - 1) & 63,                             \
    .frac_lsb = 1ull << ((-F - 1) & 63),                     \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),                   \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,             \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize. */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    /* Only frac_size - 64 fraction bits live in the high word. */
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.
 */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    /* Only frac_size - 64 fraction bits fit in the high word. */
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine: (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
713 *----------------------------------------------------------------------------*/ 714 #include "softfloat-specialize.c.inc" 715 716 #define PARTS_GENERIC_64_128(NAME, P) \ 717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME) 718 719 #define PARTS_GENERIC_64_128_256(NAME, P) \ 720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \ 721 (FloatParts128 *, parts128_##NAME), parts64_##NAME) 722 723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S) 724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S) 725 726 static void parts64_return_nan(FloatParts64 *a, float_status *s); 727 static void parts128_return_nan(FloatParts128 *a, float_status *s); 728 729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S) 730 731 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b, 732 float_status *s); 733 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b, 734 float_status *s); 735 736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S) 737 738 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b, 739 FloatParts64 *c, float_status *s, 740 int ab_mask, int abc_mask); 741 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a, 742 FloatParts128 *b, 743 FloatParts128 *c, 744 float_status *s, 745 int ab_mask, int abc_mask); 746 747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \ 748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM) 749 750 static void parts64_canonicalize(FloatParts64 *p, float_status *status, 751 const FloatFmt *fmt); 752 static void parts128_canonicalize(FloatParts128 *p, float_status *status, 753 const FloatFmt *fmt); 754 755 #define parts_canonicalize(A, S, F) \ 756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F) 757 758 static void parts64_uncanon(FloatParts64 *p, float_status *status, 759 const FloatFmt *fmt); 760 static void parts128_uncanon(FloatParts128 
*p, float_status *status, 761 const FloatFmt *fmt); 762 763 #define parts_uncanon(A, S, F) \ 764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F) 765 766 static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b); 767 static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b); 768 static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b); 769 770 #define parts_add_normal(A, B) \ 771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B) 772 773 static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b); 774 static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b); 775 static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b); 776 777 #define parts_sub_normal(A, B) \ 778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B) 779 780 static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b, 781 float_status *s, bool subtract); 782 static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b, 783 float_status *s, bool subtract); 784 785 #define parts_addsub(A, B, S, Z) \ 786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z) 787 788 static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b, 789 float_status *s); 790 static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b, 791 float_status *s); 792 793 #define parts_mul(A, B, S) \ 794 PARTS_GENERIC_64_128(mul, A)(A, B, S) 795 796 static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b, 797 FloatParts64 *c, int flags, 798 float_status *s); 799 static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b, 800 FloatParts128 *c, int flags, 801 float_status *s); 802 803 #define parts_muladd(A, B, C, Z, S) \ 804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S) 805 806 static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b, 807 float_status *s); 808 static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b, 809 float_status *s); 810 811 #define parts_div(A, B, S) \ 812 PARTS_GENERIC_64_128(div, A)(A, B, S) 813 814 
/* 815 * Helper functions for softfloat-parts.c.inc, per-size operations. 816 */ 817 818 #define FRAC_GENERIC_64_128(NAME, P) \ 819 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME) 820 821 #define FRAC_GENERIC_64_128_256(NAME, P) \ 822 QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \ 823 (FloatParts128 *, frac128_##NAME), frac64_##NAME) 824 825 static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b) 826 { 827 return uadd64_overflow(a->frac, b->frac, &r->frac); 828 } 829 830 static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b) 831 { 832 bool c = 0; 833 r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c); 834 r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c); 835 return c; 836 } 837 838 static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b) 839 { 840 bool c = 0; 841 r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c); 842 r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c); 843 r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c); 844 r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c); 845 return c; 846 } 847 848 #define frac_add(R, A, B) FRAC_GENERIC_64_128_256(add, R)(R, A, B) 849 850 static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c) 851 { 852 return uadd64_overflow(a->frac, c, &r->frac); 853 } 854 855 static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c) 856 { 857 c = uadd64_overflow(a->frac_lo, c, &r->frac_lo); 858 return uadd64_overflow(a->frac_hi, c, &r->frac_hi); 859 } 860 861 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C) 862 863 static void frac64_allones(FloatParts64 *a) 864 { 865 a->frac = -1; 866 } 867 868 static void frac128_allones(FloatParts128 *a) 869 { 870 a->frac_hi = a->frac_lo = -1; 871 } 872 873 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A) 874 875 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b) 876 { 877 return a->frac == b->frac ? 0 : a->frac < b->frac ? 
-1 : 1; 878 } 879 880 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b) 881 { 882 uint64_t ta = a->frac_hi, tb = b->frac_hi; 883 if (ta == tb) { 884 ta = a->frac_lo, tb = b->frac_lo; 885 if (ta == tb) { 886 return 0; 887 } 888 } 889 return ta < tb ? -1 : 1; 890 } 891 892 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B) 893 894 static void frac64_clear(FloatParts64 *a) 895 { 896 a->frac = 0; 897 } 898 899 static void frac128_clear(FloatParts128 *a) 900 { 901 a->frac_hi = a->frac_lo = 0; 902 } 903 904 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A) 905 906 static bool frac64_div(FloatParts64 *a, FloatParts64 *b) 907 { 908 uint64_t n1, n0, r, q; 909 bool ret; 910 911 /* 912 * We want a 2*N / N-bit division to produce exactly an N-bit 913 * result, so that we do not lose any precision and so that we 914 * do not have to renormalize afterward. If A.frac < B.frac, 915 * then division would produce an (N-1)-bit result; shift A left 916 * by one to produce the an N-bit result, and return true to 917 * decrement the exponent to match. 918 * 919 * The udiv_qrnnd algorithm that we're using requires normalization, 920 * i.e. the msb of the denominator must be set, which is already true. 921 */ 922 ret = a->frac < b->frac; 923 if (ret) { 924 n0 = a->frac; 925 n1 = 0; 926 } else { 927 n0 = a->frac >> 1; 928 n1 = a->frac << 63; 929 } 930 q = udiv_qrnnd(&r, n0, n1, b->frac); 931 932 /* Set lsb if there is a remainder, to set inexact. */ 933 a->frac = q | (r != 0); 934 935 return ret; 936 } 937 938 static bool frac128_div(FloatParts128 *a, FloatParts128 *b) 939 { 940 uint64_t q0, q1, a0, a1, b0, b1; 941 uint64_t r0, r1, r2, r3, t0, t1, t2, t3; 942 bool ret = false; 943 944 a0 = a->frac_hi, a1 = a->frac_lo; 945 b0 = b->frac_hi, b1 = b->frac_lo; 946 947 ret = lt128(a0, a1, b0, b1); 948 if (!ret) { 949 a1 = shr_double(a0, a1, 1); 950 a0 = a0 >> 1; 951 } 952 953 /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. 
#define frac_div(A, B) FRAC_GENERIC_64_128(div, A)(A, B)

/* True if the fraction is all zeros. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A)

/* Widening multiply: R (double width) = A * B. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B)

/* Two's-complement negate of the fraction. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    /* 0 - A, propagating the borrow upward. */
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A)

/*
 * Shift the fraction left until its msb is set; return the shift
 * count, or the full width if the fraction was zero.
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        /* Move whole zero words out first, 64 bits at a time. */
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    /* Sub-word shift across all four words. */
    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A)

/* Shift the fraction left by C bits. */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}
/* Shift the 128-bit fraction left by C bits (0 <= C < 128). */
static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C)

/* Shift the fraction right by C bits, discarding shifted-out bits. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C)

/*
 * Shift right with "jamming": any bits shifted out are ORed into
 * the lsb (sticky bit) so that inexactness is not lost.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            /* Entire fraction shifted out; reduce to sticky bit. */
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        /* Word-sized moves first, collecting shifted-out words as sticky. */
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

/* R = A - B for the fraction words; returns true on borrow-out. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

/*
 * Narrow a double-width fraction to single width, folding any
 * discarded low bits into the lsb (sticky bit).
 */
static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}
a->frac_lo) != 0); 1284 } 1285 1286 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A) 1287 1288 static void frac64_widen(FloatParts128 *r, FloatParts64 *a) 1289 { 1290 r->frac_hi = a->frac; 1291 r->frac_lo = 0; 1292 } 1293 1294 static void frac128_widen(FloatParts256 *r, FloatParts128 *a) 1295 { 1296 r->frac_hi = a->frac_hi; 1297 r->frac_hm = a->frac_lo; 1298 r->frac_lm = 0; 1299 r->frac_lo = 0; 1300 } 1301 1302 #define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B) 1303 1304 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME) 1305 #define FloatPartsN glue(FloatParts,N) 1306 #define FloatPartsW glue(FloatParts,W) 1307 1308 #define N 64 1309 #define W 128 1310 1311 #include "softfloat-parts-addsub.c.inc" 1312 #include "softfloat-parts.c.inc" 1313 1314 #undef N 1315 #undef W 1316 #define N 128 1317 #define W 256 1318 1319 #include "softfloat-parts-addsub.c.inc" 1320 #include "softfloat-parts.c.inc" 1321 1322 #undef N 1323 #undef W 1324 #define N 256 1325 1326 #include "softfloat-parts-addsub.c.inc" 1327 1328 #undef N 1329 #undef W 1330 #undef partsN 1331 #undef FloatPartsN 1332 #undef FloatPartsW 1333 1334 /* 1335 * Pack/unpack routines with a specific FloatFmt. 
1336 */ 1337 1338 static void float16a_unpack_canonical(FloatParts64 *p, float16 f, 1339 float_status *s, const FloatFmt *params) 1340 { 1341 float16_unpack_raw(p, f); 1342 parts_canonicalize(p, s, params); 1343 } 1344 1345 static void float16_unpack_canonical(FloatParts64 *p, float16 f, 1346 float_status *s) 1347 { 1348 float16a_unpack_canonical(p, f, s, &float16_params); 1349 } 1350 1351 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f, 1352 float_status *s) 1353 { 1354 bfloat16_unpack_raw(p, f); 1355 parts_canonicalize(p, s, &bfloat16_params); 1356 } 1357 1358 static float16 float16a_round_pack_canonical(FloatParts64 *p, 1359 float_status *s, 1360 const FloatFmt *params) 1361 { 1362 parts_uncanon(p, s, params); 1363 return float16_pack_raw(p); 1364 } 1365 1366 static float16 float16_round_pack_canonical(FloatParts64 *p, 1367 float_status *s) 1368 { 1369 return float16a_round_pack_canonical(p, s, &float16_params); 1370 } 1371 1372 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p, 1373 float_status *s) 1374 { 1375 parts_uncanon(p, s, &bfloat16_params); 1376 return bfloat16_pack_raw(p); 1377 } 1378 1379 static void float32_unpack_canonical(FloatParts64 *p, float32 f, 1380 float_status *s) 1381 { 1382 float32_unpack_raw(p, f); 1383 parts_canonicalize(p, s, &float32_params); 1384 } 1385 1386 static float32 float32_round_pack_canonical(FloatParts64 *p, 1387 float_status *s) 1388 { 1389 parts_uncanon(p, s, &float32_params); 1390 return float32_pack_raw(p); 1391 } 1392 1393 static void float64_unpack_canonical(FloatParts64 *p, float64 f, 1394 float_status *s) 1395 { 1396 float64_unpack_raw(p, f); 1397 parts_canonicalize(p, s, &float64_params); 1398 } 1399 1400 static float64 float64_round_pack_canonical(FloatParts64 *p, 1401 float_status *s) 1402 { 1403 parts_uncanon(p, s, &float64_params); 1404 return float64_pack_raw(p); 1405 } 1406 1407 static void float128_unpack_canonical(FloatParts128 *p, float128 f, 1408 float_status *s) 1409 { 1410 
static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}

/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

/* Softfloat fallback for float32 add/sub (slow path). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

/* Hardfloat fast paths: use the host FPU directly. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/*
 * Post-check for the hardfloat path: a zero result from two zero
 * inputs needs softfloat to get the signed-zero rules right.
 */
static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}
float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

/* Softfloat fallback for float32 multiply (slow path). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}
/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

/* Softfloat fallback for float32 fused multiply-add (slow path). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

/* When set, always take the softfloat path (testing/debug knob). */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result may be subnormal; redo with original inputs in soft. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /*
             * NOTE(review): this compares against FLT_MIN rather than
             * DBL_MIN.  Since FLT_MIN > DBL_MIN this is conservative
             * (any double result that could need subnormal handling
             * also satisfies it), so it is safe but falls back to the
             * soft path more often than strictly necessary — confirm
             * whether FLT_MIN here is intentional.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Division
 */

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}
/* Softfloat fallback for float32 division (slow path). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * Pre-check for the hardfloat path: dividend must be zero-or-normal
 * and divisor normal, otherwise softfloat handles the exceptions.
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-check: a zero quotient needs softfloat for signed-zero rules. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_div(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * Usually this only needs to take care of raising invalid exceptions
 * and handling the conversion on NaNs.
 */

/* Convert P for the ARM alternative half-precision format (no NaN/Inf). */
static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
{
    switch (a->cls) {
    case float_class_qnan:
    case float_class_snan:
        /*
         * There is no NaN in the destination format.  Raise Invalid
         * and return a zero with the sign of the input NaN.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_zero;
        break;

    case float_class_inf:
        /*
         * There is no Inf in the destination format.  Raise Invalid
         * and return the maximum normal with the correct sign.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_normal;
        a->exp = float16_params_ahp.exp_max;
        a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift,
                                  float16_params_ahp.frac_size + 1);
        break;

    case float_class_normal:
    case float_class_zero:
        break;

    default:
        g_assert_not_reached();
    }
}
/* Same-parts conversion: only NaNs need attention. */
static void parts64_float_to_float(FloatParts64 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

static void parts128_float_to_float(FloatParts128 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

#define parts_float_to_float(P, S) \
    PARTS_GENERIC_64_128(float_to_float, P)(P, S)

/* Narrow 128-bit parts B into 64-bit parts A. */
static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
                                        float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;

    if (a->cls == float_class_normal) {
        frac_truncjam(a, b);
    } else if (is_nan(a->cls)) {
        /* Discard the low bits of the NaN. */
        a->frac = b->frac_hi;
        parts_return_nan(a, s);
    }
}

/* Widen 64-bit parts B into 128-bit parts A. */
static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
                                       float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;
    frac_widen(a, b);

    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* !ieee selects the ARM alternative half-precision format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    FloatParts64 p;
    const FloatFmt *fmt;

    float32_unpack_canonical(&p, a, s);
    if (ieee) {
        parts_float_to_float(&p, s);
        fmt = &float16_params;
    } else {
        parts_float_to_ahp(&p, s);
        fmt = &float16_params_ahp;
    }
    return float16a_round_pack_canonical(&p, s, fmt);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Subnormals, Inf and NaN take the softfloat path. */
        return soft_float32_to_float64(a, s);
    }
}
*/ 2174 union_float32 uf; 2175 union_float64 ud; 2176 uf.s = a; 2177 ud.h = uf.h; 2178 return ud.s; 2179 } else if (float32_is_zero(a)) { 2180 return float64_set_sign(float64_zero, float32_is_neg(a)); 2181 } else { 2182 return soft_float32_to_float64(a, s); 2183 } 2184 } 2185 2186 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 2187 { 2188 FloatParts64 p; 2189 const FloatFmt *fmt; 2190 2191 float64_unpack_canonical(&p, a, s); 2192 if (ieee) { 2193 parts_float_to_float(&p, s); 2194 fmt = &float16_params; 2195 } else { 2196 parts_float_to_ahp(&p, s); 2197 fmt = &float16_params_ahp; 2198 } 2199 return float16a_round_pack_canonical(&p, s, fmt); 2200 } 2201 2202 float32 float64_to_float32(float64 a, float_status *s) 2203 { 2204 FloatParts64 p; 2205 2206 float64_unpack_canonical(&p, a, s); 2207 parts_float_to_float(&p, s); 2208 return float32_round_pack_canonical(&p, s); 2209 } 2210 2211 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2212 { 2213 FloatParts64 p; 2214 2215 bfloat16_unpack_canonical(&p, a, s); 2216 parts_float_to_float(&p, s); 2217 return float32_round_pack_canonical(&p, s); 2218 } 2219 2220 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2221 { 2222 FloatParts64 p; 2223 2224 bfloat16_unpack_canonical(&p, a, s); 2225 parts_float_to_float(&p, s); 2226 return float64_round_pack_canonical(&p, s); 2227 } 2228 2229 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2230 { 2231 FloatParts64 p; 2232 2233 float32_unpack_canonical(&p, a, s); 2234 parts_float_to_float(&p, s); 2235 return bfloat16_round_pack_canonical(&p, s); 2236 } 2237 2238 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2239 { 2240 FloatParts64 p; 2241 2242 float64_unpack_canonical(&p, a, s); 2243 parts_float_to_float(&p, s); 2244 return bfloat16_round_pack_canonical(&p, s); 2245 } 2246 2247 float32 float128_to_float32(float128 a, float_status *s) 2248 { 2249 FloatParts64 p64; 2250 FloatParts128 p128; 2251 2252 float128_unpack_canonical(&p128, a, 
s); 2253 parts_float_to_float_narrow(&p64, &p128, s); 2254 return float32_round_pack_canonical(&p64, s); 2255 } 2256 2257 float64 float128_to_float64(float128 a, float_status *s) 2258 { 2259 FloatParts64 p64; 2260 FloatParts128 p128; 2261 2262 float128_unpack_canonical(&p128, a, s); 2263 parts_float_to_float_narrow(&p64, &p128, s); 2264 return float64_round_pack_canonical(&p64, s); 2265 } 2266 2267 float128 float32_to_float128(float32 a, float_status *s) 2268 { 2269 FloatParts64 p64; 2270 FloatParts128 p128; 2271 2272 float32_unpack_canonical(&p64, a, s); 2273 parts_float_to_float_widen(&p128, &p64, s); 2274 return float128_round_pack_canonical(&p128, s); 2275 } 2276 2277 float128 float64_to_float128(float64 a, float_status *s) 2278 { 2279 FloatParts64 p64; 2280 FloatParts128 p128; 2281 2282 float64_unpack_canonical(&p64, a, s); 2283 parts_float_to_float_widen(&p128, &p64, s); 2284 return float128_round_pack_canonical(&p128, s); 2285 } 2286 2287 /* 2288 * Rounds the floating-point value `a' to an integer, and returns the 2289 * result as a floating-point value. The operation is performed 2290 * according to the IEC/IEEE Standard for Binary Floating-Point 2291 * Arithmetic. 
 */

static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp scale so the exponent arithmetic below cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: the result is either zero or +/-1 */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* Strictly greater than one half: a tie rounds to 0 (even). */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* Exactly one half already rounds away to 1. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                /* Any value in (0, 1) rounds to the odd integer 1. */
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* frac_lsb is the integer LSB; bits below it are fractional. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* Add half an LSB, except on an exact tie with an even LSB. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Round up only when the integer LSB is currently even. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the fraction: renormalize. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Clear the now-rounded-away fractional bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/* Round to integral in the current rounding mode, per format. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */

static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Snapshot the flags so flags raised by round_to_int can be dropped
     * on the invalid paths below. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude exceeds 64 bits: force the saturation path. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/* float16 -> signed integer, with additional scaling. */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* float32 -> signed integer, with additional scaling. */
int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* float64 -> signed integer, with additional scaling. */
int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Conversions honoring the rounding mode currently selected in *s. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Conversions that always truncate toward zero. */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

/* bfloat16 -> signed integer, with additional scaling. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Conversions honoring the rounding mode currently selected in *s. */
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Conversions that always truncate toward zero. */
int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Snapshot the flags so flags raised by round_to_int can be dropped
     * on the invalid paths below. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative value that did not round to zero: Invalid, clamp. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}

/* float16 -> unsigned integer, with additional scaling. */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* float32 -> unsigned integer, with additional scaling. */
uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* float64 -> unsigned integer, with additional scaling. */
uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* Conversions honoring the rounding mode currently selected in *s. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Conversions that always truncate toward zero. */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
2985 */ 2986 2987 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2988 int scale, float_status *s) 2989 { 2990 FloatParts64 p; 2991 2992 bfloat16_unpack_canonical(&p, a, s); 2993 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2994 } 2995 2996 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2997 int scale, float_status *s) 2998 { 2999 FloatParts64 p; 3000 3001 bfloat16_unpack_canonical(&p, a, s); 3002 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 3003 } 3004 3005 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 3006 int scale, float_status *s) 3007 { 3008 FloatParts64 p; 3009 3010 bfloat16_unpack_canonical(&p, a, s); 3011 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 3012 } 3013 3014 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 3015 { 3016 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 3017 } 3018 3019 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 3020 { 3021 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 3022 } 3023 3024 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 3025 { 3026 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 3027 } 3028 3029 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 3030 { 3031 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 3032 } 3033 3034 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 3035 { 3036 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 3037 } 3038 3039 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 3040 { 3041 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 3042 } 3043 3044 /* 3045 * Integer to float conversions 3046 * 3047 * Returns the result of converting the two's complement integer `a' 3048 * to the floating-point format. 
The conversion is performed according 3049 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3050 */ 3051 3052 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status) 3053 { 3054 FloatParts64 r = { .sign = false }; 3055 3056 if (a == 0) { 3057 r.cls = float_class_zero; 3058 } else { 3059 uint64_t f = a; 3060 int shift; 3061 3062 r.cls = float_class_normal; 3063 if (a < 0) { 3064 f = -f; 3065 r.sign = true; 3066 } 3067 shift = clz64(f); 3068 scale = MIN(MAX(scale, -0x10000), 0x10000); 3069 3070 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3071 r.frac = f << shift; 3072 } 3073 3074 return r; 3075 } 3076 3077 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 3078 { 3079 FloatParts64 pa = int_to_float(a, scale, status); 3080 return float16_round_pack_canonical(&pa, status); 3081 } 3082 3083 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 3084 { 3085 return int64_to_float16_scalbn(a, scale, status); 3086 } 3087 3088 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 3089 { 3090 return int64_to_float16_scalbn(a, scale, status); 3091 } 3092 3093 float16 int64_to_float16(int64_t a, float_status *status) 3094 { 3095 return int64_to_float16_scalbn(a, 0, status); 3096 } 3097 3098 float16 int32_to_float16(int32_t a, float_status *status) 3099 { 3100 return int64_to_float16_scalbn(a, 0, status); 3101 } 3102 3103 float16 int16_to_float16(int16_t a, float_status *status) 3104 { 3105 return int64_to_float16_scalbn(a, 0, status); 3106 } 3107 3108 float16 int8_to_float16(int8_t a, float_status *status) 3109 { 3110 return int64_to_float16_scalbn(a, 0, status); 3111 } 3112 3113 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 3114 { 3115 FloatParts64 pa = int_to_float(a, scale, status); 3116 return float32_round_pack_canonical(&pa, status); 3117 } 3118 3119 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 3120 { 
3121 return int64_to_float32_scalbn(a, scale, status); 3122 } 3123 3124 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 3125 { 3126 return int64_to_float32_scalbn(a, scale, status); 3127 } 3128 3129 float32 int64_to_float32(int64_t a, float_status *status) 3130 { 3131 return int64_to_float32_scalbn(a, 0, status); 3132 } 3133 3134 float32 int32_to_float32(int32_t a, float_status *status) 3135 { 3136 return int64_to_float32_scalbn(a, 0, status); 3137 } 3138 3139 float32 int16_to_float32(int16_t a, float_status *status) 3140 { 3141 return int64_to_float32_scalbn(a, 0, status); 3142 } 3143 3144 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 3145 { 3146 FloatParts64 pa = int_to_float(a, scale, status); 3147 return float64_round_pack_canonical(&pa, status); 3148 } 3149 3150 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 3151 { 3152 return int64_to_float64_scalbn(a, scale, status); 3153 } 3154 3155 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 3156 { 3157 return int64_to_float64_scalbn(a, scale, status); 3158 } 3159 3160 float64 int64_to_float64(int64_t a, float_status *status) 3161 { 3162 return int64_to_float64_scalbn(a, 0, status); 3163 } 3164 3165 float64 int32_to_float64(int32_t a, float_status *status) 3166 { 3167 return int64_to_float64_scalbn(a, 0, status); 3168 } 3169 3170 float64 int16_to_float64(int16_t a, float_status *status) 3171 { 3172 return int64_to_float64_scalbn(a, 0, status); 3173 } 3174 3175 /* 3176 * Returns the result of converting the two's complement integer `a' 3177 * to the bfloat16 format. 
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

/* Narrower signed types widen to int64_t losslessly, so they all
 * funnel through the 64-bit variant. */
bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

/* The plain conversions are the scalbn forms with a scale of 0. */
bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/*
 * Decompose the unsigned integer `a', pre-scaled by 2**`scale', into
 * canonical FloatParts64 form.  The result is always non-negative;
 * rounding is left to the caller's round_pack_canonical step.
 * `status' is unused here; it is kept for call-shape symmetry with
 * int_to_float.
 */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp the scale so that r.exp cannot overflow the integer
         * backing FloatParts64.exp, while still letting the rounding
         * step see an exponent large enough to overflow to infinity. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        /* Normalize: move the most significant set bit up to the
         * implicit-one position and compensate in the exponent. */
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
3338 */ 3339 3340 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3341 { 3342 FloatParts64 pa = uint_to_float(a, scale, status); 3343 return bfloat16_round_pack_canonical(&pa, status); 3344 } 3345 3346 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3347 { 3348 return uint64_to_bfloat16_scalbn(a, scale, status); 3349 } 3350 3351 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3352 { 3353 return uint64_to_bfloat16_scalbn(a, scale, status); 3354 } 3355 3356 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3357 { 3358 return uint64_to_bfloat16_scalbn(a, 0, status); 3359 } 3360 3361 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3362 { 3363 return uint64_to_bfloat16_scalbn(a, 0, status); 3364 } 3365 3366 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3367 { 3368 return uint64_to_bfloat16_scalbn(a, 0, status); 3369 } 3370 3371 /* Float Min/Max */ 3372 /* min() and max() functions. These can't be implemented as 3373 * 'compare and pick one input' because that would mishandle 3374 * NaNs and +0 vs -0. 3375 * 3376 * minnum() and maxnum() functions. These are similar to the min() 3377 * and max() functions but if one of the arguments is a QNaN and 3378 * the other is numerical then the numerical argument is returned. 3379 * SNaNs will get quietened before being returned. 3380 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3381 * and maxNum() operations. min() and max() are the typical min/max 3382 * semantics provided by many CPUs which predate that specification. 3383 * 3384 * minnummag() and maxnummag() functions correspond to minNumMag() 3385 * and minNumMag() from the IEEE-754 2008. 
 */
/*
 * Shared kernel for all min/max flavours on decomposed values.
 * `ismin' selects min vs max, `ieee' selects IEEE 754-2008
 * minNum/maxNum NaN handling (a single quiet NaN loses to a number),
 * and `ismag' compares magnitudes first, falling back to the signed
 * compare only when the magnitudes are exactly equal.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map the non-NaN classes onto comparable exponents:
         * infinity orders above every finite value and zero below,
         * so INT_MAX/INT_MIN stand in for them. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare: decide on |a| vs |b| alone unless the
         * magnitudes tie, in which case fall through to the signed
         * compare below. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* Same sign: compare magnitudes, inverting the pick for
             * negative values (larger magnitude is smaller). */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs (this also orders -0 below +0): the
             * negative operand is the minimum. */
            return a.sign ^ ismin ? b : a;
        }
    }
}

/* Instantiate the public min/max entry points for a float size.
 * NOTE(review): the parameter `isiee' is presumably a typo for
 * `isieee'; it is purely internal to the macro. */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

/* Same instantiation for bfloat16, which has no sz suffix scheme. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX

/* Floating point compare
 */
/*
 * Shared comparison kernel on decomposed values.  Returns a
 * FloatRelation.  Raises float_flag_invalid for any NaN operand when
 * `is_quiet' is false, and only for signaling NaNs when it is true.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign; against a non-zero
     * operand only that operand's sign matters. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Both finite, non-zero, same sign: compare exponents first,
     * fractions on a tie, inverting the sense for negatives. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Instantiate the softfloat comparison helpers for a float size. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: use the host's ISO C comparison macros, which
 * do not raise on quiet NaNs; the unordered case falls through to the
 * softfloat path so that flags are raised correctly. */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* Same hardfloat fast path as f32_compare, for double precision. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always go through softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
 */
/*
 * Scale the decomposed value `a' by 2**n.  NaNs are funnelled through
 * parts_return_nan (quieting SNaNs and raising flags as needed);
 * zeros and infinities pass through unchanged, since only
 * float_class_normal carries a meaningful exponent.
 */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

/*
 * Square root of the decomposed value `a'.  `p' supplies the target
 * format's frac_shift so only as many result bits as the format can
 * round from are computed.  Special cases: NaN -> NaN handling,
 * +-0 -> +-0, negative -> invalid + default NaN, +inf -> +inf.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative non-zero value is invalid. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast path: when the host FPU is usable and the input is a
 * non-negative zero or normal number, defer to the host's sqrtf();
 * anything else (NaN, inf, denormal, negative) takes the soft path so
 * that flags and NaN propagation follow the guest semantics. */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

/* Same hardfloat fast path, double precision. */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

/* parts_default_nan yields the fraction left-justified at the
 * decomposed position; shift it back down into the format's own
 * fraction field before packing the raw representation. */
float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3962 *----------------------------------------------------------------------------*/ 3963 3964 float16 float16_silence_nan(float16 a, float_status *status) 3965 { 3966 FloatParts64 p; 3967 3968 float16_unpack_raw(&p, a); 3969 p.frac <<= float16_params.frac_shift; 3970 parts_silence_nan(&p, status); 3971 p.frac >>= float16_params.frac_shift; 3972 return float16_pack_raw(&p); 3973 } 3974 3975 float32 float32_silence_nan(float32 a, float_status *status) 3976 { 3977 FloatParts64 p; 3978 3979 float32_unpack_raw(&p, a); 3980 p.frac <<= float32_params.frac_shift; 3981 parts_silence_nan(&p, status); 3982 p.frac >>= float32_params.frac_shift; 3983 return float32_pack_raw(&p); 3984 } 3985 3986 float64 float64_silence_nan(float64 a, float_status *status) 3987 { 3988 FloatParts64 p; 3989 3990 float64_unpack_raw(&p, a); 3991 p.frac <<= float64_params.frac_shift; 3992 parts_silence_nan(&p, status); 3993 p.frac >>= float64_params.frac_shift; 3994 return float64_pack_raw(&p); 3995 } 3996 3997 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3998 { 3999 FloatParts64 p; 4000 4001 bfloat16_unpack_raw(&p, a); 4002 p.frac <<= bfloat16_params.frac_shift; 4003 parts_silence_nan(&p, status); 4004 p.frac >>= bfloat16_params.frac_shift; 4005 return bfloat16_pack_raw(&p); 4006 } 4007 4008 float128 float128_silence_nan(float128 a, float_status *status) 4009 { 4010 FloatParts128 p; 4011 4012 float128_unpack_raw(&p, a); 4013 frac_shl(&p, float128_params.frac_shift); 4014 parts_silence_nan(&p, status); 4015 frac_shr(&p, float128_params.frac_shift); 4016 return float128_pack_raw(&p); 4017 } 4018 4019 /*---------------------------------------------------------------------------- 4020 | If `a' is denormal and we are in flush-to-zero mode then set the 4021 | input-denormal exception and return zero. Otherwise just return the value. 
4022 *----------------------------------------------------------------------------*/ 4023 4024 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 4025 { 4026 if (p.exp == 0 && p.frac != 0) { 4027 float_raise(float_flag_input_denormal, status); 4028 return true; 4029 } 4030 4031 return false; 4032 } 4033 4034 float16 float16_squash_input_denormal(float16 a, float_status *status) 4035 { 4036 if (status->flush_inputs_to_zero) { 4037 FloatParts64 p; 4038 4039 float16_unpack_raw(&p, a); 4040 if (parts_squash_denormal(p, status)) { 4041 return float16_set_sign(float16_zero, p.sign); 4042 } 4043 } 4044 return a; 4045 } 4046 4047 float32 float32_squash_input_denormal(float32 a, float_status *status) 4048 { 4049 if (status->flush_inputs_to_zero) { 4050 FloatParts64 p; 4051 4052 float32_unpack_raw(&p, a); 4053 if (parts_squash_denormal(p, status)) { 4054 return float32_set_sign(float32_zero, p.sign); 4055 } 4056 } 4057 return a; 4058 } 4059 4060 float64 float64_squash_input_denormal(float64 a, float_status *status) 4061 { 4062 if (status->flush_inputs_to_zero) { 4063 FloatParts64 p; 4064 4065 float64_unpack_raw(&p, a); 4066 if (parts_squash_denormal(p, status)) { 4067 return float64_set_sign(float64_zero, p.sign); 4068 } 4069 } 4070 return a; 4071 } 4072 4073 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 4074 { 4075 if (status->flush_inputs_to_zero) { 4076 FloatParts64 p; 4077 4078 bfloat16_unpack_raw(&p, a); 4079 if (parts_squash_denormal(p, status)) { 4080 return bfloat16_set_sign(bfloat16_zero, p.sign); 4081 } 4082 } 4083 return a; 4084 } 4085 4086 /*---------------------------------------------------------------------------- 4087 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 4088 | and 7, and returns the properly rounded 32-bit integer corresponding to the 4089 | input. If `zSign' is 1, the input is negated before being converted to an 4090 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer.  However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose how much to add below bit 7 before truncating: half an
     * ulp for the nearest modes, a whole ulp minus one for directed
     * rounding away from the truncation direction. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the result bit (bit 7) is even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie (roundBits == 0x40) under nearest-even: clear the low
     * bit so the result is even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if the rounded magnitude exceeds 32 bits, or if the
     * sign of the truncated int32 disagrees with the requested sign. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* `absZ1' holds the fraction below the binary point; decide
     * whether it forces an increment of the integer part `absZ0'. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Fraction >= 1/2 when its top bit is set. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie under nearest-even: force the result even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    /* Sign of the truncated int64 disagreeing with zSign means the
     * magnitude did not fit. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/* NOTE(review): the declared return type is int64_t, but the value
 * returned is the unsigned result (up to UINT64_MAX); callers
 * reinterpret the bits. */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrap of the 64-bit integer part is unsigned overflow. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie under nearest-even: force the result even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative input can only round to unsigned 0. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading one up to bit 23 (8 = 32 - 24 header bits). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose how much to add below bit 7 before truncating the 7
     * extra significand bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the result bit (bit 7) is even. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned cast folds negative zExp into the large-value
     * range, so one test catches both overflow and underflow. */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -!overflow_to_inf is all-ones when not overflowing to
             * infinity, which packs as the largest finite number. */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the LSB to round to even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
4391 *----------------------------------------------------------------------------*/ 4392 4393 static float32 4394 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4395 float_status *status) 4396 { 4397 int8_t shiftCount; 4398 4399 shiftCount = clz32(zSig) - 1; 4400 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4401 status); 4402 4403 } 4404 4405 /*---------------------------------------------------------------------------- 4406 | Normalizes the subnormal double-precision floating-point value represented 4407 | by the denormalized significand `aSig'. The normalized exponent and 4408 | significand are stored at the locations pointed to by `zExpPtr' and 4409 | `zSigPtr', respectively. 4410 *----------------------------------------------------------------------------*/ 4411 4412 static void 4413 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4414 { 4415 int8_t shiftCount; 4416 4417 shiftCount = clz64(aSig) - 11; 4418 *zSigPtr = aSig<<shiftCount; 4419 *zExpPtr = 1 - shiftCount; 4420 4421 } 4422 4423 /*---------------------------------------------------------------------------- 4424 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4425 | double-precision floating-point value, returning the result. After being 4426 | shifted into the proper positions, the three fields are simply added 4427 | together to form the result. This means that any integer portion of `zSig' 4428 | will be added into the exponent. Since a properly normalized significand 4429 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4430 | than the desired result exponent whenever `zSig' is a complete, normalized 4431 | significand. 
4432 *----------------------------------------------------------------------------*/ 4433 4434 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4435 { 4436 4437 return make_float64( 4438 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4439 4440 } 4441 4442 /*---------------------------------------------------------------------------- 4443 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4444 | and significand `zSig', and returns the proper double-precision floating- 4445 | point value corresponding to the abstract input. Ordinarily, the abstract 4446 | value is simply rounded and packed into the double-precision format, with 4447 | the inexact exception raised if the abstract input cannot be represented 4448 | exactly. However, if the abstract value is too large, the overflow and 4449 | inexact exceptions are raised and an infinity or maximal finite value is 4450 | returned. If the abstract value is too small, the input value is rounded to 4451 | a subnormal number, and the underflow and inexact exceptions are raised if 4452 | the abstract input cannot be represented exactly as a subnormal double- 4453 | precision floating-point number. 4454 | The input significand `zSig' has its binary point between bits 62 4455 | and 61, which is 10 bits to the left of the usual location. This shifted 4456 | significand must be normalized or smaller. If `zSig' is not normalized, 4457 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4458 | and it must not require rounding. In the usual case that `zSig' is 4459 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4460 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4461 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The bottom 10 bits of zSig are guard/round/sticky bits below the
     * final 53-bit significand; 0x200 is half an ulp, 0x3ff rounds away
     * from zero. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the resulting ulp bit would otherwise be 0. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* Unsigned compare catches both zExp >= 0x7FD (overflow candidates)
     * and zExp < 0 (underflow candidates) in one test. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: infinity when rounding moves away from zero,
             * else the largest finite number (always the largest finite
             * number for round-to-odd). */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess before rounding, or after rounding when the
             * increment cannot carry the value back into normal range. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: a tie (guard bit set, no sticky bits) clears the
     * result's low bit. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Left-justify zSig so its top set bit lands in bit 62, charging the
     * shift to the exponent, then round and pack. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4566 *----------------------------------------------------------------------------*/ 4567 4568 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4569 uint64_t *zSigPtr) 4570 { 4571 int8_t shiftCount; 4572 4573 shiftCount = clz64(aSig); 4574 *zSigPtr = aSig<<shiftCount; 4575 *zExpPtr = 1 - shiftCount; 4576 } 4577 4578 /*---------------------------------------------------------------------------- 4579 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4580 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4581 | and returns the proper extended double-precision floating-point value 4582 | corresponding to the abstract input. Ordinarily, the abstract value is 4583 | rounded and packed into the extended double-precision format, with the 4584 | inexact exception raised if the abstract input cannot be represented 4585 | exactly. However, if the abstract value is too large, the overflow and 4586 | inexact exceptions are raised and an infinity or maximal finite value is 4587 | returned. If the abstract value is too small, the input value is rounded to 4588 | a subnormal number, and the underflow and inexact exceptions are raised if 4589 | the abstract input cannot be represented exactly as a subnormal extended 4590 | double-precision floating-point number. 4591 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4592 | number of bits as single or double precision, respectively. Otherwise, the 4593 | result is rounded to the full precision of the extended double-precision 4594 | format. 4595 | The input significand must be normalized or smaller. If the input 4596 | significand is not normalized, `zExp' must be 0; in that case, the result 4597 | returned is a subnormal number, and it must not require rounding. The 4598 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4599 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* For reduced precision (32 or 64), rounding happens within zSig0 at
     * the bit position selected by roundMask; anything else falls through
     * to the full 64-bit-significand path at precision80. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold zSig1 into zSig0's sticky bit; zSig1 plays no further part. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare catches both overflow (zExp > 0x7FFE) and
     * underflow (zExp <= 0) candidates in one test. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess before rounding, or after rounding when the
             * increment does not carry out of zSig0. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into bit 63 means the result became normal. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even at reduced precision: widen the mask so the
             * result's ulp bit is cleared too. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of zSig0: renormalize to the explicit integer bit. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the round/sticky bits below zSig0. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Rounding toward zero (or away from the overflow direction)
             * yields the largest finite value instead of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the rounding decision: zSig1 just changed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even: exact tie clears the low bit. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out: renormalize to the explicit integer bit. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word and charge 64 bits
     * to the exponent. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4834 *----------------------------------------------------------------------------*/ 4835 4836 static inline uint64_t extractFloat128Frac0( float128 a ) 4837 { 4838 4839 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4840 4841 } 4842 4843 /*---------------------------------------------------------------------------- 4844 | Returns the exponent bits of the quadruple-precision floating-point value 4845 | `a'. 4846 *----------------------------------------------------------------------------*/ 4847 4848 static inline int32_t extractFloat128Exp( float128 a ) 4849 { 4850 4851 return ( a.high>>48 ) & 0x7FFF; 4852 4853 } 4854 4855 /*---------------------------------------------------------------------------- 4856 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4857 *----------------------------------------------------------------------------*/ 4858 4859 static inline bool extractFloat128Sign(float128 a) 4860 { 4861 return a.high >> 63; 4862 } 4863 4864 /*---------------------------------------------------------------------------- 4865 | Normalizes the subnormal quadruple-precision floating-point value 4866 | represented by the denormalized significand formed by the concatenation of 4867 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4868 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4869 | significand are stored at the location pointed to by `zSig0Ptr', and the 4870 | least significant 64 bits of the normalized significand are stored at the 4871 | location pointed to by `zSig1Ptr'. 
4872 *----------------------------------------------------------------------------*/ 4873 4874 static void 4875 normalizeFloat128Subnormal( 4876 uint64_t aSig0, 4877 uint64_t aSig1, 4878 int32_t *zExpPtr, 4879 uint64_t *zSig0Ptr, 4880 uint64_t *zSig1Ptr 4881 ) 4882 { 4883 int8_t shiftCount; 4884 4885 if ( aSig0 == 0 ) { 4886 shiftCount = clz64(aSig1) - 15; 4887 if ( shiftCount < 0 ) { 4888 *zSig0Ptr = aSig1>>( - shiftCount ); 4889 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4890 } 4891 else { 4892 *zSig0Ptr = aSig1<<shiftCount; 4893 *zSig1Ptr = 0; 4894 } 4895 *zExpPtr = - shiftCount - 63; 4896 } 4897 else { 4898 shiftCount = clz64(aSig0) - 15; 4899 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4900 *zExpPtr = 1 - shiftCount; 4901 } 4902 4903 } 4904 4905 /*---------------------------------------------------------------------------- 4906 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4907 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4908 | floating-point value, returning the result. After being shifted into the 4909 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4910 | added together to form the most significant 32 bits of the result. This 4911 | means that any integer portion of `zSig0' will be added into the exponent. 4912 | Since a properly normalized significand will have an integer portion equal 4913 | to 1, the `zExp' input should be 1 less than the desired result exponent 4914 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4915 | significand. 
4916 *----------------------------------------------------------------------------*/ 4917 4918 static inline float128 4919 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4920 { 4921 float128 z; 4922 4923 z.low = zSig1; 4924 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4925 return z; 4926 } 4927 4928 /*---------------------------------------------------------------------------- 4929 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4930 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4931 | and `zSig2', and returns the proper quadruple-precision floating-point value 4932 | corresponding to the abstract input. Ordinarily, the abstract value is 4933 | simply rounded and packed into the quadruple-precision format, with the 4934 | inexact exception raised if the abstract input cannot be represented 4935 | exactly. However, if the abstract value is too large, the overflow and 4936 | inexact exceptions are raised and an infinity or maximal finite value is 4937 | returned. If the abstract value is too small, the input value is rounded to 4938 | a subnormal number, and the underflow and inexact exceptions are raised if 4939 | the abstract input cannot be represented exactly as a subnormal quadruple- 4940 | precision floating-point number. 4941 | The input significand must be normalized or smaller. If the input 4942 | significand is not normalized, `zExp' must be 0; in that case, the result 4943 | returned is a subnormal number, and it must not require rounding. In the 4944 | usual case that the input significand is normalized, `zExp' must be 1 less 4945 | than the ``true'' floating-point exponent. The handling of underflow and 4946 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the round/sticky bits below the 113-bit significand in
     * zSig0:zSig1; decide whether they round the result up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the resulting ulp bit would otherwise be 0. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches both zExp >= 0x7FFD (overflow candidates)
     * and zExp < 0 (underflow candidates) in one test. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Rounding toward zero (or away from the overflow direction,
             * or round-to-odd) yields the largest finite value instead
             * of infinity. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess before rounding, or after rounding when the
             * increment cannot carry the value back into normal range. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the rounding decision: zSig1/zSig2 just changed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even: an exact tie clears the result's low bit. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 5065 | to the abstract input. This routine is just like `roundAndPackFloat128' 5066 | except that the input significand has fewer bits and does not have to be 5067 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 5068 | point exponent. 5069 *----------------------------------------------------------------------------*/ 5070 5071 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 5072 uint64_t zSig0, uint64_t zSig1, 5073 float_status *status) 5074 { 5075 int8_t shiftCount; 5076 uint64_t zSig2; 5077 5078 if ( zSig0 == 0 ) { 5079 zSig0 = zSig1; 5080 zSig1 = 0; 5081 zExp -= 64; 5082 } 5083 shiftCount = clz64(zSig0) - 15; 5084 if ( 0 <= shiftCount ) { 5085 zSig2 = 0; 5086 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5087 } 5088 else { 5089 shift128ExtraRightJamming( 5090 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 5091 } 5092 zExp -= shiftCount; 5093 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 5094 5095 } 5096 5097 5098 /*---------------------------------------------------------------------------- 5099 | Returns the result of converting the 32-bit two's complement integer `a' 5100 | to the extended double-precision floating-point format. The conversion 5101 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5102 | Arithmetic. 5103 *----------------------------------------------------------------------------*/ 5104 5105 floatx80 int32_to_floatx80(int32_t a, float_status *status) 5106 { 5107 bool zSign; 5108 uint32_t absA; 5109 int8_t shiftCount; 5110 uint64_t zSig; 5111 5112 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5113 zSign = ( a < 0 ); 5114 absA = zSign ? 
- a : a; 5115 shiftCount = clz32(absA) + 32; 5116 zSig = absA; 5117 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 5118 5119 } 5120 5121 /*---------------------------------------------------------------------------- 5122 | Returns the result of converting the 32-bit two's complement integer `a' to 5123 | the quadruple-precision floating-point format. The conversion is performed 5124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5125 *----------------------------------------------------------------------------*/ 5126 5127 float128 int32_to_float128(int32_t a, float_status *status) 5128 { 5129 bool zSign; 5130 uint32_t absA; 5131 int8_t shiftCount; 5132 uint64_t zSig0; 5133 5134 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5135 zSign = ( a < 0 ); 5136 absA = zSign ? - a : a; 5137 shiftCount = clz32(absA) + 17; 5138 zSig0 = absA; 5139 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5140 5141 } 5142 5143 /*---------------------------------------------------------------------------- 5144 | Returns the result of converting the 64-bit two's complement integer `a' 5145 | to the extended double-precision floating-point format. The conversion 5146 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5147 | Arithmetic. 5148 *----------------------------------------------------------------------------*/ 5149 5150 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5151 { 5152 bool zSign; 5153 uint64_t absA; 5154 int8_t shiftCount; 5155 5156 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5157 zSign = ( a < 0 ); 5158 absA = zSign ? 
- a : a; 5159 shiftCount = clz64(absA); 5160 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5161 5162 } 5163 5164 /*---------------------------------------------------------------------------- 5165 | Returns the result of converting the 64-bit two's complement integer `a' to 5166 | the quadruple-precision floating-point format. The conversion is performed 5167 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5168 *----------------------------------------------------------------------------*/ 5169 5170 float128 int64_to_float128(int64_t a, float_status *status) 5171 { 5172 bool zSign; 5173 uint64_t absA; 5174 int8_t shiftCount; 5175 int32_t zExp; 5176 uint64_t zSig0, zSig1; 5177 5178 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5179 zSign = ( a < 0 ); 5180 absA = zSign ? - a : a; 5181 shiftCount = clz64(absA) + 49; 5182 zExp = 0x406E - shiftCount; 5183 if ( 64 <= shiftCount ) { 5184 zSig1 = 0; 5185 zSig0 = absA; 5186 shiftCount -= 64; 5187 } 5188 else { 5189 zSig1 = absA; 5190 zSig0 = 0; 5191 } 5192 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5193 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5194 5195 } 5196 5197 /*---------------------------------------------------------------------------- 5198 | Returns the result of converting the 64-bit unsigned integer `a' 5199 | to the quadruple-precision floating-point format. The conversion is performed 5200 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5201 *----------------------------------------------------------------------------*/ 5202 5203 float128 uint64_to_float128(uint64_t a, float_status *status) 5204 { 5205 if (a == 0) { 5206 return float128_zero; 5207 } 5208 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 5209 } 5210 5211 /*---------------------------------------------------------------------------- 5212 | Returns the result of converting the single-precision floating-point value 5213 | `a' to the extended double-precision floating-point format. The conversion 5214 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5215 | Arithmetic. 5216 *----------------------------------------------------------------------------*/ 5217 5218 floatx80 float32_to_floatx80(float32 a, float_status *status) 5219 { 5220 bool aSign; 5221 int aExp; 5222 uint32_t aSig; 5223 5224 a = float32_squash_input_denormal(a, status); 5225 aSig = extractFloat32Frac( a ); 5226 aExp = extractFloat32Exp( a ); 5227 aSign = extractFloat32Sign( a ); 5228 if ( aExp == 0xFF ) { 5229 if (aSig) { 5230 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 5231 status); 5232 return floatx80_silence_nan(res, status); 5233 } 5234 return packFloatx80(aSign, 5235 floatx80_infinity_high, 5236 floatx80_infinity_low); 5237 } 5238 if ( aExp == 0 ) { 5239 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5240 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5241 } 5242 aSig |= 0x00800000; 5243 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 5244 5245 } 5246 5247 /*---------------------------------------------------------------------------- 5248 | Returns the remainder of the single-precision floating-point value `a' 5249 | with respect to the corresponding value `b'. The operation is performed 5250 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* a is NaN or infinity; rem(Inf, x) and rem(x, NaN) are handled
         * here: NaN operands propagate, rem(Inf, b) is invalid. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* rem(finite, Inf) is a exactly. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem by zero is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 32/64-bit division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively, retiring up to
         * 62 quotient bits per pass with an estimated 128/64 division. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Back the estimate off by 2 so it can never exceed the true
             * quotient. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient, then pick whichever of the last two
     * partial remainders gives the IEEE round-to-nearest remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Reciprocal factorials 1/1! .. 1/15! as float64, used as the Taylor
 * coefficients of e^x in float32_exp2() below. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = 0, 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^0 = 1 exactly; subnormal inputs fall through to the
         * approximation below. */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* Evaluate e^(a*ln2) = 1 + sum x^i / i! with xn tracking x^(i+1). */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+0) = -Inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fraction
     * bits are produced one at a time by repeated squaring. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert via the common NaN format and quieten. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Explicit integer bit, 53-bit significand left-aligned to 64 bits
     * (53 + 11 = 64); 0x3C00 rebiases: 0x3FFF - 0x3FF. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5524 *----------------------------------------------------------------------------*/ 5525 5526 float64 float64_rem(float64 a, float64 b, float_status *status) 5527 { 5528 bool aSign, zSign; 5529 int aExp, bExp, expDiff; 5530 uint64_t aSig, bSig; 5531 uint64_t q, alternateASig; 5532 int64_t sigMean; 5533 5534 a = float64_squash_input_denormal(a, status); 5535 b = float64_squash_input_denormal(b, status); 5536 aSig = extractFloat64Frac( a ); 5537 aExp = extractFloat64Exp( a ); 5538 aSign = extractFloat64Sign( a ); 5539 bSig = extractFloat64Frac( b ); 5540 bExp = extractFloat64Exp( b ); 5541 if ( aExp == 0x7FF ) { 5542 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5543 return propagateFloat64NaN(a, b, status); 5544 } 5545 float_raise(float_flag_invalid, status); 5546 return float64_default_nan(status); 5547 } 5548 if ( bExp == 0x7FF ) { 5549 if (bSig) { 5550 return propagateFloat64NaN(a, b, status); 5551 } 5552 return a; 5553 } 5554 if ( bExp == 0 ) { 5555 if ( bSig == 0 ) { 5556 float_raise(float_flag_invalid, status); 5557 return float64_default_nan(status); 5558 } 5559 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5560 } 5561 if ( aExp == 0 ) { 5562 if ( aSig == 0 ) return a; 5563 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5564 } 5565 expDiff = aExp - bExp; 5566 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5567 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5568 if ( expDiff < 0 ) { 5569 if ( expDiff < -1 ) return a; 5570 aSig >>= 1; 5571 } 5572 q = ( bSig <= aSig ); 5573 if ( q ) aSig -= bSig; 5574 expDiff -= 64; 5575 while ( 0 < expDiff ) { 5576 q = estimateDiv128To64( aSig, 0, bSig ); 5577 q = ( 2 < q ) ? q - 2 : 0; 5578 aSig = - ( ( bSig>>2 ) * q ); 5579 expDiff -= 62; 5580 } 5581 expDiff += 64; 5582 if ( 0 < expDiff ) { 5583 q = estimateDiv128To64( aSig, 0, bSig ); 5584 q = ( 2 < q ) ? 
q - 2 : 0; 5585 q >>= 64 - expDiff; 5586 bSig >>= 2; 5587 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5588 } 5589 else { 5590 aSig >>= 2; 5591 bSig >>= 2; 5592 } 5593 do { 5594 alternateASig = aSig; 5595 ++q; 5596 aSig -= bSig; 5597 } while ( 0 <= (int64_t) aSig ); 5598 sigMean = aSig + alternateASig; 5599 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5600 aSig = alternateASig; 5601 } 5602 zSign = ( (int64_t) aSig < 0 ); 5603 if ( zSign ) aSig = - aSig; 5604 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5605 5606 } 5607 5608 /*---------------------------------------------------------------------------- 5609 | Returns the binary log of the double-precision floating-point value `a'. 5610 | The operation is performed according to the IEC/IEEE Standard for Binary 5611 | Floating-Point Arithmetic. 5612 *----------------------------------------------------------------------------*/ 5613 float64 float64_log2(float64 a, float_status *status) 5614 { 5615 bool aSign, zSign; 5616 int aExp; 5617 uint64_t aSig, aSig0, aSig1, zSig, i; 5618 a = float64_squash_input_denormal(a, status); 5619 5620 aSig = extractFloat64Frac( a ); 5621 aExp = extractFloat64Exp( a ); 5622 aSign = extractFloat64Sign( a ); 5623 5624 if ( aExp == 0 ) { 5625 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5626 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5627 } 5628 if ( aSign ) { 5629 float_raise(float_flag_invalid, status); 5630 return float64_default_nan(status); 5631 } 5632 if ( aExp == 0x7FF ) { 5633 if (aSig) { 5634 return propagateFloat64NaN(a, float64_zero, status); 5635 } 5636 return a; 5637 } 5638 5639 aExp -= 0x3FF; 5640 aSig |= UINT64_C(0x0010000000000000); 5641 zSign = aExp < 0; 5642 zSig = (uint64_t)aExp << 52; 5643 for (i = 1LL << 51; i > 0; i >>= 1) { 5644 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5645 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5646 if ( aSig & UINT64_C(0x0020000000000000) ) { 5647 aSig >>= 1; 5648 zSig |= i; 5649 } 
5650 } 5651 5652 if ( zSign ) 5653 zSig = -zSig; 5654 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5655 } 5656 5657 /*---------------------------------------------------------------------------- 5658 | Returns the result of converting the extended double-precision floating- 5659 | point value `a' to the 32-bit two's complement integer format. The 5660 | conversion is performed according to the IEC/IEEE Standard for Binary 5661 | Floating-Point Arithmetic---which means in particular that the conversion 5662 | is rounded according to the current rounding mode. If `a' is a NaN, the 5663 | largest positive integer is returned. Otherwise, if the conversion 5664 | overflows, the largest integer with the same sign as `a' is returned. 5665 *----------------------------------------------------------------------------*/ 5666 5667 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5668 { 5669 bool aSign; 5670 int32_t aExp, shiftCount; 5671 uint64_t aSig; 5672 5673 if (floatx80_invalid_encoding(a)) { 5674 float_raise(float_flag_invalid, status); 5675 return 1 << 31; 5676 } 5677 aSig = extractFloatx80Frac( a ); 5678 aExp = extractFloatx80Exp( a ); 5679 aSign = extractFloatx80Sign( a ); 5680 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5681 shiftCount = 0x4037 - aExp; 5682 if ( shiftCount <= 0 ) shiftCount = 1; 5683 shift64RightJamming( aSig, shiftCount, &aSig ); 5684 return roundAndPackInt32(aSign, aSig, status); 5685 5686 } 5687 5688 /*---------------------------------------------------------------------------- 5689 | Returns the result of converting the extended double-precision floating- 5690 | point value `a' to the 32-bit two's complement integer format. The 5691 | conversion is performed according to the IEC/IEEE Standard for Binary 5692 | Floating-Point Arithmetic, except that the conversion is always rounded 5693 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5694 | Otherwise, if the conversion overflows, the largest integer with the same 5695 | sign as `a' is returned. 5696 *----------------------------------------------------------------------------*/ 5697 5698 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5699 { 5700 bool aSign; 5701 int32_t aExp, shiftCount; 5702 uint64_t aSig, savedASig; 5703 int32_t z; 5704 5705 if (floatx80_invalid_encoding(a)) { 5706 float_raise(float_flag_invalid, status); 5707 return 1 << 31; 5708 } 5709 aSig = extractFloatx80Frac( a ); 5710 aExp = extractFloatx80Exp( a ); 5711 aSign = extractFloatx80Sign( a ); 5712 if ( 0x401E < aExp ) { 5713 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5714 goto invalid; 5715 } 5716 else if ( aExp < 0x3FFF ) { 5717 if (aExp || aSig) { 5718 float_raise(float_flag_inexact, status); 5719 } 5720 return 0; 5721 } 5722 shiftCount = 0x403E - aExp; 5723 savedASig = aSig; 5724 aSig >>= shiftCount; 5725 z = aSig; 5726 if ( aSign ) z = - z; 5727 if ( ( z < 0 ) ^ aSign ) { 5728 invalid: 5729 float_raise(float_flag_invalid, status); 5730 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5731 } 5732 if ( ( aSig<<shiftCount ) != savedASig ) { 5733 float_raise(float_flag_inexact, status); 5734 } 5735 return z; 5736 5737 } 5738 5739 /*---------------------------------------------------------------------------- 5740 | Returns the result of converting the extended double-precision floating- 5741 | point value `a' to the 64-bit two's complement integer format. The 5742 | conversion is performed according to the IEC/IEEE Standard for Binary 5743 | Floating-Point Arithmetic---which means in particular that the conversion 5744 | is rounded according to the current rounding mode. If `a' is a NaN, 5745 | the largest positive integer is returned. Otherwise, if the conversion 5746 | overflows, the largest integer with the same sign as `a' is returned. 
5747 *----------------------------------------------------------------------------*/ 5748 5749 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5750 { 5751 bool aSign; 5752 int32_t aExp, shiftCount; 5753 uint64_t aSig, aSigExtra; 5754 5755 if (floatx80_invalid_encoding(a)) { 5756 float_raise(float_flag_invalid, status); 5757 return 1ULL << 63; 5758 } 5759 aSig = extractFloatx80Frac( a ); 5760 aExp = extractFloatx80Exp( a ); 5761 aSign = extractFloatx80Sign( a ); 5762 shiftCount = 0x403E - aExp; 5763 if ( shiftCount <= 0 ) { 5764 if ( shiftCount ) { 5765 float_raise(float_flag_invalid, status); 5766 if (!aSign || floatx80_is_any_nan(a)) { 5767 return INT64_MAX; 5768 } 5769 return INT64_MIN; 5770 } 5771 aSigExtra = 0; 5772 } 5773 else { 5774 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5775 } 5776 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5777 5778 } 5779 5780 /*---------------------------------------------------------------------------- 5781 | Returns the result of converting the extended double-precision floating- 5782 | point value `a' to the 64-bit two's complement integer format. The 5783 | conversion is performed according to the IEC/IEEE Standard for Binary 5784 | Floating-Point Arithmetic, except that the conversion is always rounded 5785 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5786 | Otherwise, if the conversion overflows, the largest integer with the same 5787 | sign as `a' is returned. 
5788 *----------------------------------------------------------------------------*/ 5789 5790 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5791 { 5792 bool aSign; 5793 int32_t aExp, shiftCount; 5794 uint64_t aSig; 5795 int64_t z; 5796 5797 if (floatx80_invalid_encoding(a)) { 5798 float_raise(float_flag_invalid, status); 5799 return 1ULL << 63; 5800 } 5801 aSig = extractFloatx80Frac( a ); 5802 aExp = extractFloatx80Exp( a ); 5803 aSign = extractFloatx80Sign( a ); 5804 shiftCount = aExp - 0x403E; 5805 if ( 0 <= shiftCount ) { 5806 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5807 if ( ( a.high != 0xC03E ) || aSig ) { 5808 float_raise(float_flag_invalid, status); 5809 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5810 return INT64_MAX; 5811 } 5812 } 5813 return INT64_MIN; 5814 } 5815 else if ( aExp < 0x3FFF ) { 5816 if (aExp | aSig) { 5817 float_raise(float_flag_inexact, status); 5818 } 5819 return 0; 5820 } 5821 z = aSig>>( - shiftCount ); 5822 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5823 float_raise(float_flag_inexact, status); 5824 } 5825 if ( aSign ) z = - z; 5826 return z; 5827 5828 } 5829 5830 /*---------------------------------------------------------------------------- 5831 | Returns the result of converting the extended double-precision floating- 5832 | point value `a' to the single-precision floating-point format. The 5833 | conversion is performed according to the IEC/IEEE Standard for Binary 5834 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert payload via the common NaN format, quietened. */
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit significand to 31 bits (jamming discarded bits)
     * and rebias the exponent: 0x3F81 = 0x3FFF - 0x7F + 1, the +1
     * compensating for the explicit integer bit kept in aSig. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert payload via the common NaN format, quietened. */
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        /* Infinity. */
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Drop one bit (jammed) to make room for float64 rounding, and rebias:
     * 0x3C01 = 0x3FFF - 0x3FF + 1, the +1 compensating for the explicit
     * integer bit kept in the significand. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        /* NaN: convert payload via the common NaN format, quietened. */
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Drop the explicit integer bit (aSig<<1) and place the remaining 63
     * fraction bits at the top of the 112-bit float128 fraction. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/ 5950 5951 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5952 { 5953 bool aSign; 5954 int32_t aExp; 5955 uint64_t lastBitMask, roundBitsMask; 5956 floatx80 z; 5957 5958 if (floatx80_invalid_encoding(a)) { 5959 float_raise(float_flag_invalid, status); 5960 return floatx80_default_nan(status); 5961 } 5962 aExp = extractFloatx80Exp( a ); 5963 if ( 0x403E <= aExp ) { 5964 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5965 return propagateFloatx80NaN(a, a, status); 5966 } 5967 return a; 5968 } 5969 if ( aExp < 0x3FFF ) { 5970 if ( ( aExp == 0 ) 5971 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5972 return a; 5973 } 5974 float_raise(float_flag_inexact, status); 5975 aSign = extractFloatx80Sign( a ); 5976 switch (status->float_rounding_mode) { 5977 case float_round_nearest_even: 5978 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5979 ) { 5980 return 5981 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5982 } 5983 break; 5984 case float_round_ties_away: 5985 if (aExp == 0x3FFE) { 5986 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5987 } 5988 break; 5989 case float_round_down: 5990 return 5991 aSign ? 5992 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5993 : packFloatx80( 0, 0, 0 ); 5994 case float_round_up: 5995 return 5996 aSign ? 
packFloatx80( 1, 0, 0 ) 5997 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5998 5999 case float_round_to_zero: 6000 break; 6001 default: 6002 g_assert_not_reached(); 6003 } 6004 return packFloatx80( aSign, 0, 0 ); 6005 } 6006 lastBitMask = 1; 6007 lastBitMask <<= 0x403E - aExp; 6008 roundBitsMask = lastBitMask - 1; 6009 z = a; 6010 switch (status->float_rounding_mode) { 6011 case float_round_nearest_even: 6012 z.low += lastBitMask>>1; 6013 if ((z.low & roundBitsMask) == 0) { 6014 z.low &= ~lastBitMask; 6015 } 6016 break; 6017 case float_round_ties_away: 6018 z.low += lastBitMask >> 1; 6019 break; 6020 case float_round_to_zero: 6021 break; 6022 case float_round_up: 6023 if (!extractFloatx80Sign(z)) { 6024 z.low += roundBitsMask; 6025 } 6026 break; 6027 case float_round_down: 6028 if (extractFloatx80Sign(z)) { 6029 z.low += roundBitsMask; 6030 } 6031 break; 6032 default: 6033 abort(); 6034 } 6035 z.low &= ~ roundBitsMask; 6036 if ( z.low == 0 ) { 6037 ++z.high; 6038 z.low = UINT64_C(0x8000000000000000); 6039 } 6040 if (z.low != a.low) { 6041 float_raise(float_flag_inexact, status); 6042 } 6043 return z; 6044 6045 } 6046 6047 /*---------------------------------------------------------------------------- 6048 | Returns the result of adding the absolute values of the extended double- 6049 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 6050 | negated before being returned. `zSign' is ignored if the result is a NaN. 6051 | The addition is performed according to the IEC/IEEE Standard for Binary 6052 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* |a| has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* |b| has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum may carry out of bit 63. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Bit 63 set means no carry occurred; the sum is already normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the significand: shift right and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only in round-down mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b, keeping zSign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*---------------------------------------------------------------------------- 6197 | Returns the result of adding the extended double-precision floating-point 6198 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6199 | Standard for Binary Floating-Point Arithmetic. 6200 *----------------------------------------------------------------------------*/ 6201 6202 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6203 { 6204 bool aSign, bSign; 6205 6206 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6207 float_raise(float_flag_invalid, status); 6208 return floatx80_default_nan(status); 6209 } 6210 aSign = extractFloatx80Sign( a ); 6211 bSign = extractFloatx80Sign( b ); 6212 if ( aSign == bSign ) { 6213 return addFloatx80Sigs(a, b, aSign, status); 6214 } 6215 else { 6216 return subFloatx80Sigs(a, b, aSign, status); 6217 } 6218 6219 } 6220 6221 /*---------------------------------------------------------------------------- 6222 | Returns the result of subtracting the extended double-precision floating- 6223 | point values `a' and `b'. The operation is performed according to the 6224 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* a - b: like signs subtract magnitudes, unlike signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is an invalid operation. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * Inf is an invalid operation. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product of two normalized sigs may need one bit of renormalization. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf = 0. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; otherwise raise divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-shift the dividend so the estimated quotient fits in 64 bits. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    /* Correct the (possibly too large) quotient estimate downward. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    /* Only refine the low quotient word when it could affect rounding. */
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any remaining remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf rem anything is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite rem Inf = the finite operand. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: quotient is 0 (mod) or possibly +-1 (rem). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Long division, 62 quotient bits per iteration. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        /* estimateDiv128To64 may overshoot by up to 2; never undershoot. */
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: fewer than 64 quotient bits remain. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: round the quotient to nearest even instead of
           truncating, flipping the remainder's sign when needed. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6528 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6529 *----------------------------------------------------------------------------*/ 6530 6531 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6532 { 6533 uint64_t quotient; 6534 return floatx80_modrem(a, b, false, "ient, status); 6535 } 6536 6537 /*---------------------------------------------------------------------------- 6538 | Returns the remainder of the extended double-precision floating-point value 6539 | `a' with respect to the corresponding value `b', with the quotient truncated 6540 | toward zero. 6541 *----------------------------------------------------------------------------*/ 6542 6543 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6544 { 6545 uint64_t quotient; 6546 return floatx80_modrem(a, b, true, "ient, status); 6547 } 6548 6549 /*---------------------------------------------------------------------------- 6550 | Returns the square root of the extended double-precision floating-point 6551 | value `a'. The operation is performed according to the IEC/IEEE Standard 6552 | for Binary Floating-Point Arithmetic. 6553 *----------------------------------------------------------------------------*/ 6554 6555 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6556 { 6557 bool aSign; 6558 int32_t aExp, zExp; 6559 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6560 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6561 6562 if (floatx80_invalid_encoding(a)) { 6563 float_raise(float_flag_invalid, status); 6564 return floatx80_default_nan(status); 6565 } 6566 aSig0 = extractFloatx80Frac( a ); 6567 aExp = extractFloatx80Exp( a ); 6568 aSign = extractFloatx80Sign( a ); 6569 if ( aExp == 0x7FFF ) { 6570 if ((uint64_t)(aSig0 << 1)) { 6571 return propagateFloatx80NaN(a, a, status); 6572 } 6573 if ( ! 
aSign ) return a; 6574 goto invalid; 6575 } 6576 if ( aSign ) { 6577 if ( ( aExp | aSig0 ) == 0 ) return a; 6578 invalid: 6579 float_raise(float_flag_invalid, status); 6580 return floatx80_default_nan(status); 6581 } 6582 if ( aExp == 0 ) { 6583 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6584 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6585 } 6586 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6587 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6588 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6589 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6590 doubleZSig0 = zSig0<<1; 6591 mul64To128( zSig0, zSig0, &term0, &term1 ); 6592 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6593 while ( (int64_t) rem0 < 0 ) { 6594 --zSig0; 6595 doubleZSig0 -= 2; 6596 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6597 } 6598 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6599 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6600 if ( zSig1 == 0 ) zSig1 = 1; 6601 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6602 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6603 mul64To128( zSig1, zSig1, &term2, &term3 ); 6604 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6605 while ( (int64_t) rem1 < 0 ) { 6606 --zSig1; 6607 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6608 term3 |= 1; 6609 term2 |= doubleZSig0; 6610 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6611 } 6612 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6613 } 6614 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6615 zSig0 |= doubleZSig0; 6616 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6617 0, zExp, zSig0, zSig1, status); 6618 } 6619 6620 /*---------------------------------------------------------------------------- 6621 | Returns the result of converting the quadruple-precision floating-point 6622 | value `a' to the 32-bit two's complement integer format. 
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Force a NaN's sign positive so the largest positive int is returned. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low fraction word into the sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into the sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaN's sign forced positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Shifting back reveals whether any fraction bits were discarded. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Overflow; -2^63 itself is still representable. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Special case: values in [-2^63, -2^63 + 2^Whatever) truncate
               to INT64_MIN without being invalid. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1 truncates to 0; inexact unless exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values with magnitude > 1/2 cannot round to an unsigned
       result: invalid, saturating to UINT64_MAX for NaN, 0 otherwise. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        /* Make the implicit integer bit explicit for normal numbers. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

/* Same conversion with rounding forced to truncation; the caller's rounding
   mode is saved and restored around the shared implementation. */
uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6865 | Returns the result of converting the quadruple-precision floating-point 6866 | value `a' to the 32-bit unsigned integer format. The conversion 6867 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6868 | Arithmetic except that the conversion is always rounded toward zero. 6869 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6870 | if the conversion overflows, the largest unsigned integer is returned. 6871 | If 'a' is negative, the value is rounded and zero is returned; negative 6872 | values that do not round to zero will raise the inexact exception. 6873 *----------------------------------------------------------------------------*/ 6874 6875 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6876 { 6877 uint64_t v; 6878 uint32_t res; 6879 int old_exc_flags = get_float_exception_flags(status); 6880 6881 v = float128_to_uint64_round_to_zero(a, status); 6882 if (v > 0xffffffff) { 6883 res = 0xffffffff; 6884 } else { 6885 return v; 6886 } 6887 set_float_exception_flags(old_exc_flags, status); 6888 float_raise(float_flag_invalid, status); 6889 return res; 6890 } 6891 6892 /*---------------------------------------------------------------------------- 6893 | Returns the result of converting the quadruple-precision floating-point value 6894 | `a' to the 32-bit unsigned integer format. The conversion is 6895 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6896 | Arithmetic---which means in particular that the conversion is rounded 6897 | according to the current rounding mode. If `a' is a NaN, the largest 6898 | positive integer is returned. If the conversion overflows, the 6899 | largest unsigned integer is returned. If 'a' is negative, the value is 6900 | rounded and zero is returned; negative values that do not round to zero 6901 | will raise the inexact exception. 
*----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Out of 32-bit range: drop the 64-bit conversion's flags, then
       report invalid and saturate. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Convert through the common NaN form, then quiet the result. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Align the 113-bit quad significand to the 64-bit floatx80 one. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* The fraction bits to discard all lie in the low word (or there
           are none at all). */
        if ( 0x406F <= aExp ) {
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            /* Already integral (or Inf). */
            return a;
        }
        /* lastBitMask marks the last bit kept; note it is 0 when the
           boundary falls exactly between the two 64-bit words. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* Rounding boundary is the word boundary: round via the
                   top bit of the low word. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is 0 or +-1 depending on the rounding mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                    aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                    aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Discarded fraction bits span the low word and part of the high
           word; the low word is dropped entirely. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the bit pattern means the result is inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf rem anything is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite rem Inf = the finite operand. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: `a' is already the remainder. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Long division, 61 quotient bits per iteration. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        /* The estimate may overshoot by up to 4; never undershoot. */
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step with fewer than 64 quotient bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the remainder goes negative, keeping the last
       non-negative value as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Pick whichever candidate is nearer (ties go to even quotient). */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1,
        (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
7229 *----------------------------------------------------------------------------*/ 7230 7231 float128 float128_sqrt(float128 a, float_status *status) 7232 { 7233 bool aSign; 7234 int32_t aExp, zExp; 7235 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7236 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7237 7238 aSig1 = extractFloat128Frac1( a ); 7239 aSig0 = extractFloat128Frac0( a ); 7240 aExp = extractFloat128Exp( a ); 7241 aSign = extractFloat128Sign( a ); 7242 if ( aExp == 0x7FFF ) { 7243 if (aSig0 | aSig1) { 7244 return propagateFloat128NaN(a, a, status); 7245 } 7246 if ( ! aSign ) return a; 7247 goto invalid; 7248 } 7249 if ( aSign ) { 7250 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7251 invalid: 7252 float_raise(float_flag_invalid, status); 7253 return float128_default_nan(status); 7254 } 7255 if ( aExp == 0 ) { 7256 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7257 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7258 } 7259 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7260 aSig0 |= UINT64_C(0x0001000000000000); 7261 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7262 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7263 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7264 doubleZSig0 = zSig0<<1; 7265 mul64To128( zSig0, zSig0, &term0, &term1 ); 7266 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7267 while ( (int64_t) rem0 < 0 ) { 7268 --zSig0; 7269 doubleZSig0 -= 2; 7270 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7271 } 7272 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7273 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7274 if ( zSig1 == 0 ) zSig1 = 1; 7275 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7276 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7277 mul64To128( zSig1, zSig1, &term2, &term3 ); 7278 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7279 while ( (int64_t) rem1 < 0 ) { 7280 --zSig1; 7281 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7282 term3 |= 1; 7283 term2 |= doubleZSig0; 7284 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7285 } 7286 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7287 } 7288 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7289 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7290 7291 } 7292 7293 static inline FloatRelation 7294 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 7295 float_status *status) 7296 { 7297 bool aSign, bSign; 7298 7299 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7300 float_raise(float_flag_invalid, status); 7301 return float_relation_unordered; 7302 } 7303 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7304 ( extractFloatx80Frac( a )<<1 ) ) || 7305 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7306 ( extractFloatx80Frac( b )<<1 ) )) { 7307 if (!is_quiet || 7308 floatx80_is_signaling_nan(a, status) || 7309 floatx80_is_signaling_nan(b, status)) { 7310 float_raise(float_flag_invalid, status); 7311 } 7312 return float_relation_unordered; 7313 } 7314 aSign = extractFloatx80Sign( a ); 7315 bSign = extractFloatx80Sign( b ); 7316 if ( aSign != bSign ) { 7317 7318 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7319 ( ( a.low | b.low ) == 0 ) ) { 7320 /* zero case */ 7321 return float_relation_equal; 7322 } else { 7323 return 1 - (2 * aSign); 7324 } 7325 } else { 7326 /* Normalize pseudo-denormals before comparison. 
*/ 7327 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 7328 ++a.high; 7329 } 7330 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 7331 ++b.high; 7332 } 7333 if (a.low == b.low && a.high == b.high) { 7334 return float_relation_equal; 7335 } else { 7336 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7337 } 7338 } 7339 } 7340 7341 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7342 { 7343 return floatx80_compare_internal(a, b, 0, status); 7344 } 7345 7346 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 7347 float_status *status) 7348 { 7349 return floatx80_compare_internal(a, b, 1, status); 7350 } 7351 7352 static inline FloatRelation 7353 float128_compare_internal(float128 a, float128 b, bool is_quiet, 7354 float_status *status) 7355 { 7356 bool aSign, bSign; 7357 7358 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7359 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7360 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7361 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7362 if (!is_quiet || 7363 float128_is_signaling_nan(a, status) || 7364 float128_is_signaling_nan(b, status)) { 7365 float_raise(float_flag_invalid, status); 7366 } 7367 return float_relation_unordered; 7368 } 7369 aSign = extractFloat128Sign( a ); 7370 bSign = extractFloat128Sign( b ); 7371 if ( aSign != bSign ) { 7372 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7373 /* zero case */ 7374 return float_relation_equal; 7375 } else { 7376 return 1 - (2 * aSign); 7377 } 7378 } else { 7379 if (a.low == b.low && a.high == b.high) { 7380 return float_relation_equal; 7381 } else { 7382 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7383 } 7384 } 7385 } 7386 7387 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 7388 { 7389 return float128_compare_internal(a, b, 0, status); 7390 } 7391 7392 FloatRelation 
float128_compare_quiet(float128 a, float128 b, 7393 float_status *status) 7394 { 7395 return float128_compare_internal(a, b, 1, status); 7396 } 7397 7398 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7399 { 7400 bool aSign; 7401 int32_t aExp; 7402 uint64_t aSig; 7403 7404 if (floatx80_invalid_encoding(a)) { 7405 float_raise(float_flag_invalid, status); 7406 return floatx80_default_nan(status); 7407 } 7408 aSig = extractFloatx80Frac( a ); 7409 aExp = extractFloatx80Exp( a ); 7410 aSign = extractFloatx80Sign( a ); 7411 7412 if ( aExp == 0x7FFF ) { 7413 if ( aSig<<1 ) { 7414 return propagateFloatx80NaN(a, a, status); 7415 } 7416 return a; 7417 } 7418 7419 if (aExp == 0) { 7420 if (aSig == 0) { 7421 return a; 7422 } 7423 aExp++; 7424 } 7425 7426 if (n > 0x10000) { 7427 n = 0x10000; 7428 } else if (n < -0x10000) { 7429 n = -0x10000; 7430 } 7431 7432 aExp += n; 7433 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7434 aSign, aExp, aSig, 0, status); 7435 } 7436 7437 float128 float128_scalbn(float128 a, int n, float_status *status) 7438 { 7439 bool aSign; 7440 int32_t aExp; 7441 uint64_t aSig0, aSig1; 7442 7443 aSig1 = extractFloat128Frac1( a ); 7444 aSig0 = extractFloat128Frac0( a ); 7445 aExp = extractFloat128Exp( a ); 7446 aSign = extractFloat128Sign( a ); 7447 if ( aExp == 0x7FFF ) { 7448 if ( aSig0 | aSig1 ) { 7449 return propagateFloat128NaN(a, a, status); 7450 } 7451 return a; 7452 } 7453 if (aExp != 0) { 7454 aSig0 |= UINT64_C(0x0001000000000000); 7455 } else if (aSig0 == 0 && aSig1 == 0) { 7456 return a; 7457 } else { 7458 aExp++; 7459 } 7460 7461 if (n > 0x10000) { 7462 n = 0x10000; 7463 } else if (n < -0x10000) { 7464 n = -0x10000; 7465 } 7466 7467 aExp += n - 1; 7468 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7469 , status); 7470 7471 } 7472 7473 static void __attribute__((constructor)) softfloat_init(void) 7474 { 7475 union_float64 ua, ub, uc, ur; 7476 7477 if (QEMU_NO_HARDFLOAT) { 7478 return; 
7479 } 7480 /* 7481 * Test that the host's FMA is not obviously broken. For example, 7482 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7483 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7484 */ 7485 ua.s = 0x0020000000000001ULL; 7486 ub.s = 0x3ca0000000000000ULL; 7487 uc.s = 0x0020000000000000ULL; 7488 ur.h = fma(ua.h, ub.h, uc.h); 7489 if (ur.s != 0x0020000000000001ULL) { 7490 force_soft_fma = true; 7491 } 7492 } 7493