/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the SoftFloat-2a license unless specifically
 * indicated otherwise.
 */

/*
===============================================================================
This C source file is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2a.

Written by John R. Hauser.  This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704.  Funding was partially provided by the
National Science Foundation under grant MIP-9311980.  The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================
*/

/* BSD licensing:
 * Copyright (c) 2006, Fabrice Bellard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

/* softfloat (and in particular the code in softfloat-specialize.c.inc) is
 * target-dependent and needs the TARGET_* macros.
 */
#include "qemu/osdep.h"
#include <math.h>
#include "qemu/bitops.h"
#include "fpu/softfloat.h"

/* We only need stdlib for abort() */

/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations.  (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
#include "fpu/softfloat-macros.h"

/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */
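
/*
 * Illustrative sketch of the scheme described above (the identifiers
 * operands_are_zero_or_normal and soft_add are placeholders, not real
 * helpers in this file):
 *
 *     if (can_use_fpu(status) && operands_are_zero_or_normal) {
 *         result = a + b;              // host FPU computes the result
 *         // overflow/underflow inferred from the result's magnitude
 *     } else {
 *         result = soft_add(a, b, status);   // full softfloat fallback
 *     }
 *
 * float32_gen2() and float64_gen2() below implement exactly this shape,
 * with the pre/post callbacks supplying the operand and result checks.
 */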

#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3
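
/*
 * For reference, an illustrative expansion of the token pasting above:
 * GEN_INPUT_FLUSH1(float32_input_flush1, float32) generates
 *
 *     static inline void float32_input_flush1(float32 *a, float_status *s)
 *     {
 *         if (likely(!s->flush_inputs_to_zero)) {
 *             return;
 *         }
 *         float32_input_flush__nocheck(a, s);
 *     }
 *
 * so each flavor costs nothing when flush-to-zero of inputs is disabled.
 */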

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions. Each combination of number of inputs and
 * float size gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}
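
/*
 * Example (illustrative, assuming a caller-owned float_status): a guest
 * workload that leaves the inexact flag set stays on the fast path:
 *
 *     set_float_exception_flags(float_flag_inexact, &status);
 *     r = float64_add(x, y, &status);      // eligible for the host FPU
 *
 * whereas a guest that clears the flags before each operation (as PPC
 * targets do) fails can_use_fpu() and always takes the softfloat path.
 */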

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on
 * the compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
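
/*
 * Note on the magnitude checks above (illustrative reasoning, not new
 * behavior): fabs(r) <= DBL_MIN (resp. FLT_MIN) holds exactly when the
 * result is zero, subnormal, or the smallest normal, i.e. the cases in
 * which underflow may have occurred and softfloat must recompute with
 * correct flags. post() then excludes results that are legitimately
 * zero, e.g. 0 + 0 for add/sub/mul, so those stay on the fast path.
 */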

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything at or above float_class_qnan
 * is a NaN, so cls >= float_class_qnan tests for any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};
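
/*
 * Example (illustrative): combining the class masks of two operands as
 *
 *     int ab_mask = float_cmask(a->cls) | float_cmask(b->cls);
 *
 * lets multi-operand predicates become single comparisons, e.g.
 * "ab_mask & float_cmask_anynan" for "either operand is a NaN", or
 * "ab_mask == float_cmask_infzero" for the invalid Inf * 0 case that
 * parts_pick_nan_muladd() has to handle.
 */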

/* Flags for parts_minmax. */
enum {
    /* Set for minimum; clear for maximum. */
    minmax_ismin = 1,
    /* Set for the IEEE 754-2008 minNum() and maxNum() operations. */
    minmax_isnum = 2,
    /* Set for the IEEE 754-2008 minNumMag() and maxNumMag() operations. */
    minmax_ismag = 4,
};

/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation. In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
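
/*
 * Example (illustrative): after canonicalization, the float32 value 1.0f
 * decomposes to roughly
 *
 *     FloatParts64 p = {
 *         .cls  = float_class_normal,
 *         .sign = false,
 *         .exp  = 0,                        // unbiased: 127 - 127
 *         .frac = DECOMPOSED_IMPLICIT_BIT,  // integer bit at bit 63
 *     };
 *
 * The same decomposed layout serves every source format, which is what
 * lets the parts64/128/256 routines below be format-agnostic.
 */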

/*
 * Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                               \
    .exp_size       = E,                                 \
    .exp_bias       = ((1 << E) - 1) >> 1,               \
    .exp_max        = (1 << E) - 1,                      \
    .frac_size      = F,                                 \
    .frac_shift     = (-F - 1) & 63,                     \
    .frac_lsb       = 1ull << ((-F - 1) & 63),           \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),           \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,     \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize. */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}
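
/*
 * Worked example (illustrative): FLOAT_PARAMS(8, 23) for float32 yields
 *
 *     exp_bias       = 127
 *     exp_max        = 255
 *     frac_shift     = (-23 - 1) & 63 = 40
 *     frac_lsb       = 1ull << 40
 *     frac_lsbm1     = 1ull << 39
 *     round_mask     = (1ull << 40) - 1
 *
 * i.e. the 23-bit fraction is left-justified so it ends just below
 * DECOMPOSED_BINARY_POINT, leaving 40 low guard bits for rounding.
 */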

/* Pack a float from parts, but do not canonicalize. */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}
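
/*
 * Illustrative round-trip, using only the helpers above:
 *
 *     FloatParts64 p;
 *     float32_unpack_raw(&p, in);        // raw sign/exp/frac fields
 *     out = float32_pack_raw(&p);        // out == in, bit for bit
 *
 * unpack_raw64() and pack_raw64() are exact inverses; only the
 * canonicalize/uncanon steps below actually change the representation.
 */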

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)  PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)  PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)   PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)  PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)
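
/*
 * Example (illustrative): the PARTS_GENERIC_* wrappers dispatch on the
 * pointer type of the first argument, so
 *
 *     FloatParts64 p64;   parts_canonicalize(&p64, s, &float32_params);
 *     FloatParts128 p128; parts_canonicalize(&p128, s, &float128_params);
 *
 * resolve at compile time to parts64_canonicalize() and
 * parts128_canonicalize() respectively, with no runtime cost.
 */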

static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)

static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_div(A, B, S) \
    PARTS_GENERIC_64_128(div, A)(A, B, S)

static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,
                                        int scale, int frac_size);
static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,
                                         int scale, int frac_size);

#define parts_round_to_int_normal(A, R, C, F) \
    PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)

static void parts64_round_to_int(FloatParts64 *a, FloatRoundMode rm,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt);
static void parts128_round_to_int(FloatParts128 *a, FloatRoundMode r,
                                  int scale, float_status *s,
                                  const FloatFmt *fmt);

#define parts_round_to_int(A, R, C, S, F) \
    PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)

static int64_t parts64_float_to_sint(FloatParts64 *p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s);
static int64_t parts128_float_to_sint(FloatParts128 *p, FloatRoundMode rmode,
                                      int scale, int64_t min, int64_t max,
                                      float_status *s);

#define parts_float_to_sint(P, R, Z, MN, MX, S) \
    PARTS_GENERIC_64_128(float_to_sint, P)(P, R, Z, MN, MX, S)

static uint64_t parts64_float_to_uint(FloatParts64 *p, FloatRoundMode rmode,
                                      int scale, uint64_t max,
                                      float_status *s);
static uint64_t parts128_float_to_uint(FloatParts128 *p, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s);

#define parts_float_to_uint(P, R, Z, M, S) \
    PARTS_GENERIC_64_128(float_to_uint, P)(P, R, Z, M, S)

static void parts64_sint_to_float(FloatParts64 *p, int64_t a,
                                  int scale, float_status *s);
static void parts128_sint_to_float(FloatParts128 *p, int64_t a,
                                   int scale, float_status *s);

#define parts_sint_to_float(P, I, Z, S) \
    PARTS_GENERIC_64_128(sint_to_float, P)(P, I, Z, S)

static void parts64_uint_to_float(FloatParts64 *p, uint64_t a,
                                  int scale, float_status *s);
static void parts128_uint_to_float(FloatParts128 *p, uint64_t a,
                                   int scale, float_status *s);

#define parts_uint_to_float(P, I, Z, S) \
    PARTS_GENERIC_64_128(uint_to_float, P)(P, I, Z, S)

static FloatParts64 *parts64_minmax(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, int flags);
static FloatParts128 *parts128_minmax(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, int flags);

#define parts_minmax(A, B, S, F) \
    PARTS_GENERIC_64_128(minmax, A)(A, B, S, F)

static int parts64_compare(FloatParts64 *a, FloatParts64 *b,
                           float_status *s, bool q);
static int parts128_compare(FloatParts128 *a, FloatParts128 *b,
                            float_status *s, bool q);

#define parts_compare(A, B, S, Q) \
    PARTS_GENERIC_64_128(compare, A)(A, B, S, Q)
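
/*
 * Example (illustrative): the minmax flags compose, so IEEE 754-2008
 * minNum() corresponds roughly to
 * parts_minmax(a, b, s, minmax_ismin | minmax_isnum) and maxNumMag() to
 * parts_minmax(a, b, s, minmax_isnum | minmax_ismag); without
 * minmax_isnum, quiet NaN operands propagate instead of being ignored.
 */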

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)
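
/*
 * Example (illustrative): frac128_add() is a plain ripple-carry chain.
 * Adding { .frac_hi = ~0ull, .frac_lo = ~0ull } to { 0, 1 } produces
 * { 0, 0 } and returns true; the carry-out tells the caller that the
 * fraction overflowed one bit and the exponent must be adjusted.
 */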

static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
{
    uint64_t n1, n0, r, q;
    bool ret;

    /*
     * We want a 2*N / N-bit division to produce exactly an N-bit
     * result, so that we do not lose any precision and so that we
     * do not have to renormalize afterward.  If A.frac < B.frac,
     * then division would produce an (N-1)-bit result; shift A left
     * by one to produce an N-bit result, and return true to
     * decrement the exponent to match.
     *
     * The udiv_qrnnd algorithm that we're using requires normalization,
     * i.e. the msb of the denominator must be set, which is already true.
     */
    ret = a->frac < b->frac;
    if (ret) {
        n0 = a->frac;
        n1 = 0;
    } else {
        n0 = a->frac >> 1;
        n1 = a->frac << 63;
    }
    q = udiv_qrnnd(&r, n0, n1, b->frac);

    /* Set lsb if there is a remainder, to set inexact. */
    a->frac = q | (r != 0);

    return ret;
}

static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t q0, q1, a0, a1, b0, b1;
    uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
    bool ret = false;

    a0 = a->frac_hi, a1 = a->frac_lo;
    b0 = b->frac_hi, b1 = b->frac_lo;

    ret = lt128(a0, a1, b0, b1);
    if (!ret) {
        a1 = shr_double(a0, a1, 1);
        a0 = a0 >> 1;
    }

    /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
    q0 = estimateDiv128To64(a0, a1, b0);

    /*
     * Estimate is high because B1 was not included (unless B1 == 0).
     * Reduce quotient and increase remainder until remainder is non-negative.
     * This loop will execute 0 to 2 times.
     */
    mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
    sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
    while (r0 != 0) {
        q0--;
        add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
    }

    /* Repeat using the remainder, producing a second word of quotient. */
    q1 = estimateDiv128To64(r1, r2, b0);
    mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
    sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
    while (r1 != 0) {
        q1--;
        add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
    }

    /* Any remainder indicates inexact; set sticky bit. */
    q1 |= (r2 | r3) != 0;

    a->frac_hi = q0;
    a->frac_lo = q1;
    return ret;
}

#define frac_div(A, B)  FRAC_GENERIC_64_128(div, A)(A, B)

static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128_256(neg, A)(A)

static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}
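
/*
 * Example (illustrative): frac64_normalize() on frac = 1 finds 63 leading
 * zeros, shifts the bit up to DECOMPOSED_BINARY_POINT and returns 63 so
 * the caller can subtract it from the exponent; an all-zero significand
 * returns the full word width (64, 128 or 256 below), which callers use
 * to recognize a zero value.
 */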

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128_256(normalize, A)(A)

static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}
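
/*
 * Example (illustrative): the "jam" in shrjam is the IEEE sticky bit.
 * frac64_shrjam() on frac = 0x2 with c = 2 gives (0x2 >> 2) = 0, but the
 * bits shifted out are non-zero, so the result is 0 | 1 = 0x1: whatever
 * falls off the bottom is ORed into the lsb so that rounding still sees
 * that the value was inexact.
 */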

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)

static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)

#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef  N
#undef  W
#undef  partsN
#undef  FloatPartsN
#undef  FloatPartsW
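
/*
 * Illustrative view of the instantiation above: with N = 64 and W = 128,
 * softfloat-parts.c.inc expands e.g. partsN(mul) into parts64_mul(), and
 * FloatPartsW into FloatParts128 for the double-width intermediate
 * product; the N = 128 pass likewise widens into FloatParts256. This is
 * the C-preprocessor equivalent of a size-parameterized template.
 */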

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}

/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}
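
/*
 * Usage sketch (illustrative; float_status fields as in fpu/softfloat.h):
 *
 *     float_status st = {
 *         .float_rounding_mode = float_round_nearest_even,
 *     };
 *     float16 r = float16_add(a, b, &st);
 *     if (get_float_exception_flags(&st) & float_flag_inexact) {
 *         // propagate to the guest's FPSR/MXCSR-equivalent state
 *     }
 *
 * Every public entry point below follows the same unpack/operate/repack
 * pattern shown in float16_addsub().
 */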

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
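
/*
 * Example (illustrative): the muladd flags compose to express the usual
 * fused negate variants, e.g.
 *
 *     float32_muladd(a, b, c, float_muladd_negate_product, s)  // -(a*b) + c
 *     float32_muladd(a, b, c, float_muladd_negate_result, s)   // -(a*b + c)
 *
 * float_muladd_halve_result has no host-FPU equivalent, which is why it
 * is sent straight to the softfloat path above.
 */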

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for underflow or
     * overflow, since we know the addend is (normal || 0) and the
     * product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}
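
/*
 * The float_muladd_* flags compose in the obvious way; for example,
 * float32_muladd(a, b, c, float_muladd_negate_product, s) computes
 * c - a * b with a single rounding step.
 */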

/*
 * Division
 */

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}
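
/*
 * The pre hooks above admit the hardfloat path only when the dividend is
 * zero or normal and the divisor is normal, so division by zero or by a
 * denormal, and any NaN/Inf operand, always take the soft path where the
 * flags are raised precisely.  The post hooks let the generic dispatchers
 * (defined earlier in this file) send a suspiciously small host result
 * back to the soft path when the dividend was nonzero: a zero dividend
 * yields an exact zero, while a tiny quotient from a nonzero dividend
 * may have underflowed.
 */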

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_div(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * Usually this only needs to take care of raising invalid exceptions
 * and handling NaN conversions.
 */

static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
{
    switch (a->cls) {
    case float_class_qnan:
    case float_class_snan:
        /*
         * There is no NaN in the destination format.  Raise Invalid
         * and return a zero with the sign of the input NaN.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_zero;
        break;

    case float_class_inf:
        /*
         * There is no Inf in the destination format.  Raise Invalid
         * and return the maximum normal with the correct sign.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_normal;
        a->exp = float16_params_ahp.exp_max;
        a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift,
                                  float16_params_ahp.frac_size + 1);
        break;

    case float_class_normal:
    case float_class_zero:
        break;

    default:
        g_assert_not_reached();
    }
}
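
/*
 * The destination here is the Arm "alternative half-precision" (AHP)
 * format, which spends the all-ones exponent encoding on ordinary
 * numbers instead of Inf/NaN; hence NaNs collapse to a signed zero and
 * infinities to the maximum normal number, with Invalid raised in both
 * cases.
 */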

static void parts64_float_to_float(FloatParts64 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

static void parts128_float_to_float(FloatParts128 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

#define parts_float_to_float(P, S) \
    PARTS_GENERIC_64_128(float_to_float, P)(P, S)

static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
                                        float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;

    if (a->cls == float_class_normal) {
        frac_truncjam(a, b);
    } else if (is_nan(a->cls)) {
        /* Discard the low bits of the NaN. */
        a->frac = b->frac_hi;
        parts_return_nan(a, s);
    }
}

static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
                                       float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;
    frac_widen(a, b);

    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    FloatParts64 p;
    const FloatFmt *fmt;

    float32_unpack_canonical(&p, a, s);
    if (ieee) {
        parts_float_to_float(&p, s);
        fmt = &float16_params;
    } else {
        parts_float_to_ahp(&p, s);
        fmt = &float16_params_ahp;
    }
    return float16a_round_pack_canonical(&p, s, fmt);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    FloatParts64 p;
    const FloatFmt *fmt;

    float64_unpack_canonical(&p, a, s);
    if (ieee) {
        parts_float_to_float(&p, s);
        fmt = &float16_params;
    } else {
        parts_float_to_ahp(&p, s);
        fmt = &float16_params_ahp;
    }
    return float16a_round_pack_canonical(&p, s, fmt);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return bfloat16_round_pack_canonical(&p, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return bfloat16_round_pack_canonical(&p, s);
}

float32 float128_to_float32(float128 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float128_unpack_canonical(&p128, a, s);
    parts_float_to_float_narrow(&p64, &p128, s);
    return float32_round_pack_canonical(&p64, s);
}

float64 float128_to_float64(float128 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float128_unpack_canonical(&p128, a, s);
    parts_float_to_float_narrow(&p64, &p128, s);
    return float64_round_pack_canonical(&p64, s);
}

float128 float32_to_float128(float32 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float32_unpack_canonical(&p64, a, s);
    parts_float_to_float_widen(&p128, &p64, s);
    return float128_round_pack_canonical(&p128, s);
}

float128 float64_to_float128(float64 a, float_status *s)
{
    FloatParts64 p64;
    FloatParts128 p128;

    float64_unpack_canonical(&p64, a, s);
    parts_float_to_float_widen(&p128, &p64, s);
    return float128_round_pack_canonical(&p128, s);
}

/*
 * Round to integral value
 */

float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float16_params);
    return float16_round_pack_canonical(&p, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float32_params);
    return float32_round_pack_canonical(&p, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float64_params);
    return float64_round_pack_canonical(&p, s);
}

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &bfloat16_params);
    return bfloat16_round_pack_canonical(&p, s);
}

float128 float128_round_to_int(float128 a, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float128_params);
    return float128_round_pack_canonical(&p, s);
}

/*
 * Floating-point to signed integer conversions
 */
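
/*
 * In the *_scalbn converters below, the input is effectively multiplied
 * by 2**scale before being rounded to an integer, which lets targets
 * implement fixed-point conversions (e.g. scale = 16 yields an integer
 * with 16 fraction bits of the input) without a separate scaling step.
 */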

int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

static int32_t float128_to_int32_scalbn(float128 a, FloatRoundMode rmode,
                                        int scale, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

static int64_t float128_to_int64_scalbn(float128 a, FloatRoundMode rmode,
                                        int scale, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float128_to_int32(float128 a, float_status *s)
{
    return float128_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float128_to_int64(float128 a, float_status *s)
{
    return float128_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}
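
/*
 * The *_round_to_zero variants below truncate regardless of the rounding
 * mode in the status argument, matching C cast semantics:
 * float64_to_int64_round_to_zero(x, s) behaves like (int64_t)x for
 * in-range x.
 */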

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float128_to_int32_round_to_zero(float128 a, float_status *s)
{
    return float128_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float128_to_int64_round_to_zero(float128 a, float_status *s)
{
    return float128_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Floating-point to unsigned integer conversions
 */
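
/*
 * As with the signed conversions, out-of-range inputs raise the invalid
 * flag and saturate, here to [0, UINT*_MAX]; in particular, negative
 * inputs that do not round to zero are out of range and convert to 0.
 */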

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
}

static uint32_t float128_to_uint32_scalbn(float128 a, FloatRoundMode rmode,
                                          int scale, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s);
}

static uint64_t float128_to_uint64_scalbn(float128 a, FloatRoundMode rmode,
                                          int scale, float_status *s)
{
    FloatParts128 p;

    float128_unpack_canonical(&p, a, s);
    return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s);
}

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float128_to_uint32(float128 a, float_status *s)
{
    return float128_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float128_to_uint64(float128 a, float_status *s)
{
    return float128_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *s)
{
    return float128_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *s)
{
    return float128_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Signed integer to floating-point conversions
 */
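
/*
 * Note that integer sources wider than the destination significand
 * (e.g. an int64_t converted to float32, whose significand holds only
 * 24 bits) may not be exactly representable; such conversions round
 * according to the current mode and raise the inexact flag.
 */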

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return float16_round_pack_canonical(&p, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts64_sint_to_float(&p, a, scale, status);
    return float32_round_pack_canonical(&p, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return float64_round_pack_canonical(&p, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return bfloat16_round_pack_canonical(&p, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

float128 int64_to_float128(int64_t a, float_status *status)
{
    FloatParts128 p;

    parts_sint_to_float(&p, a, 0, status);
    return float128_round_pack_canonical(&p, status);
}

float128 int32_to_float128(int32_t a, float_status *status)
{
    return int64_to_float128(a, status);
}

/*
 * Unsigned integer to floating-point conversions
 */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_uint_to_float(&p, a, scale, status);
    return float16_round_pack_canonical(&p, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_uint_to_float(&p, a, scale, status);
    return float32_round_pack_canonical(&p, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_uint_to_float(&p, a, scale, status);
    return float64_round_pack_canonical(&p, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_uint_to_float(&p, a, scale, status);
    return bfloat16_round_pack_canonical(&p, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    FloatParts128 p;

    parts_uint_to_float(&p, a, 0, status);
    return float128_round_pack_canonical(&p, status);
}
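
/*
 * In the minmax helpers below, the flags select among the IEEE 754
 * operations: plain min/max propagate NaNs, the *num variants implement
 * IEEE 754-2008 minNum/maxNum (a quiet NaN in one operand yields the
 * other, numeric, operand), and the *nummag variants compare magnitudes
 * first, falling back to a value comparison on equal magnitudes.
 */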

/*
 * Minimum and maximum
 */

static float16 float16_minmax(float16 a, float16 b, float_status *s, int flags)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, s);
    float16_unpack_canonical(&pb, b, s);
    pr = parts_minmax(&pa, &pb, s, flags);

    return float16_round_pack_canonical(pr, s);
}

static bfloat16 bfloat16_minmax(bfloat16 a, bfloat16 b,
                                float_status *s, int flags)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    pr = parts_minmax(&pa, &pb, s, flags);

    return bfloat16_round_pack_canonical(pr, s);
}

static float32 float32_minmax(float32 a, float32 b, float_status *s, int flags)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, s);
    float32_unpack_canonical(&pb, b, s);
    pr = parts_minmax(&pa, &pb, s, flags);

    return float32_round_pack_canonical(pr, s);
}

static float64 float64_minmax(float64 a, float64 b, float_status *s, int flags)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, s);
    float64_unpack_canonical(&pb, b, s);
    pr = parts_minmax(&pa, &pb, s, flags);

    return float64_round_pack_canonical(pr, s);
}

static float128 float128_minmax(float128 a, float128 b,
                                float_status *s, int flags)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, s);
    float128_unpack_canonical(&pb, b, s);
    pr = parts_minmax(&pa, &pb, s, flags);

    return float128_round_pack_canonical(pr, s);
}

#define MINMAX_1(type, name, flags) \
    type type##_##name(type a, type b, float_status *s) \
    { return type##_minmax(a, b, s, flags); }

#define MINMAX_2(type) \
    MINMAX_1(type, max, 0)                                                \
    MINMAX_1(type, maxnum, minmax_isnum)                                  \
    MINMAX_1(type, maxnummag, minmax_isnum | minmax_ismag)                \
    MINMAX_1(type, min, minmax_ismin)                                     \
    MINMAX_1(type, minnum, minmax_ismin | minmax_isnum)                   \
    MINMAX_1(type, minnummag, minmax_ismin | minmax_isnum | minmax_ismag)

MINMAX_2(float16)
MINMAX_2(bfloat16)
MINMAX_2(float32)
MINMAX_2(float64)
MINMAX_2(float128)

#undef MINMAX_1
#undef MINMAX_2
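
/*
 * For illustration, MINMAX_1(float32, minnum, minmax_ismin | minmax_isnum)
 * above expands to:
 *
 *   float32 float32_minnum(float32 a, float32 b, float_status *s)
 *   { return float32_minmax(a, b, s, minmax_ismin | minmax_isnum); }
 */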

/*
 * Floating point compare
 */

static FloatRelation QEMU_FLATTEN
float16_do_compare(float16 a, float16 b, float_status *s, bool is_quiet)
{
    FloatParts64 pa, pb;

    float16_unpack_canonical(&pa, a, s);
    float16_unpack_canonical(&pb, b, s);
    return parts_compare(&pa, &pb, s, is_quiet);
}

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return float16_do_compare(a, b, s, false);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return float16_do_compare(a, b, s, true);
}

static FloatRelation QEMU_SOFTFLOAT_ATTR
float32_do_compare(float32 a, float32 b, float_status *s, bool is_quiet)
{
    FloatParts64 pa, pb;

    float32_unpack_canonical(&pa, a, s);
    float32_unpack_canonical(&pb, b, s);
    return parts_compare(&pa, &pb, s, is_quiet);
}

static FloatRelation QEMU_FLATTEN
float32_hs_compare(float32 xa, float32 xb, float_status *s, bool is_quiet)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /*
     * The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return float32_do_compare(ua.s, ub.s, s, is_quiet);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return float32_hs_compare(a, b, s, false);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return float32_hs_compare(a, b, s, true);
}
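
/*
 * The isgreaterequal()/isgreater()/isless() macros used by the
 * *_hs_compare helpers are the quiet C99 comparison macros: unlike the
 * ordinary relational operators, they do not raise host exceptions when
 * an operand is a quiet NaN, so the unordered case simply falls through
 * to the soft-float comparison, which raises the guest flags as required.
 */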
3477 */ 3478 soft: 3479 return float64_do_compare(ua.s, ub.s, s, is_quiet); 3480 } 3481 3482 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3483 { 3484 return float64_hs_compare(a, b, s, false); 3485 } 3486 3487 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3488 { 3489 return float64_hs_compare(a, b, s, true); 3490 } 3491 3492 static FloatRelation QEMU_FLATTEN 3493 bfloat16_do_compare(bfloat16 a, bfloat16 b, float_status *s, bool is_quiet) 3494 { 3495 FloatParts64 pa, pb; 3496 3497 bfloat16_unpack_canonical(&pa, a, s); 3498 bfloat16_unpack_canonical(&pb, b, s); 3499 return parts_compare(&pa, &pb, s, is_quiet); 3500 } 3501 3502 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s) 3503 { 3504 return bfloat16_do_compare(a, b, s, false); 3505 } 3506 3507 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s) 3508 { 3509 return bfloat16_do_compare(a, b, s, true); 3510 } 3511 3512 static FloatRelation QEMU_FLATTEN 3513 float128_do_compare(float128 a, float128 b, float_status *s, bool is_quiet) 3514 { 3515 FloatParts128 pa, pb; 3516 3517 float128_unpack_canonical(&pa, a, s); 3518 float128_unpack_canonical(&pb, b, s); 3519 return parts_compare(&pa, &pb, s, is_quiet); 3520 } 3521 3522 FloatRelation float128_compare(float128 a, float128 b, float_status *s) 3523 { 3524 return float128_do_compare(a, b, s, false); 3525 } 3526 3527 FloatRelation float128_compare_quiet(float128 a, float128 b, float_status *s) 3528 { 3529 return float128_do_compare(a, b, s, true); 3530 } 3531 3532 /* Multiply A by 2 raised to the power N. */ 3533 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3534 { 3535 if (unlikely(is_nan(a.cls))) { 3536 parts_return_nan(&a, s); 3537 } 3538 if (a.cls == float_class_normal) { 3539 /* The largest float type (even though not supported by FloatParts64) 3540 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3541 * still allows rounding to infinity, without allowing overflow 3542 * within the int32_t that backs FloatParts64.exp. 3543 */ 3544 n = MIN(MAX(n, -0x10000), 0x10000); 3545 a.exp += n; 3546 } 3547 return a; 3548 } 3549 3550 float16 float16_scalbn(float16 a, int n, float_status *status) 3551 { 3552 FloatParts64 pa, pr; 3553 3554 float16_unpack_canonical(&pa, a, status); 3555 pr = scalbn_decomposed(pa, n, status); 3556 return float16_round_pack_canonical(&pr, status); 3557 } 3558 3559 float32 float32_scalbn(float32 a, int n, float_status *status) 3560 { 3561 FloatParts64 pa, pr; 3562 3563 float32_unpack_canonical(&pa, a, status); 3564 pr = scalbn_decomposed(pa, n, status); 3565 return float32_round_pack_canonical(&pr, status); 3566 } 3567 3568 float64 float64_scalbn(float64 a, int n, float_status *status) 3569 { 3570 FloatParts64 pa, pr; 3571 3572 float64_unpack_canonical(&pa, a, status); 3573 pr = scalbn_decomposed(pa, n, status); 3574 return float64_round_pack_canonical(&pr, status); 3575 } 3576 3577 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3578 { 3579 FloatParts64 pa, pr; 3580 3581 bfloat16_unpack_canonical(&pa, a, status); 3582 pr = scalbn_decomposed(pa, n, status); 3583 return bfloat16_round_pack_canonical(&pr, status); 3584 } 3585 3586 /* 3587 * Square Root 3588 * 3589 * The old softfloat code did an approximation step before zeroing in 3590 * on the final result. 

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does, however, make the calculation slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /*
     * We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /*
     * Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /*
     * Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}
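
/*
 * The loop above is a textbook restoring, digit-by-digit square root:
 * each iteration tentatively sets the next result bit and accepts it if
 * the incrementally maintained trial subtraction still fits in the
 * remainder, so r_frac converges on the correctly truncated root one
 * bit at a time; any nonzero final remainder becomes the sticky bit.
 */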

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/
3752 *----------------------------------------------------------------------------*/ 3753 3754 float16 float16_default_nan(float_status *status) 3755 { 3756 FloatParts64 p; 3757 3758 parts_default_nan(&p, status); 3759 p.frac >>= float16_params.frac_shift; 3760 return float16_pack_raw(&p); 3761 } 3762 3763 float32 float32_default_nan(float_status *status) 3764 { 3765 FloatParts64 p; 3766 3767 parts_default_nan(&p, status); 3768 p.frac >>= float32_params.frac_shift; 3769 return float32_pack_raw(&p); 3770 } 3771 3772 float64 float64_default_nan(float_status *status) 3773 { 3774 FloatParts64 p; 3775 3776 parts_default_nan(&p, status); 3777 p.frac >>= float64_params.frac_shift; 3778 return float64_pack_raw(&p); 3779 } 3780 3781 float128 float128_default_nan(float_status *status) 3782 { 3783 FloatParts128 p; 3784 3785 parts_default_nan(&p, status); 3786 frac_shr(&p, float128_params.frac_shift); 3787 return float128_pack_raw(&p); 3788 } 3789 3790 bfloat16 bfloat16_default_nan(float_status *status) 3791 { 3792 FloatParts64 p; 3793 3794 parts_default_nan(&p, status); 3795 p.frac >>= bfloat16_params.frac_shift; 3796 return bfloat16_pack_raw(&p); 3797 } 3798 3799 /*---------------------------------------------------------------------------- 3800 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 3801 *----------------------------------------------------------------------------*/ 3802 3803 float16 float16_silence_nan(float16 a, float_status *status) 3804 { 3805 FloatParts64 p; 3806 3807 float16_unpack_raw(&p, a); 3808 p.frac <<= float16_params.frac_shift; 3809 parts_silence_nan(&p, status); 3810 p.frac >>= float16_params.frac_shift; 3811 return float16_pack_raw(&p); 3812 } 3813 3814 float32 float32_silence_nan(float32 a, float_status *status) 3815 { 3816 FloatParts64 p; 3817 3818 float32_unpack_raw(&p, a); 3819 p.frac <<= float32_params.frac_shift; 3820 parts_silence_nan(&p, status); 3821 p.frac >>= float32_params.frac_shift; 3822 return float32_pack_raw(&p); 3823 } 3824 3825 float64 float64_silence_nan(float64 a, float_status *status) 3826 { 3827 FloatParts64 p; 3828 3829 float64_unpack_raw(&p, a); 3830 p.frac <<= float64_params.frac_shift; 3831 parts_silence_nan(&p, status); 3832 p.frac >>= float64_params.frac_shift; 3833 return float64_pack_raw(&p); 3834 } 3835 3836 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3837 { 3838 FloatParts64 p; 3839 3840 bfloat16_unpack_raw(&p, a); 3841 p.frac <<= bfloat16_params.frac_shift; 3842 parts_silence_nan(&p, status); 3843 p.frac >>= bfloat16_params.frac_shift; 3844 return bfloat16_pack_raw(&p); 3845 } 3846 3847 float128 float128_silence_nan(float128 a, float_status *status) 3848 { 3849 FloatParts128 p; 3850 3851 float128_unpack_raw(&p, a); 3852 frac_shl(&p, float128_params.frac_shift); 3853 parts_silence_nan(&p, status); 3854 frac_shr(&p, float128_params.frac_shift); 3855 return float128_pack_raw(&p); 3856 } 3857 3858 /*---------------------------------------------------------------------------- 3859 | If `a' is denormal and we are in flush-to-zero mode then set the 3860 | input-denormal exception and return zero. Otherwise just return the value. 
3861 *----------------------------------------------------------------------------*/ 3862 3863 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3864 { 3865 if (p.exp == 0 && p.frac != 0) { 3866 float_raise(float_flag_input_denormal, status); 3867 return true; 3868 } 3869 3870 return false; 3871 } 3872 3873 float16 float16_squash_input_denormal(float16 a, float_status *status) 3874 { 3875 if (status->flush_inputs_to_zero) { 3876 FloatParts64 p; 3877 3878 float16_unpack_raw(&p, a); 3879 if (parts_squash_denormal(p, status)) { 3880 return float16_set_sign(float16_zero, p.sign); 3881 } 3882 } 3883 return a; 3884 } 3885 3886 float32 float32_squash_input_denormal(float32 a, float_status *status) 3887 { 3888 if (status->flush_inputs_to_zero) { 3889 FloatParts64 p; 3890 3891 float32_unpack_raw(&p, a); 3892 if (parts_squash_denormal(p, status)) { 3893 return float32_set_sign(float32_zero, p.sign); 3894 } 3895 } 3896 return a; 3897 } 3898 3899 float64 float64_squash_input_denormal(float64 a, float_status *status) 3900 { 3901 if (status->flush_inputs_to_zero) { 3902 FloatParts64 p; 3903 3904 float64_unpack_raw(&p, a); 3905 if (parts_squash_denormal(p, status)) { 3906 return float64_set_sign(float64_zero, p.sign); 3907 } 3908 } 3909 return a; 3910 } 3911 3912 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3913 { 3914 if (status->flush_inputs_to_zero) { 3915 FloatParts64 p; 3916 3917 bfloat16_unpack_raw(&p, a); 3918 if (parts_squash_denormal(p, status)) { 3919 return bfloat16_set_sign(bfloat16_zero, p.sign); 3920 } 3921 } 3922 return a; 3923 } 3924 3925 /*---------------------------------------------------------------------------- 3926 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3927 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3928 | input. If `zSign' is 1, the input is negated before being converted to an 3929 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3930 | is simply rounded to an integer, with the inexact exception raised if the 3931 | input cannot be represented exactly as an integer. However, if the fixed- 3932 | point input is too large, the invalid exception is raised and the largest 3933 | positive or negative integer is returned. 3934 *----------------------------------------------------------------------------*/ 3935 3936 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3937 float_status *status) 3938 { 3939 int8_t roundingMode; 3940 bool roundNearestEven; 3941 int8_t roundIncrement, roundBits; 3942 int32_t z; 3943 3944 roundingMode = status->float_rounding_mode; 3945 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3946 switch (roundingMode) { 3947 case float_round_nearest_even: 3948 case float_round_ties_away: 3949 roundIncrement = 0x40; 3950 break; 3951 case float_round_to_zero: 3952 roundIncrement = 0; 3953 break; 3954 case float_round_up: 3955 roundIncrement = zSign ? 0 : 0x7f; 3956 break; 3957 case float_round_down: 3958 roundIncrement = zSign ? 0x7f : 0; 3959 break; 3960 case float_round_to_odd: 3961 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3962 break; 3963 default: 3964 abort(); 3965 } 3966 roundBits = absZ & 0x7F; 3967 absZ = ( absZ + roundIncrement )>>7; 3968 if (!(roundBits ^ 0x40) && roundNearestEven) { 3969 absZ &= ~1; 3970 } 3971 z = absZ; 3972 if ( zSign ) z = - z; 3973 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3974 float_raise(float_flag_invalid, status); 3975 return zSign ? 
INT32_MIN : INT32_MAX; 3976 } 3977 if (roundBits) { 3978 float_raise(float_flag_inexact, status); 3979 } 3980 return z; 3981 3982 } 3983 3984 /*---------------------------------------------------------------------------- 3985 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3986 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3987 | and returns the properly rounded 64-bit integer corresponding to the input. 3988 | If `zSign' is 1, the input is negated before being converted to an integer. 3989 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3990 | the inexact exception raised if the input cannot be represented exactly as 3991 | an integer. However, if the fixed-point input is too large, the invalid 3992 | exception is raised and the largest positive or negative integer is 3993 | returned. 3994 *----------------------------------------------------------------------------*/ 3995 3996 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 3997 float_status *status) 3998 { 3999 int8_t roundingMode; 4000 bool roundNearestEven, increment; 4001 int64_t z; 4002 4003 roundingMode = status->float_rounding_mode; 4004 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4005 switch (roundingMode) { 4006 case float_round_nearest_even: 4007 case float_round_ties_away: 4008 increment = ((int64_t) absZ1 < 0); 4009 break; 4010 case float_round_to_zero: 4011 increment = 0; 4012 break; 4013 case float_round_up: 4014 increment = !zSign && absZ1; 4015 break; 4016 case float_round_down: 4017 increment = zSign && absZ1; 4018 break; 4019 case float_round_to_odd: 4020 increment = !(absZ0 & 1) && absZ1; 4021 break; 4022 default: 4023 abort(); 4024 } 4025 if ( increment ) { 4026 ++absZ0; 4027 if ( absZ0 == 0 ) goto overflow; 4028 if (!(absZ1 << 1) && roundNearestEven) { 4029 absZ0 &= ~1; 4030 } 4031 } 4032 z = absZ0; 4033 if ( zSign ) z = - z; 4034 if ( z && ( ( z < 0 ) ^ zSign ) ) { 4035 overflow: 4036 float_raise(float_flag_invalid, status); 4037 return zSign ? INT64_MIN : INT64_MAX; 4038 } 4039 if (absZ1) { 4040 float_raise(float_flag_inexact, status); 4041 } 4042 return z; 4043 4044 } 4045 4046 /*---------------------------------------------------------------------------- 4047 | Normalizes the subnormal single-precision floating-point value represented 4048 | by the denormalized significand `aSig'. The normalized exponent and 4049 | significand are stored at the locations pointed to by `zExpPtr' and 4050 | `zSigPtr', respectively. 4051 *----------------------------------------------------------------------------*/ 4052 4053 static void 4054 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4055 { 4056 int8_t shiftCount; 4057 4058 shiftCount = clz32(aSig) - 8; 4059 *zSigPtr = aSig<<shiftCount; 4060 *zExpPtr = 1 - shiftCount; 4061 4062 } 4063 4064 /*---------------------------------------------------------------------------- 4065 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4066 | and significand `zSig', and returns the proper single-precision floating- 4067 | point value corresponding to the abstract input. Ordinarily, the abstract 4068 | value is simply rounded and packed into the single-precision format, with 4069 | the inexact exception raised if the abstract input cannot be represented 4070 | exactly. 
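| (The significand carries 7 guard bits here, so roundBits = zSig & 0x7F and
| the nearest-even increment 0x40 is exactly half a unit in the last place of
| the packed 23-bit fraction; a tie, roundBits == 0x40, clears the low bit of
| the shifted result to make it even.)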
However, if the abstract value is too large, the overflow and 4071 | inexact exceptions are raised and an infinity or maximal finite value is 4072 | returned. If the abstract value is too small, the input value is rounded to 4073 | a subnormal number, and the underflow and inexact exceptions are raised if 4074 | the abstract input cannot be represented exactly as a subnormal single- 4075 | precision floating-point number. 4076 | The input significand `zSig' has its binary point between bits 30 4077 | and 29, which is 7 bits to the left of the usual location. This shifted 4078 | significand must be normalized or smaller. If `zSig' is not normalized, 4079 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4080 | and it must not require rounding. In the usual case that `zSig' is 4081 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4082 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4083 | Binary Floating-Point Arithmetic. 4084 *----------------------------------------------------------------------------*/ 4085 4086 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4087 float_status *status) 4088 { 4089 int8_t roundingMode; 4090 bool roundNearestEven; 4091 int8_t roundIncrement, roundBits; 4092 bool isTiny; 4093 4094 roundingMode = status->float_rounding_mode; 4095 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4096 switch (roundingMode) { 4097 case float_round_nearest_even: 4098 case float_round_ties_away: 4099 roundIncrement = 0x40; 4100 break; 4101 case float_round_to_zero: 4102 roundIncrement = 0; 4103 break; 4104 case float_round_up: 4105 roundIncrement = zSign ? 0 : 0x7f; 4106 break; 4107 case float_round_down: 4108 roundIncrement = zSign ? 0x7f : 0; 4109 break; 4110 case float_round_to_odd: 4111 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 4112 break; 4113 default: 4114 abort(); 4115 break; 4116 } 4117 roundBits = zSig & 0x7F; 4118 if ( 0xFD <= (uint16_t) zExp ) { 4119 if ( ( 0xFD < zExp ) 4120 || ( ( zExp == 0xFD ) 4121 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 4122 ) { 4123 bool overflow_to_inf = roundingMode != float_round_to_odd && 4124 roundIncrement != 0; 4125 float_raise(float_flag_overflow | float_flag_inexact, status); 4126 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 4127 } 4128 if ( zExp < 0 ) { 4129 if (status->flush_to_zero) { 4130 float_raise(float_flag_output_denormal, status); 4131 return packFloat32(zSign, 0, 0); 4132 } 4133 isTiny = status->tininess_before_rounding 4134 || (zExp < -1) 4135 || (zSig + roundIncrement < 0x80000000); 4136 shift32RightJamming( zSig, - zExp, &zSig ); 4137 zExp = 0; 4138 roundBits = zSig & 0x7F; 4139 if (isTiny && roundBits) { 4140 float_raise(float_flag_underflow, status); 4141 } 4142 if (roundingMode == float_round_to_odd) { 4143 /* 4144 * For round-to-odd case, the roundIncrement depends on 4145 * zSig which just changed. 4146 */ 4147 roundIncrement = zSig & 0x80 ? 
0 : 0x7f; 4148 } 4149 } 4150 } 4151 if (roundBits) { 4152 float_raise(float_flag_inexact, status); 4153 } 4154 zSig = ( zSig + roundIncrement )>>7; 4155 if (!(roundBits ^ 0x40) && roundNearestEven) { 4156 zSig &= ~1; 4157 } 4158 if ( zSig == 0 ) zExp = 0; 4159 return packFloat32( zSign, zExp, zSig ); 4160 4161 } 4162 4163 /*---------------------------------------------------------------------------- 4164 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4165 | and significand `zSig', and returns the proper single-precision floating- 4166 | point value corresponding to the abstract input. This routine is just like 4167 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 4168 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4169 | floating-point exponent. 4170 *----------------------------------------------------------------------------*/ 4171 4172 static float32 4173 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4174 float_status *status) 4175 { 4176 int8_t shiftCount; 4177 4178 shiftCount = clz32(zSig) - 1; 4179 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4180 status); 4181 4182 } 4183 4184 /*---------------------------------------------------------------------------- 4185 | Normalizes the subnormal double-precision floating-point value represented 4186 | by the denormalized significand `aSig'. The normalized exponent and 4187 | significand are stored at the locations pointed to by `zExpPtr' and 4188 | `zSigPtr', respectively. 4189 *----------------------------------------------------------------------------*/ 4190 4191 static void 4192 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4193 { 4194 int8_t shiftCount; 4195 4196 shiftCount = clz64(aSig) - 11; 4197 *zSigPtr = aSig<<shiftCount; 4198 *zExpPtr = 1 - shiftCount; 4199 4200 } 4201 4202 /*---------------------------------------------------------------------------- 4203 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4204 | double-precision floating-point value, returning the result. After being 4205 | shifted into the proper positions, the three fields are simply added 4206 | together to form the result. This means that any integer portion of `zSig' 4207 | will be added into the exponent. Since a properly normalized significand 4208 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4209 | than the desired result exponent whenever `zSig' is a complete, normalized 4210 | significand. 4211 *----------------------------------------------------------------------------*/ 4212 4213 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4214 { 4215 4216 return make_float64( 4217 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4218 4219 } 4220 4221 /*---------------------------------------------------------------------------- 4222 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4223 | and significand `zSig', and returns the proper double-precision floating- 4224 | point value corresponding to the abstract input. Ordinarily, the abstract 4225 | value is simply rounded and packed into the double-precision format, with 4226 | the inexact exception raised if the abstract input cannot be represented 4227 | exactly. However, if the abstract value is too large, the overflow and 4228 | inexact exceptions are raised and an infinity or maximal finite value is 4229 | returned. 
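| (Here 10 guard bits are kept: roundBits = zSig & 0x3FF, and the nearest-even
| increment 0x200 is half a unit in the last place of the packed 52-bit
| fraction.)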
If the abstract value is too small, the input value is rounded to 4230 | a subnormal number, and the underflow and inexact exceptions are raised if 4231 | the abstract input cannot be represented exactly as a subnormal double- 4232 | precision floating-point number. 4233 | The input significand `zSig' has its binary point between bits 62 4234 | and 61, which is 10 bits to the left of the usual location. This shifted 4235 | significand must be normalized or smaller. If `zSig' is not normalized, 4236 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4237 | and it must not require rounding. In the usual case that `zSig' is 4238 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4239 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4240 | Binary Floating-Point Arithmetic. 4241 *----------------------------------------------------------------------------*/ 4242 4243 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4244 float_status *status) 4245 { 4246 int8_t roundingMode; 4247 bool roundNearestEven; 4248 int roundIncrement, roundBits; 4249 bool isTiny; 4250 4251 roundingMode = status->float_rounding_mode; 4252 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4253 switch (roundingMode) { 4254 case float_round_nearest_even: 4255 case float_round_ties_away: 4256 roundIncrement = 0x200; 4257 break; 4258 case float_round_to_zero: 4259 roundIncrement = 0; 4260 break; 4261 case float_round_up: 4262 roundIncrement = zSign ? 0 : 0x3ff; 4263 break; 4264 case float_round_down: 4265 roundIncrement = zSign ? 0x3ff : 0; 4266 break; 4267 case float_round_to_odd: 4268 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4269 break; 4270 default: 4271 abort(); 4272 } 4273 roundBits = zSig & 0x3FF; 4274 if ( 0x7FD <= (uint16_t) zExp ) { 4275 if ( ( 0x7FD < zExp ) 4276 || ( ( zExp == 0x7FD ) 4277 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 4278 ) { 4279 bool overflow_to_inf = roundingMode != float_round_to_odd && 4280 roundIncrement != 0; 4281 float_raise(float_flag_overflow | float_flag_inexact, status); 4282 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 4283 } 4284 if ( zExp < 0 ) { 4285 if (status->flush_to_zero) { 4286 float_raise(float_flag_output_denormal, status); 4287 return packFloat64(zSign, 0, 0); 4288 } 4289 isTiny = status->tininess_before_rounding 4290 || (zExp < -1) 4291 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 4292 shift64RightJamming( zSig, - zExp, &zSig ); 4293 zExp = 0; 4294 roundBits = zSig & 0x3FF; 4295 if (isTiny && roundBits) { 4296 float_raise(float_flag_underflow, status); 4297 } 4298 if (roundingMode == float_round_to_odd) { 4299 /* 4300 * For round-to-odd case, the roundIncrement depends on 4301 * zSig which just changed. 4302 */ 4303 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4304 } 4305 } 4306 } 4307 if (roundBits) { 4308 float_raise(float_flag_inexact, status); 4309 } 4310 zSig = ( zSig + roundIncrement )>>10; 4311 if (!(roundBits ^ 0x200) && roundNearestEven) { 4312 zSig &= ~1; 4313 } 4314 if ( zSig == 0 ) zExp = 0; 4315 return packFloat64( zSign, zExp, zSig ); 4316 4317 } 4318 4319 /*---------------------------------------------------------------------------- 4320 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4321 | and significand `zSig', and returns the proper double-precision floating- 4322 | point value corresponding to the abstract input. 
This routine is just like 4323 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 4324 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4325 | floating-point exponent. 4326 *----------------------------------------------------------------------------*/ 4327 4328 static float64 4329 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4330 float_status *status) 4331 { 4332 int8_t shiftCount; 4333 4334 shiftCount = clz64(zSig) - 1; 4335 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 4336 status); 4337 4338 } 4339 4340 /*---------------------------------------------------------------------------- 4341 | Normalizes the subnormal extended double-precision floating-point value 4342 | represented by the denormalized significand `aSig'. The normalized exponent 4343 | and significand are stored at the locations pointed to by `zExpPtr' and 4344 | `zSigPtr', respectively. 4345 *----------------------------------------------------------------------------*/ 4346 4347 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4348 uint64_t *zSigPtr) 4349 { 4350 int8_t shiftCount; 4351 4352 shiftCount = clz64(aSig); 4353 *zSigPtr = aSig<<shiftCount; 4354 *zExpPtr = 1 - shiftCount; 4355 } 4356 4357 /*---------------------------------------------------------------------------- 4358 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4359 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4360 | and returns the proper extended double-precision floating-point value 4361 | corresponding to the abstract input. Ordinarily, the abstract value is 4362 | rounded and packed into the extended double-precision format, with the 4363 | inexact exception raised if the abstract input cannot be represented 4364 | exactly. However, if the abstract value is too large, the overflow and 4365 | inexact exceptions are raised and an infinity or maximal finite value is 4366 | returned. If the abstract value is too small, the input value is rounded to 4367 | a subnormal number, and the underflow and inexact exceptions are raised if 4368 | the abstract input cannot be represented exactly as a subnormal extended 4369 | double-precision floating-point number. 4370 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4371 | number of bits as single or double precision, respectively. Otherwise, the 4372 | result is rounded to the full precision of the extended double-precision 4373 | format. 4374 | The input significand must be normalized or smaller. If the input 4375 | significand is not normalized, `zExp' must be 0; in that case, the result 4376 | returned is a subnormal number, and it must not require rounding. The 4377 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4378 | Floating-Point Arithmetic. 
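|     As a concrete reading of the reduced-precision paths below: for
| `roundingPrecision' == 64 the mask of discarded significand bits is 0x7FF
| and the nearest-even increment is 0x400, while for `roundingPrecision' == 32
| they are 0x000000FFFFFFFFFF and 0x0000008000000000 respectively -- in both
| cases half a unit in the last place of the narrower fraction.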
4379 *----------------------------------------------------------------------------*/ 4380 4381 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 4382 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 4383 float_status *status) 4384 { 4385 int8_t roundingMode; 4386 bool roundNearestEven, increment, isTiny; 4387 int64_t roundIncrement, roundMask, roundBits; 4388 4389 roundingMode = status->float_rounding_mode; 4390 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4391 if ( roundingPrecision == 80 ) goto precision80; 4392 if ( roundingPrecision == 64 ) { 4393 roundIncrement = UINT64_C(0x0000000000000400); 4394 roundMask = UINT64_C(0x00000000000007FF); 4395 } 4396 else if ( roundingPrecision == 32 ) { 4397 roundIncrement = UINT64_C(0x0000008000000000); 4398 roundMask = UINT64_C(0x000000FFFFFFFFFF); 4399 } 4400 else { 4401 goto precision80; 4402 } 4403 zSig0 |= ( zSig1 != 0 ); 4404 switch (roundingMode) { 4405 case float_round_nearest_even: 4406 case float_round_ties_away: 4407 break; 4408 case float_round_to_zero: 4409 roundIncrement = 0; 4410 break; 4411 case float_round_up: 4412 roundIncrement = zSign ? 0 : roundMask; 4413 break; 4414 case float_round_down: 4415 roundIncrement = zSign ? roundMask : 0; 4416 break; 4417 default: 4418 abort(); 4419 } 4420 roundBits = zSig0 & roundMask; 4421 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4422 if ( ( 0x7FFE < zExp ) 4423 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 4424 ) { 4425 goto overflow; 4426 } 4427 if ( zExp <= 0 ) { 4428 if (status->flush_to_zero) { 4429 float_raise(float_flag_output_denormal, status); 4430 return packFloatx80(zSign, 0, 0); 4431 } 4432 isTiny = status->tininess_before_rounding 4433 || (zExp < 0 ) 4434 || (zSig0 <= zSig0 + roundIncrement); 4435 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 4436 zExp = 0; 4437 roundBits = zSig0 & roundMask; 4438 if (isTiny && roundBits) { 4439 float_raise(float_flag_underflow, status); 4440 } 4441 if (roundBits) { 4442 float_raise(float_flag_inexact, status); 4443 } 4444 zSig0 += roundIncrement; 4445 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4446 roundIncrement = roundMask + 1; 4447 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4448 roundMask |= roundIncrement; 4449 } 4450 zSig0 &= ~ roundMask; 4451 return packFloatx80( zSign, zExp, zSig0 ); 4452 } 4453 } 4454 if (roundBits) { 4455 float_raise(float_flag_inexact, status); 4456 } 4457 zSig0 += roundIncrement; 4458 if ( zSig0 < roundIncrement ) { 4459 ++zExp; 4460 zSig0 = UINT64_C(0x8000000000000000); 4461 } 4462 roundIncrement = roundMask + 1; 4463 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4464 roundMask |= roundIncrement; 4465 } 4466 zSig0 &= ~ roundMask; 4467 if ( zSig0 == 0 ) zExp = 0; 4468 return packFloatx80( zSign, zExp, zSig0 ); 4469 precision80: 4470 switch (roundingMode) { 4471 case float_round_nearest_even: 4472 case float_round_ties_away: 4473 increment = ((int64_t)zSig1 < 0); 4474 break; 4475 case float_round_to_zero: 4476 increment = 0; 4477 break; 4478 case float_round_up: 4479 increment = !zSign && zSig1; 4480 break; 4481 case float_round_down: 4482 increment = zSign && zSig1; 4483 break; 4484 default: 4485 abort(); 4486 } 4487 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4488 if ( ( 0x7FFE < zExp ) 4489 || ( ( zExp == 0x7FFE ) 4490 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 4491 && increment 4492 ) 4493 ) { 4494 roundMask = 0; 4495 overflow: 4496 float_raise(float_flag_overflow | float_flag_inexact, status); 4497 if ( ( roundingMode == 
float_round_to_zero ) 4498 || ( zSign && ( roundingMode == float_round_up ) ) 4499 || ( ! zSign && ( roundingMode == float_round_down ) ) 4500 ) { 4501 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 4502 } 4503 return packFloatx80(zSign, 4504 floatx80_infinity_high, 4505 floatx80_infinity_low); 4506 } 4507 if ( zExp <= 0 ) { 4508 isTiny = status->tininess_before_rounding 4509 || (zExp < 0) 4510 || !increment 4511 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 4512 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 4513 zExp = 0; 4514 if (isTiny && zSig1) { 4515 float_raise(float_flag_underflow, status); 4516 } 4517 if (zSig1) { 4518 float_raise(float_flag_inexact, status); 4519 } 4520 switch (roundingMode) { 4521 case float_round_nearest_even: 4522 case float_round_ties_away: 4523 increment = ((int64_t)zSig1 < 0); 4524 break; 4525 case float_round_to_zero: 4526 increment = 0; 4527 break; 4528 case float_round_up: 4529 increment = !zSign && zSig1; 4530 break; 4531 case float_round_down: 4532 increment = zSign && zSig1; 4533 break; 4534 default: 4535 abort(); 4536 } 4537 if ( increment ) { 4538 ++zSig0; 4539 if (!(zSig1 << 1) && roundNearestEven) { 4540 zSig0 &= ~1; 4541 } 4542 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4543 } 4544 return packFloatx80( zSign, zExp, zSig0 ); 4545 } 4546 } 4547 if (zSig1) { 4548 float_raise(float_flag_inexact, status); 4549 } 4550 if ( increment ) { 4551 ++zSig0; 4552 if ( zSig0 == 0 ) { 4553 ++zExp; 4554 zSig0 = UINT64_C(0x8000000000000000); 4555 } 4556 else { 4557 if (!(zSig1 << 1) && roundNearestEven) { 4558 zSig0 &= ~1; 4559 } 4560 } 4561 } 4562 else { 4563 if ( zSig0 == 0 ) zExp = 0; 4564 } 4565 return packFloatx80( zSign, zExp, zSig0 ); 4566 4567 } 4568 4569 /*---------------------------------------------------------------------------- 4570 | Takes an abstract floating-point value having sign `zSign', exponent 4571 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4572 | and returns the proper extended double-precision floating-point value 4573 | corresponding to the abstract input. This routine is just like 4574 | `roundAndPackFloatx80' except that the input significand does not have to be 4575 | normalized. 4576 *----------------------------------------------------------------------------*/ 4577 4578 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4579 bool zSign, int32_t zExp, 4580 uint64_t zSig0, uint64_t zSig1, 4581 float_status *status) 4582 { 4583 int8_t shiftCount; 4584 4585 if ( zSig0 == 0 ) { 4586 zSig0 = zSig1; 4587 zSig1 = 0; 4588 zExp -= 64; 4589 } 4590 shiftCount = clz64(zSig0); 4591 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4592 zExp -= shiftCount; 4593 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4594 zSig0, zSig1, status); 4595 4596 } 4597 4598 /*---------------------------------------------------------------------------- 4599 | Returns the least-significant 64 fraction bits of the quadruple-precision 4600 | floating-point value `a'. 4601 *----------------------------------------------------------------------------*/ 4602 4603 static inline uint64_t extractFloat128Frac1( float128 a ) 4604 { 4605 4606 return a.low; 4607 4608 } 4609 4610 /*---------------------------------------------------------------------------- 4611 | Returns the most-significant 48 fraction bits of the quadruple-precision 4612 | floating-point value `a'. 
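| (Together with the 64 bits of `a.low' returned by `extractFloat128Frac1'
| above, these 48 bits make up the full 112-bit quadruple-precision fraction.)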
4613 *----------------------------------------------------------------------------*/ 4614 4615 static inline uint64_t extractFloat128Frac0( float128 a ) 4616 { 4617 4618 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4619 4620 } 4621 4622 /*---------------------------------------------------------------------------- 4623 | Returns the exponent bits of the quadruple-precision floating-point value 4624 | `a'. 4625 *----------------------------------------------------------------------------*/ 4626 4627 static inline int32_t extractFloat128Exp( float128 a ) 4628 { 4629 4630 return ( a.high>>48 ) & 0x7FFF; 4631 4632 } 4633 4634 /*---------------------------------------------------------------------------- 4635 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4636 *----------------------------------------------------------------------------*/ 4637 4638 static inline bool extractFloat128Sign(float128 a) 4639 { 4640 return a.high >> 63; 4641 } 4642 4643 /*---------------------------------------------------------------------------- 4644 | Normalizes the subnormal quadruple-precision floating-point value 4645 | represented by the denormalized significand formed by the concatenation of 4646 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4647 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4648 | significand are stored at the location pointed to by `zSig0Ptr', and the 4649 | least significant 64 bits of the normalized significand are stored at the 4650 | location pointed to by `zSig1Ptr'. 4651 *----------------------------------------------------------------------------*/ 4652 4653 static void 4654 normalizeFloat128Subnormal( 4655 uint64_t aSig0, 4656 uint64_t aSig1, 4657 int32_t *zExpPtr, 4658 uint64_t *zSig0Ptr, 4659 uint64_t *zSig1Ptr 4660 ) 4661 { 4662 int8_t shiftCount; 4663 4664 if ( aSig0 == 0 ) { 4665 shiftCount = clz64(aSig1) - 15; 4666 if ( shiftCount < 0 ) { 4667 *zSig0Ptr = aSig1>>( - shiftCount ); 4668 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4669 } 4670 else { 4671 *zSig0Ptr = aSig1<<shiftCount; 4672 *zSig1Ptr = 0; 4673 } 4674 *zExpPtr = - shiftCount - 63; 4675 } 4676 else { 4677 shiftCount = clz64(aSig0) - 15; 4678 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4679 *zExpPtr = 1 - shiftCount; 4680 } 4681 4682 } 4683 4684 /*---------------------------------------------------------------------------- 4685 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4686 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4687 | floating-point value, returning the result. After being shifted into the 4688 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4689 | added together to form the most significant 32 bits of the result. This 4690 | means that any integer portion of `zSig0' will be added into the exponent. 4691 | Since a properly normalized significand will have an integer portion equal 4692 | to 1, the `zExp' input should be 1 less than the desired result exponent 4693 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4694 | significand. 
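|     A hand-worked example: packFloat128(0, 0x3FFE,
| UINT64_C(0x0001000000000000), 0) yields high = (0x3FFE<<48) +
| 0x0001000000000000 = 0x3FFF000000000000 and low = 0, which is the
| quadruple-precision encoding of +1.0 -- the integer bit of `zSig0' has
| carried into the exponent field as described above.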
4695 *----------------------------------------------------------------------------*/ 4696 4697 static inline float128 4698 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4699 { 4700 float128 z; 4701 4702 z.low = zSig1; 4703 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4704 return z; 4705 } 4706 4707 /*---------------------------------------------------------------------------- 4708 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4709 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4710 | and `zSig2', and returns the proper quadruple-precision floating-point value 4711 | corresponding to the abstract input. Ordinarily, the abstract value is 4712 | simply rounded and packed into the quadruple-precision format, with the 4713 | inexact exception raised if the abstract input cannot be represented 4714 | exactly. However, if the abstract value is too large, the overflow and 4715 | inexact exceptions are raised and an infinity or maximal finite value is 4716 | returned. If the abstract value is too small, the input value is rounded to 4717 | a subnormal number, and the underflow and inexact exceptions are raised if 4718 | the abstract input cannot be represented exactly as a subnormal quadruple- 4719 | precision floating-point number. 4720 | The input significand must be normalized or smaller. If the input 4721 | significand is not normalized, `zExp' must be 0; in that case, the result 4722 | returned is a subnormal number, and it must not require rounding. In the 4723 | usual case that the input significand is normalized, `zExp' must be 1 less 4724 | than the ``true'' floating-point exponent. The handling of underflow and 4725 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4726 *----------------------------------------------------------------------------*/ 4727 4728 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4729 uint64_t zSig0, uint64_t zSig1, 4730 uint64_t zSig2, float_status *status) 4731 { 4732 int8_t roundingMode; 4733 bool roundNearestEven, increment, isTiny; 4734 4735 roundingMode = status->float_rounding_mode; 4736 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4737 switch (roundingMode) { 4738 case float_round_nearest_even: 4739 case float_round_ties_away: 4740 increment = ((int64_t)zSig2 < 0); 4741 break; 4742 case float_round_to_zero: 4743 increment = 0; 4744 break; 4745 case float_round_up: 4746 increment = !zSign && zSig2; 4747 break; 4748 case float_round_down: 4749 increment = zSign && zSig2; 4750 break; 4751 case float_round_to_odd: 4752 increment = !(zSig1 & 0x1) && zSig2; 4753 break; 4754 default: 4755 abort(); 4756 } 4757 if ( 0x7FFD <= (uint32_t) zExp ) { 4758 if ( ( 0x7FFD < zExp ) 4759 || ( ( zExp == 0x7FFD ) 4760 && eq128( 4761 UINT64_C(0x0001FFFFFFFFFFFF), 4762 UINT64_C(0xFFFFFFFFFFFFFFFF), 4763 zSig0, 4764 zSig1 4765 ) 4766 && increment 4767 ) 4768 ) { 4769 float_raise(float_flag_overflow | float_flag_inexact, status); 4770 if ( ( roundingMode == float_round_to_zero ) 4771 || ( zSign && ( roundingMode == float_round_up ) ) 4772 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4773 || (roundingMode == float_round_to_odd) 4774 ) { 4775 return 4776 packFloat128( 4777 zSign, 4778 0x7FFE, 4779 UINT64_C(0x0000FFFFFFFFFFFF), 4780 UINT64_C(0xFFFFFFFFFFFFFFFF) 4781 ); 4782 } 4783 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4784 } 4785 if ( zExp < 0 ) { 4786 if (status->flush_to_zero) { 4787 float_raise(float_flag_output_denormal, status); 4788 return packFloat128(zSign, 0, 0, 0); 4789 } 4790 isTiny = status->tininess_before_rounding 4791 || (zExp < -1) 4792 || !increment 4793 || lt128(zSig0, zSig1, 4794 UINT64_C(0x0001FFFFFFFFFFFF), 4795 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4796 shift128ExtraRightJamming( 4797 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4798 zExp = 0; 4799 if (isTiny && zSig2) { 4800 float_raise(float_flag_underflow, status); 4801 } 4802 switch (roundingMode) { 4803 case float_round_nearest_even: 4804 case float_round_ties_away: 4805 increment = ((int64_t)zSig2 < 0); 4806 break; 4807 case float_round_to_zero: 4808 increment = 0; 4809 break; 4810 case float_round_up: 4811 increment = !zSign && zSig2; 4812 break; 4813 case float_round_down: 4814 increment = zSign && zSig2; 4815 break; 4816 case float_round_to_odd: 4817 increment = !(zSig1 & 0x1) && zSig2; 4818 break; 4819 default: 4820 abort(); 4821 } 4822 } 4823 } 4824 if (zSig2) { 4825 float_raise(float_flag_inexact, status); 4826 } 4827 if ( increment ) { 4828 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4829 if ((zSig2 + zSig2 == 0) && roundNearestEven) { 4830 zSig1 &= ~1; 4831 } 4832 } 4833 else { 4834 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4835 } 4836 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4837 4838 } 4839 4840 /*---------------------------------------------------------------------------- 4841 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4842 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4843 | returns the proper quadruple-precision floating-point value corresponding 4844 | to the abstract input. This routine is just like `roundAndPackFloat128' 4845 | except that the input significand has fewer bits and does not have to be 4846 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4847 | point exponent. 4848 *----------------------------------------------------------------------------*/ 4849 4850 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4851 uint64_t zSig0, uint64_t zSig1, 4852 float_status *status) 4853 { 4854 int8_t shiftCount; 4855 uint64_t zSig2; 4856 4857 if ( zSig0 == 0 ) { 4858 zSig0 = zSig1; 4859 zSig1 = 0; 4860 zExp -= 64; 4861 } 4862 shiftCount = clz64(zSig0) - 15; 4863 if ( 0 <= shiftCount ) { 4864 zSig2 = 0; 4865 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4866 } 4867 else { 4868 shift128ExtraRightJamming( 4869 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4870 } 4871 zExp -= shiftCount; 4872 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4873 4874 } 4875 4876 4877 /*---------------------------------------------------------------------------- 4878 | Returns the result of converting the 32-bit two's complement integer `a' 4879 | to the extended double-precision floating-point format. The conversion 4880 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4881 | Arithmetic. 
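|     A hand-worked example of the packing below: for a == 1, absA == 1 and
| shiftCount == clz32(1) + 32 == 63, so the result is
| packFloatx80(0, 0x403E - 63, (uint64_t) 1 << 63), i.e. sign 0, exponent
| 0x3FFF (the bias, hence a value of 2**0) and significand
| 0x8000000000000000, whose top bit is the explicit integer bit of +1.0.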
4882 *----------------------------------------------------------------------------*/ 4883 4884 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4885 { 4886 bool zSign; 4887 uint32_t absA; 4888 int8_t shiftCount; 4889 uint64_t zSig; 4890 4891 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4892 zSign = ( a < 0 ); 4893 absA = zSign ? - a : a; 4894 shiftCount = clz32(absA) + 32; 4895 zSig = absA; 4896 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4897 4898 } 4899 4900 /*---------------------------------------------------------------------------- 4901 | Returns the result of converting the 64-bit two's complement integer `a' 4902 | to the extended double-precision floating-point format. The conversion 4903 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4904 | Arithmetic. 4905 *----------------------------------------------------------------------------*/ 4906 4907 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4908 { 4909 bool zSign; 4910 uint64_t absA; 4911 int8_t shiftCount; 4912 4913 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4914 zSign = ( a < 0 ); 4915 absA = zSign ? - a : a; 4916 shiftCount = clz64(absA); 4917 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4918 4919 } 4920 4921 /*---------------------------------------------------------------------------- 4922 | Returns the result of converting the single-precision floating-point value 4923 | `a' to the extended double-precision floating-point format. The conversion 4924 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4925 | Arithmetic. 4926 *----------------------------------------------------------------------------*/ 4927 4928 floatx80 float32_to_floatx80(float32 a, float_status *status) 4929 { 4930 bool aSign; 4931 int aExp; 4932 uint32_t aSig; 4933 4934 a = float32_squash_input_denormal(a, status); 4935 aSig = extractFloat32Frac( a ); 4936 aExp = extractFloat32Exp( a ); 4937 aSign = extractFloat32Sign( a ); 4938 if ( aExp == 0xFF ) { 4939 if (aSig) { 4940 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 4941 status); 4942 return floatx80_silence_nan(res, status); 4943 } 4944 return packFloatx80(aSign, 4945 floatx80_infinity_high, 4946 floatx80_infinity_low); 4947 } 4948 if ( aExp == 0 ) { 4949 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4950 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 4951 } 4952 aSig |= 0x00800000; 4953 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 4954 4955 } 4956 4957 /*---------------------------------------------------------------------------- 4958 | Returns the remainder of the single-precision floating-point value `a' 4959 | with respect to the corresponding value `b'. The operation is performed 4960 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
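|     Note that this is the IEEE remainder, in which the quotient is rounded
| to the nearest integer (ties to even), not the truncated quotient of fmod():
| for example, the remainder of 6.0 with respect to 4.0 is -2.0, since 6/4
| rounds to 2 and 6 - 2*4 = -2, whereas fmod(6.0, 4.0) is 2.0.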
 *----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1.  2**x = e**(x * ln(2))
|
| 2.  e**x = 1 + x/1! + x**2/2! + x**3/3! + x**4/4! + x**5/5!
|              + ... + x**n/n! + ...
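|
|     The implementation below truncates the series after the x**15/15! term,
| using the fifteen tabulated float64 coefficients 1/1! .. 1/15!; the sum is
| evaluated in double precision and converted back to single precision at the
| end.  The inexact flag is raised unconditionally for finite nonzero inputs.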
5077 *----------------------------------------------------------------------------*/ 5078 5079 static const float64 float32_exp2_coefficients[15] = 5080 { 5081 const_float64( 0x3ff0000000000000ll ), /* 1 */ 5082 const_float64( 0x3fe0000000000000ll ), /* 2 */ 5083 const_float64( 0x3fc5555555555555ll ), /* 3 */ 5084 const_float64( 0x3fa5555555555555ll ), /* 4 */ 5085 const_float64( 0x3f81111111111111ll ), /* 5 */ 5086 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 5087 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 5088 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 5089 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 5090 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 5091 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 5092 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 5093 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 5094 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 5095 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 5096 }; 5097 5098 float32 float32_exp2(float32 a, float_status *status) 5099 { 5100 bool aSign; 5101 int aExp; 5102 uint32_t aSig; 5103 float64 r, x, xn; 5104 int i; 5105 a = float32_squash_input_denormal(a, status); 5106 5107 aSig = extractFloat32Frac( a ); 5108 aExp = extractFloat32Exp( a ); 5109 aSign = extractFloat32Sign( a ); 5110 5111 if ( aExp == 0xFF) { 5112 if (aSig) { 5113 return propagateFloat32NaN(a, float32_zero, status); 5114 } 5115 return (aSign) ? float32_zero : a; 5116 } 5117 if (aExp == 0) { 5118 if (aSig == 0) return float32_one; 5119 } 5120 5121 float_raise(float_flag_inexact, status); 5122 5123 /* ******************************* */ 5124 /* using float64 for approximation */ 5125 /* ******************************* */ 5126 x = float32_to_float64(a, status); 5127 x = float64_mul(x, float64_ln2, status); 5128 5129 xn = x; 5130 r = float64_one; 5131 for (i = 0 ; i < 15 ; i++) { 5132 float64 f; 5133 5134 f = float64_mul(xn, float32_exp2_coefficients[i], status); 5135 r = float64_add(r, f, status); 5136 5137 xn = float64_mul(xn, x, status); 5138 } 5139 5140 return float64_to_float32(r, status); 5141 } 5142 5143 /*---------------------------------------------------------------------------- 5144 | Returns the binary log of the single-precision floating-point value `a'. 5145 | The operation is performed according to the IEC/IEEE Standard for Binary 5146 | Floating-Point Arithmetic. 
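|
|     The fraction bits of the result are developed one at a time by repeated
| squaring: with the significand m scaled so that 1 <= m < 2, log2(m) lies in
| [0, 1); squaring m doubles its log2, so whether m*m >= 2 gives the next
| fraction bit.  The same idea on host doubles (an illustrative sketch with a
| hypothetical helper name, not part of the build):
|
|     double log2_frac(double m)       /* requires 1.0 <= m < 2.0 */
|     {
|         double r = 0.0, bit = 0.5;
|         int i;
|
|         for (i = 0; i < 23; i++) {
|             m *= m;                  /* doubles log2(m)          */
|             if (m >= 2.0) {          /* next fraction bit is 1   */
|                 r += bit;
|                 m *= 0.5;
|             }
|             bit *= 0.5;
|         }
|         return r;                    /* ~ log2 of the original m */
|     }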
5147 *----------------------------------------------------------------------------*/ 5148 float32 float32_log2(float32 a, float_status *status) 5149 { 5150 bool aSign, zSign; 5151 int aExp; 5152 uint32_t aSig, zSig, i; 5153 5154 a = float32_squash_input_denormal(a, status); 5155 aSig = extractFloat32Frac( a ); 5156 aExp = extractFloat32Exp( a ); 5157 aSign = extractFloat32Sign( a ); 5158 5159 if ( aExp == 0 ) { 5160 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 5161 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5162 } 5163 if ( aSign ) { 5164 float_raise(float_flag_invalid, status); 5165 return float32_default_nan(status); 5166 } 5167 if ( aExp == 0xFF ) { 5168 if (aSig) { 5169 return propagateFloat32NaN(a, float32_zero, status); 5170 } 5171 return a; 5172 } 5173 5174 aExp -= 0x7F; 5175 aSig |= 0x00800000; 5176 zSign = aExp < 0; 5177 zSig = aExp << 23; 5178 5179 for (i = 1 << 22; i > 0; i >>= 1) { 5180 aSig = ( (uint64_t)aSig * aSig ) >> 23; 5181 if ( aSig & 0x01000000 ) { 5182 aSig >>= 1; 5183 zSig |= i; 5184 } 5185 } 5186 5187 if ( zSign ) 5188 zSig = -zSig; 5189 5190 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 5191 } 5192 5193 /*---------------------------------------------------------------------------- 5194 | Returns the result of converting the double-precision floating-point value 5195 | `a' to the extended double-precision floating-point format. The conversion 5196 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5197 | Arithmetic. 5198 *----------------------------------------------------------------------------*/ 5199 5200 floatx80 float64_to_floatx80(float64 a, float_status *status) 5201 { 5202 bool aSign; 5203 int aExp; 5204 uint64_t aSig; 5205 5206 a = float64_squash_input_denormal(a, status); 5207 aSig = extractFloat64Frac( a ); 5208 aExp = extractFloat64Exp( a ); 5209 aSign = extractFloat64Sign( a ); 5210 if ( aExp == 0x7FF ) { 5211 if (aSig) { 5212 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 5213 status); 5214 return floatx80_silence_nan(res, status); 5215 } 5216 return packFloatx80(aSign, 5217 floatx80_infinity_high, 5218 floatx80_infinity_low); 5219 } 5220 if ( aExp == 0 ) { 5221 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5222 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5223 } 5224 return 5225 packFloatx80( 5226 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5227 5228 } 5229 5230 /*---------------------------------------------------------------------------- 5231 | Returns the remainder of the double-precision floating-point value `a' 5232 | with respect to the corresponding value `b'. The operation is performed 5233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5234 *----------------------------------------------------------------------------*/ 5235 5236 float64 float64_rem(float64 a, float64 b, float_status *status) 5237 { 5238 bool aSign, zSign; 5239 int aExp, bExp, expDiff; 5240 uint64_t aSig, bSig; 5241 uint64_t q, alternateASig; 5242 int64_t sigMean; 5243 5244 a = float64_squash_input_denormal(a, status); 5245 b = float64_squash_input_denormal(b, status); 5246 aSig = extractFloat64Frac( a ); 5247 aExp = extractFloat64Exp( a ); 5248 aSign = extractFloat64Sign( a ); 5249 bSig = extractFloat64Frac( b ); 5250 bExp = extractFloat64Exp( b ); 5251 if ( aExp == 0x7FF ) { 5252 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5253 return propagateFloat64NaN(a, b, status); 5254 } 5255 float_raise(float_flag_invalid, status); 5256 return float64_default_nan(status); 5257 } 5258 if ( bExp == 0x7FF ) { 5259 if (bSig) { 5260 return propagateFloat64NaN(a, b, status); 5261 } 5262 return a; 5263 } 5264 if ( bExp == 0 ) { 5265 if ( bSig == 0 ) { 5266 float_raise(float_flag_invalid, status); 5267 return float64_default_nan(status); 5268 } 5269 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5270 } 5271 if ( aExp == 0 ) { 5272 if ( aSig == 0 ) return a; 5273 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5274 } 5275 expDiff = aExp - bExp; 5276 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5277 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5278 if ( expDiff < 0 ) { 5279 if ( expDiff < -1 ) return a; 5280 aSig >>= 1; 5281 } 5282 q = ( bSig <= aSig ); 5283 if ( q ) aSig -= bSig; 5284 expDiff -= 64; 5285 while ( 0 < expDiff ) { 5286 q = estimateDiv128To64( aSig, 0, bSig ); 5287 q = ( 2 < q ) ? q - 2 : 0; 5288 aSig = - ( ( bSig>>2 ) * q ); 5289 expDiff -= 62; 5290 } 5291 expDiff += 64; 5292 if ( 0 < expDiff ) { 5293 q = estimateDiv128To64( aSig, 0, bSig ); 5294 q = ( 2 < q ) ? q - 2 : 0; 5295 q >>= 64 - expDiff; 5296 bSig >>= 2; 5297 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5298 } 5299 else { 5300 aSig >>= 2; 5301 bSig >>= 2; 5302 } 5303 do { 5304 alternateASig = aSig; 5305 ++q; 5306 aSig -= bSig; 5307 } while ( 0 <= (int64_t) aSig ); 5308 sigMean = aSig + alternateASig; 5309 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5310 aSig = alternateASig; 5311 } 5312 zSign = ( (int64_t) aSig < 0 ); 5313 if ( zSign ) aSig = - aSig; 5314 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5315 5316 } 5317 5318 /*---------------------------------------------------------------------------- 5319 | Returns the binary log of the double-precision floating-point value `a'. 5320 | The operation is performed according to the IEC/IEEE Standard for Binary 5321 | Floating-Point Arithmetic. 
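| (This uses the same digit-recurrence scheme as float32_log2 above, here
| developing 52 fraction bits with mul64To128() supplying the 128-bit
| squares.)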
5322 *----------------------------------------------------------------------------*/ 5323 float64 float64_log2(float64 a, float_status *status) 5324 { 5325 bool aSign, zSign; 5326 int aExp; 5327 uint64_t aSig, aSig0, aSig1, zSig, i; 5328 a = float64_squash_input_denormal(a, status); 5329 5330 aSig = extractFloat64Frac( a ); 5331 aExp = extractFloat64Exp( a ); 5332 aSign = extractFloat64Sign( a ); 5333 5334 if ( aExp == 0 ) { 5335 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5336 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5337 } 5338 if ( aSign ) { 5339 float_raise(float_flag_invalid, status); 5340 return float64_default_nan(status); 5341 } 5342 if ( aExp == 0x7FF ) { 5343 if (aSig) { 5344 return propagateFloat64NaN(a, float64_zero, status); 5345 } 5346 return a; 5347 } 5348 5349 aExp -= 0x3FF; 5350 aSig |= UINT64_C(0x0010000000000000); 5351 zSign = aExp < 0; 5352 zSig = (uint64_t)aExp << 52; 5353 for (i = 1LL << 51; i > 0; i >>= 1) { 5354 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5355 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5356 if ( aSig & UINT64_C(0x0020000000000000) ) { 5357 aSig >>= 1; 5358 zSig |= i; 5359 } 5360 } 5361 5362 if ( zSign ) 5363 zSig = -zSig; 5364 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5365 } 5366 5367 /*---------------------------------------------------------------------------- 5368 | Returns the result of converting the extended double-precision floating- 5369 | point value `a' to the 32-bit two's complement integer format. The 5370 | conversion is performed according to the IEC/IEEE Standard for Binary 5371 | Floating-Point Arithmetic---which means in particular that the conversion 5372 | is rounded according to the current rounding mode. If `a' is a NaN, the 5373 | largest positive integer is returned. Otherwise, if the conversion 5374 | overflows, the largest integer with the same sign as `a' is returned. 5375 *----------------------------------------------------------------------------*/ 5376 5377 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5378 { 5379 bool aSign; 5380 int32_t aExp, shiftCount; 5381 uint64_t aSig; 5382 5383 if (floatx80_invalid_encoding(a)) { 5384 float_raise(float_flag_invalid, status); 5385 return 1 << 31; 5386 } 5387 aSig = extractFloatx80Frac( a ); 5388 aExp = extractFloatx80Exp( a ); 5389 aSign = extractFloatx80Sign( a ); 5390 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5391 shiftCount = 0x4037 - aExp; 5392 if ( shiftCount <= 0 ) shiftCount = 1; 5393 shift64RightJamming( aSig, shiftCount, &aSig ); 5394 return roundAndPackInt32(aSign, aSig, status); 5395 5396 } 5397 5398 /*---------------------------------------------------------------------------- 5399 | Returns the result of converting the extended double-precision floating- 5400 | point value `a' to the 32-bit two's complement integer format. The 5401 | conversion is performed according to the IEC/IEEE Standard for Binary 5402 | Floating-Point Arithmetic, except that the conversion is always rounded 5403 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5404 | Otherwise, if the conversion overflows, the largest integer with the same 5405 | sign as `a' is returned. 
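| (So, for example, 2.75 converts to 2 and -2.75 to -2, raising only the
| inexact flag.)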
5406 *----------------------------------------------------------------------------*/ 5407 5408 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5409 { 5410 bool aSign; 5411 int32_t aExp, shiftCount; 5412 uint64_t aSig, savedASig; 5413 int32_t z; 5414 5415 if (floatx80_invalid_encoding(a)) { 5416 float_raise(float_flag_invalid, status); 5417 return 1 << 31; 5418 } 5419 aSig = extractFloatx80Frac( a ); 5420 aExp = extractFloatx80Exp( a ); 5421 aSign = extractFloatx80Sign( a ); 5422 if ( 0x401E < aExp ) { 5423 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5424 goto invalid; 5425 } 5426 else if ( aExp < 0x3FFF ) { 5427 if (aExp || aSig) { 5428 float_raise(float_flag_inexact, status); 5429 } 5430 return 0; 5431 } 5432 shiftCount = 0x403E - aExp; 5433 savedASig = aSig; 5434 aSig >>= shiftCount; 5435 z = aSig; 5436 if ( aSign ) z = - z; 5437 if ( ( z < 0 ) ^ aSign ) { 5438 invalid: 5439 float_raise(float_flag_invalid, status); 5440 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5441 } 5442 if ( ( aSig<<shiftCount ) != savedASig ) { 5443 float_raise(float_flag_inexact, status); 5444 } 5445 return z; 5446 5447 } 5448 5449 /*---------------------------------------------------------------------------- 5450 | Returns the result of converting the extended double-precision floating- 5451 | point value `a' to the 64-bit two's complement integer format. The 5452 | conversion is performed according to the IEC/IEEE Standard for Binary 5453 | Floating-Point Arithmetic---which means in particular that the conversion 5454 | is rounded according to the current rounding mode. If `a' is a NaN, 5455 | the largest positive integer is returned. Otherwise, if the conversion 5456 | overflows, the largest integer with the same sign as `a' is returned. 5457 *----------------------------------------------------------------------------*/ 5458 5459 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5460 { 5461 bool aSign; 5462 int32_t aExp, shiftCount; 5463 uint64_t aSig, aSigExtra; 5464 5465 if (floatx80_invalid_encoding(a)) { 5466 float_raise(float_flag_invalid, status); 5467 return 1ULL << 63; 5468 } 5469 aSig = extractFloatx80Frac( a ); 5470 aExp = extractFloatx80Exp( a ); 5471 aSign = extractFloatx80Sign( a ); 5472 shiftCount = 0x403E - aExp; 5473 if ( shiftCount <= 0 ) { 5474 if ( shiftCount ) { 5475 float_raise(float_flag_invalid, status); 5476 if (!aSign || floatx80_is_any_nan(a)) { 5477 return INT64_MAX; 5478 } 5479 return INT64_MIN; 5480 } 5481 aSigExtra = 0; 5482 } 5483 else { 5484 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5485 } 5486 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5487 5488 } 5489 5490 /*---------------------------------------------------------------------------- 5491 | Returns the result of converting the extended double-precision floating- 5492 | point value `a' to the 64-bit two's complement integer format. The 5493 | conversion is performed according to the IEC/IEEE Standard for Binary 5494 | Floating-Point Arithmetic, except that the conversion is always rounded 5495 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5496 | Otherwise, if the conversion overflows, the largest integer with the same 5497 | sign as `a' is returned. 
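| (The overflow test below special-cases the one value in this range that
| still converts exactly: a.high == 0xC03E with significand
| 0x8000000000000000, i.e. -2**63, reaches the INT64_MIN return without
| raising the invalid flag.)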
5498 *----------------------------------------------------------------------------*/ 5499 5500 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5501 { 5502 bool aSign; 5503 int32_t aExp, shiftCount; 5504 uint64_t aSig; 5505 int64_t z; 5506 5507 if (floatx80_invalid_encoding(a)) { 5508 float_raise(float_flag_invalid, status); 5509 return 1ULL << 63; 5510 } 5511 aSig = extractFloatx80Frac( a ); 5512 aExp = extractFloatx80Exp( a ); 5513 aSign = extractFloatx80Sign( a ); 5514 shiftCount = aExp - 0x403E; 5515 if ( 0 <= shiftCount ) { 5516 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5517 if ( ( a.high != 0xC03E ) || aSig ) { 5518 float_raise(float_flag_invalid, status); 5519 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5520 return INT64_MAX; 5521 } 5522 } 5523 return INT64_MIN; 5524 } 5525 else if ( aExp < 0x3FFF ) { 5526 if (aExp | aSig) { 5527 float_raise(float_flag_inexact, status); 5528 } 5529 return 0; 5530 } 5531 z = aSig>>( - shiftCount ); 5532 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5533 float_raise(float_flag_inexact, status); 5534 } 5535 if ( aSign ) z = - z; 5536 return z; 5537 5538 } 5539 5540 /*---------------------------------------------------------------------------- 5541 | Returns the result of converting the extended double-precision floating- 5542 | point value `a' to the single-precision floating-point format. The 5543 | conversion is performed according to the IEC/IEEE Standard for Binary 5544 | Floating-Point Arithmetic. 5545 *----------------------------------------------------------------------------*/ 5546 5547 float32 floatx80_to_float32(floatx80 a, float_status *status) 5548 { 5549 bool aSign; 5550 int32_t aExp; 5551 uint64_t aSig; 5552 5553 if (floatx80_invalid_encoding(a)) { 5554 float_raise(float_flag_invalid, status); 5555 return float32_default_nan(status); 5556 } 5557 aSig = extractFloatx80Frac( a ); 5558 aExp = extractFloatx80Exp( a ); 5559 aSign = extractFloatx80Sign( a ); 5560 if ( aExp == 0x7FFF ) { 5561 if ( (uint64_t) ( aSig<<1 ) ) { 5562 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5563 status); 5564 return float32_silence_nan(res, status); 5565 } 5566 return packFloat32( aSign, 0xFF, 0 ); 5567 } 5568 shift64RightJamming( aSig, 33, &aSig ); 5569 if ( aExp || aSig ) aExp -= 0x3F81; 5570 return roundAndPackFloat32(aSign, aExp, aSig, status); 5571 5572 } 5573 5574 /*---------------------------------------------------------------------------- 5575 | Returns the result of converting the extended double-precision floating- 5576 | point value `a' to the double-precision floating-point format. The 5577 | conversion is performed according to the IEC/IEEE Standard for Binary 5578 | Floating-Point Arithmetic. 
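| (The exponent rebias below is 0x3C01: the difference of the two biases,
| 0x3FFF - 0x3FF = 0x3C00, plus one because roundAndPackFloat64 expects an
| exponent one below the true one.)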
 *----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
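| (For example, 2.5 rounds to 2.0 under float_round_nearest_even but to 3.0
| under float_round_ties_away; values that are already integral are returned
| unchanged with no flags raised.)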

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}
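
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * the effect of the rounding mode on floatx80_round_to_int(), using only
 * public entry points declared in "fpu/softfloat.h".
 */
static void __attribute__((unused)) example_floatx80_round_to_int(void)
{
    float_status status = { };
    /* 7 / 2 == 3.5 exactly. */
    floatx80 x = floatx80_div(int64_to_floatx80(7, &status),
                              int64_to_floatx80(2, &status), &status);

    status.float_rounding_mode = float_round_nearest_even;
    /* The tie 3.5 rounds to the even neighbor, 4. */
    assert(floatx80_to_int64(floatx80_round_to_int(x, &status), &status) == 4);
    status.float_rounding_mode = float_round_down;
    assert(floatx80_to_int64(floatx80_round_to_int(x, &status), &status) == 3);
}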

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
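
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * addFloatx80Sigs() adds magnitudes only and then applies `zSign', so a
 * forced sign of 1 yields -(|a| + |b|).  Calling the static helper
 * directly like this is only possible from within this translation unit.
 */
static void __attribute__((unused)) example_addFloatx80Sigs(void)
{
    float_status status = { };
    floatx80 two = int64_to_floatx80(2, &status);
    floatx80 three = int64_to_floatx80(3, &status);

    floatx80 z = addFloatx80Sigs(two, three, 1, &status);
    assert(floatx80_to_int64(z, &status) == -5);
}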

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of adding the extended double-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign == bSign ) {
        return addFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return subFloatx80Sigs(a, b, aSign, status);
    }

}
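
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * when the operand signs differ, floatx80_add() dispatches to
 * subFloatx80Sigs(), whose exact-cancellation case returns a zero whose
 * sign depends on the rounding mode, as the code above shows.
 */
static void __attribute__((unused)) example_floatx80_signed_zero_sum(void)
{
    float_status status = { };
    floatx80 one = int64_to_floatx80(1, &status);
    floatx80 minus_one = int64_to_floatx80(-1, &status);

    /* 1 + (-1) is +0 in the default mode, but -0 when rounding down. */
    assert(!extractFloatx80Sign(floatx80_add(one, minus_one, &status)));
    status.float_rounding_mode = float_round_down;
    assert(extractFloatx80Sign(floatx80_add(one, minus_one, &status)));
}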

/*----------------------------------------------------------------------------
| Returns the result of subtracting the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
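
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * the 0 * inf special case of floatx80_mul() above raises invalid and
 * produces the default NaN (exponent field all ones).  packFloatx80 and
 * the floatx80_infinity_* constants are the ones already used in this
 * file.
 */
static void __attribute__((unused)) example_floatx80_mul_zero_times_inf(void)
{
    float_status status = { };
    floatx80 zero = packFloatx80(0, 0, 0);
    floatx80 inf = packFloatx80(0, floatx80_infinity_high,
                                floatx80_infinity_low);

    floatx80 z = floatx80_mul(zero, inf, &status);
    assert(status.float_exception_flags & float_flag_invalid);
    assert((z.high & 0x7FFF) == 0x7FFF);
}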

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
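
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * dividing a finite non-zero value by zero takes the divbyzero path of
 * floatx80_div() above and returns a correctly signed infinity.
 */
static void __attribute__((unused)) example_floatx80_div_by_zero(void)
{
    float_status status = { };
    floatx80 one = int64_to_floatx80(1, &status);
    floatx80 zero = packFloatx80(0, 0, 0);

    floatx80 z = floatx80_div(one, zero, &status);
    assert(status.float_exception_flags & float_flag_divbyzero);
    assert((z.high & 0x7FFF) == 0x7FFF && z.low == floatx80_infinity_low);
}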

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
{
    uint64_t quotient;
    return floatx80_modrem(a, b, false, &quotient, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b', with the quotient truncated
| toward zero.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
{
    uint64_t quotient;
    return floatx80_modrem(a, b, true, &quotient, status);
}
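
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * the contrast between the two wrappers above.  floatx80_rem() rounds the
 * quotient to nearest (5/3 -> 2, remainder 5 - 2*3 = -1), while
 * floatx80_mod() truncates it (5/3 -> 1, remainder 2).
 */
static void __attribute__((unused)) example_floatx80_rem_vs_mod(void)
{
    float_status status = { };
    floatx80 a = int64_to_floatx80(5, &status);
    floatx80 b = int64_to_floatx80(3, &status);

    assert(floatx80_to_int64(floatx80_rem(a, b, &status), &status) == -1);
    assert(floatx80_to_int64(floatx80_mod(a, b, &status), &status) == 2);
}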

/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}
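
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * a small integer survives the int64 -> float128 -> floatx80 -> int64
 * round trip exactly, since it fits in both significands; assumes the
 * public conversion routines declared in "fpu/softfloat.h".
 */
static void __attribute__((unused)) example_float128_to_floatx80_roundtrip(void)
{
    float_status status = { };
    float128 q = int64_to_float128(12345, &status);

    assert(floatx80_to_int64(float128_to_floatx80(q, &status), &status)
           == 12345);
}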

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0,
        &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        aExp++;
    }

    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        aSig0 |= UINT64_C(0x0001000000000000);
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;
    } else {
        aExp++;
    }

    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}

static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}
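
/*
 * Editor's illustrative sketch (hypothetical helper, compiled as unused):
 * the signaling vs quiet comparison entry points defined above.  The
 * quiet variant must not raise invalid for quiet NaN operands such as
 * the default NaN.
 */
static void __attribute__((unused)) example_floatx80_compare(void)
{
    float_status status = { };
    floatx80 one = int64_to_floatx80(1, &status);
    floatx80 two = int64_to_floatx80(2, &status);

    assert(floatx80_compare(one, two, &status) == float_relation_less);
    assert(floatx80_compare_quiet(one, floatx80_default_nan(&status),
                                  &status) == float_relation_unordered);
    assert(!(status.float_exception_flags & float_flag_invalid));
}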