/*
 * QEMU float support
 *
 * The code in this source file is derived from release 2a of the SoftFloat
 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
 * some later contributions) are provided under that license, as detailed below.
 * It has subsequently been modified by contributors to the QEMU Project,
 * so some portions are provided under:
 *  the SoftFloat-2a license
 *  the BSD license
 *  GPL-v2-or-later
 *
 * Any future contributions to this file after December 1st 2014 will be
 * taken to be licensed under the Softfloat-2a license unless specifically
 * indicated otherwise.
 */

/*
===============================================================================
This C source file is part of the SoftFloat IEC/IEEE Floating-point
Arithmetic Package, Release 2a.

Written by John R. Hauser. This work was made possible in part by the
International Computer Science Institute, located at Suite 600, 1947 Center
Street, Berkeley, California 94704. Funding was partially provided by the
National Science Foundation under grant MIP-9311980. The original version
of this code was written as part of a project to build a fixed-point vector
processor in collaboration with the University of California at Berkeley,
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
arithmetic/SoftFloat.html'.

THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.

Derivative works are acceptable, even for commercial purposes, so long as
(1) they include prominent notice that the work is derivative, and (2) they
include prominent notice akin to these four paragraphs for those parts of
this code that are retained.

===============================================================================
*/

/* BSD licensing:
 * Copyright (c) 2006, Fabrice Bellard
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Portions of this work are licensed under the terms of the GNU GPL,
 * version 2 or later. See the COPYING file in the top-level directory.
 */

/* softfloat (and in particular the code in softfloat-specialize.c.inc) is
 * target-dependent and needs the TARGET_* macros.
 */
#include "qemu/osdep.h"
#include <math.h>
#include "qemu/bitops.h"
#include "fpu/softfloat.h"

/* We only need stdlib for abort() */

/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations. (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
#include "fpu/softfloat-macros.h"

/*
 * Hardfloat
 *
 * Fast emulation of guest FP instructions is challenging for two reasons.
 * First, FP instruction semantics are similar but not identical, particularly
 * when handling NaNs. Second, emulating at reasonable speed the guest FP
 * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
 *
 * We address these challenges by leveraging the host FPU for a subset of the
 * operations. To do this we expand on the idea presented in this paper:
 *
 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
 *
 * The idea is thus to leverage the host FPU to (1) compute FP operations
 * and (2) identify whether FP exceptions occurred while avoiding
 * expensive exception flag register accesses.
 *
 * An important optimization shown in the paper is that given that exception
 * flags are rarely cleared by the guest, we can avoid recomputing some flags.
 * This is particularly useful for the inexact flag, which is very frequently
 * raised in floating-point workloads.
 *
 * We optimize the code further by deferring to soft-fp whenever FP exception
 * detection might get hairy. Two examples: (1) when at least one operand is
 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
 * and the result is < the minimum normal.
 */
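
/*
 * The resulting fast path has roughly this shape (an illustrative sketch
 * only -- the identifiers here are placeholders, not functions defined
 * below; see float32_gen2()/float64_gen2() for the real logic):
 *
 *   if (can_use_fpu(status) && all_inputs_zero_or_normal) {
 *       result = host_fp_op(a, b);          // one native FP instruction
 *       if (!result_needs_flag_fixup) {     // no overflow/underflow seen
 *           return result;
 *       }
 *   }
 *   return soft_fp_op(a, b, status);        // full soft-float fallback
 */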

#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

#define GEN_INPUT_FLUSH2(name, soft_t)                              \
    static inline void name(soft_t *a, soft_t *b, float_status *s)  \
    {                                                               \
        if (likely(!s->flush_inputs_to_zero)) {                     \
            return;                                                 \
        }                                                           \
        soft_t ## _input_flush__nocheck(a, s);                      \
        soft_t ## _input_flush__nocheck(b, s);                      \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

#define GEN_INPUT_FLUSH3(name, soft_t)                                        \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                         \
        if (likely(!s->flush_inputs_to_zero)) {                               \
            return;                                                           \
        }                                                                     \
        soft_t ## _input_flush__nocheck(a, s);                                \
        soft_t ## _input_flush__nocheck(b, s);                                \
        soft_t ## _input_flush__nocheck(c, s);                                \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions. Each combination of number of inputs and
 * float size gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}
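
/*
 * Example (informal): once a guest has executed a single inexact operation
 * with default rounding, float_flag_inexact stays set, so can_use_fpu()
 * keeps returning true and subsequent add/sub/mul/div can take the
 * hardfloat path -- until the guest clears the flags or selects a rounding
 * mode other than nearest-even.
 */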

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);
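
/*
 * A minimal sketch of how these unions are used (informal; it assumes, as
 * this file does, that float32/float64 share the bit layout of the host's
 * float/double):
 *
 *   union_float32 u;
 *   u.s = xa;            // soft-float view: the raw IEEE-754 bits
 *   float native = u.h;  // host view: directly usable by the host FPU
 */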

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}
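
/*
 * Example of the underflow guard above (informal): for float32,
 * 1e-20f * 1e-20f underflows to a subnormal-or-zero result. Since
 * fabsf(result) <= FLT_MIN and the inputs are not both zero, post()
 * sends the operation back to soft-float so the underflow and inexact
 * flags are raised exactly as the guest expects.
 */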

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};

/* Flags for parts_minmax. */
enum {
    /* Set for minimum; clear for maximum. */
    minmax_ismin = 1,
    /* Set for the IEEE 754-2008 minNum() and maxNum() operations. */
    minmax_isnum = 2,
    /* Set for the IEEE 754-2008 minNumMag() and maxNumMag() operations. */
    minmax_ismag = 4,
};
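
/*
 * Example (informal): for a multiply where a is zero and b is infinity,
 *   ab_mask = float_cmask(a->cls) | float_cmask(b->cls)
 *           = float_cmask_zero | float_cmask_inf
 *           = float_cmask_infzero,
 * which is exactly the 0 * Inf case that must raise Invalid.
 */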

/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation. In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT    63
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
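
/*
 * Worked example (informal): the float32 value 1.5f decomposes to
 *   cls = float_class_normal, sign = false, exp = 0,
 *   frac = 0xC000000000000000
 * i.e. the implicit integer bit sits at DECOMPOSED_BINARY_POINT (bit 63)
 * and the 0.5 fraction occupies bit 62.
 */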

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                              \
    .exp_size       = E,                                \
    .exp_bias       = ((1 << E) - 1) >> 1,              \
    .exp_max        = (1 << E) - 1,                     \
    .frac_size      = F,                                \
    .frac_shift     = (-F - 1) & 63,                    \
    .frac_lsb       = 1ull << ((-F - 1) & 63),          \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),          \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,    \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize. */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}
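
/*
 * Example (informal): FLOAT_PARAMS(8, 23) for float32 expands to
 *   exp_bias = 127, exp_max = 255, frac_shift = 40,
 *   frac_lsb = 1ull << 40, frac_lsbm1 = 1ull << 39,
 *   round_mask = (1ull << 40) - 1, roundeven_mask = (2ull << 40) - 1
 * i.e. the 23-bit fraction is left-justified below the implicit bit at
 * bit 63, leaving the low 40 bits as rounding bits.
 */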

/* Pack a float from parts, but do not canonicalize. */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}
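
/*
 * Informal round-trip sketch: the raw pack/unpack pairs above are exact
 * inverses for any well-formed encoding, e.g.
 *
 *   FloatParts64 p;
 *   float32_unpack_raw(&p, f);
 *   assert(float32_pack_raw(&p) == f);
 *
 * Canonicalize/uncanon (below) are where representations change.
 */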

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)
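
/*
 * Example (informal): given a FloatParts64 *p, parts_canonicalize(p, s, fmt)
 * resolves at compile time to parts64_canonicalize(p, s, fmt); given a
 * FloatParts128 * it resolves to parts128_canonicalize(). QEMU_GENERIC
 * dispatches on the static type of the pointer, so the generic names add
 * no runtime cost.
 */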

static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)

static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_div(A, B, S) \
    PARTS_GENERIC_64_128(div, A)(A, B, S)

static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,
                                        int scale, int frac_size);
static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,
                                         int scale, int frac_size);

#define parts_round_to_int_normal(A, R, C, F) \
    PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)

static void parts64_round_to_int(FloatParts64 *a, FloatRoundMode rm,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt);
static void parts128_round_to_int(FloatParts128 *a, FloatRoundMode r,
                                  int scale, float_status *s,
                                  const FloatFmt *fmt);

#define parts_round_to_int(A, R, C, S, F) \
    PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)

static int64_t parts64_float_to_sint(FloatParts64 *p, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s);
static int64_t parts128_float_to_sint(FloatParts128 *p, FloatRoundMode rmode,
                                      int scale, int64_t min, int64_t max,
                                      float_status *s);

#define parts_float_to_sint(P, R, Z, MN, MX, S) \
    PARTS_GENERIC_64_128(float_to_sint, P)(P, R, Z, MN, MX, S)

static uint64_t parts64_float_to_uint(FloatParts64 *p, FloatRoundMode rmode,
                                      int scale, uint64_t max,
                                      float_status *s);
static uint64_t parts128_float_to_uint(FloatParts128 *p, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s);

#define parts_float_to_uint(P, R, Z, M, S) \
    PARTS_GENERIC_64_128(float_to_uint, P)(P, R, Z, M, S)

static void parts64_sint_to_float(FloatParts64 *p, int64_t a,
                                  int scale, float_status *s);
static void parts128_sint_to_float(FloatParts128 *p, int64_t a,
                                   int scale, float_status *s);

#define parts_sint_to_float(P, I, Z, S) \
    PARTS_GENERIC_64_128(sint_to_float, P)(P, I, Z, S)

static void parts64_uint_to_float(FloatParts64 *p, uint64_t a,
                                  int scale, float_status *s);
static void parts128_uint_to_float(FloatParts128 *p, uint64_t a,
                                   int scale, float_status *s);

#define parts_uint_to_float(P, I, Z, S) \
    PARTS_GENERIC_64_128(uint_to_float, P)(P, I, Z, S)

static FloatParts64 *parts64_minmax(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, int flags);
static FloatParts128 *parts128_minmax(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, int flags);

#define parts_minmax(A, B, S, F) \
    PARTS_GENERIC_64_128(minmax, A)(A, B, S, F)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
{
    uint64_t n1, n0, r, q;
    bool ret;

    /*
     * We want a 2*N / N-bit division to produce exactly an N-bit
     * result, so that we do not lose any precision and so that we
     * do not have to renormalize afterward. If A.frac < B.frac,
     * then division would produce an (N-1)-bit result; shift A left
     * by one to produce an N-bit result, and return true to
     * decrement the exponent to match.
     *
     * The udiv_qrnnd algorithm that we're using requires normalization,
     * i.e. the msb of the denominator must be set, which is already true.
     */
    ret = a->frac < b->frac;
    if (ret) {
        n0 = a->frac;
        n1 = 0;
    } else {
        n0 = a->frac >> 1;
        n1 = a->frac << 63;
    }
    q = udiv_qrnnd(&r, n0, n1, b->frac);

    /* Set lsb if there is a remainder, to set inexact. */
    a->frac = q | (r != 0);

    return ret;
}
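
/*
 * Worked example (informal): dividing frac a = 0x8000000000000000 (1.0)
 * by frac b = 0xC000000000000000 (1.5) gives q = 0xAAAAAAAAAAAAAAAA with
 * a non-zero remainder, so the lsb is jammed to 1 (0xAAAAAAAAAAAAAAAB)
 * to keep the quotient recognizably inexact, and ret is true because
 * a < b, telling the caller to decrement the exponent.
 */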

static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t q0, q1, a0, a1, b0, b1;
    uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
    bool ret = false;

    a0 = a->frac_hi, a1 = a->frac_lo;
    b0 = b->frac_hi, b1 = b->frac_lo;

    ret = lt128(a0, a1, b0, b1);
    if (!ret) {
        a1 = shr_double(a0, a1, 1);
        a0 = a0 >> 1;
    }

    /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
    q0 = estimateDiv128To64(a0, a1, b0);

    /*
     * Estimate is high because B1 was not included (unless B1 == 0).
     * Reduce quotient and increase remainder until remainder is non-negative.
     * This loop will execute 0 to 2 times.
     */
    mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
    sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
    while (r0 != 0) {
        q0--;
        add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
    }

    /* Repeat using the remainder, producing a second word of quotient. */
    q1 = estimateDiv128To64(r1, r2, b0);
    mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
    sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
    while (r1 != 0) {
        q1--;
        add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
    }

    /* Any remainder indicates inexact; set sticky bit. */
    q1 |= (r2 | r3) != 0;

    a->frac_hi = q0;
    a->frac_lo = q1;
    return ret;
}

#define frac_div(A, B)  FRAC_GENERIC_64_128(div, A)(A, B)

static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128_256(neg, A)(A)

static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}
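
/*
 * Example (informal): a FloatParts64 with frac = 0x0000000000000001 is
 * shifted left by clz64(1) = 63 and frac64_normalize() returns 63; the
 * caller subtracts that from the exponent. An all-zero fraction returns
 * the full width (64, 128 or 256) so callers can recognize it.
 */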

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128_256(normalize, A)(A)

static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}
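
/*
 * Example (informal): frac64_shrjam with frac = 0x9 and c = 2 computes
 * (0x9 >> 2) | 1 = 0x3: the non-zero bits shifted out are OR-ed
 * ("jammed") into the lsb as a sticky bit, so later rounding still sees
 * that the value was inexact.
 */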

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)

static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)

#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef  N
#undef  W
#undef  partsN
#undef  FloatPartsN
#undef  FloatPartsW
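
/*
 * Sketch of what the template expansion above produces (informal): with
 * N=64/W=128, softfloat-parts.c.inc defines the parts64_*() functions in
 * terms of FloatParts64, widening to FloatParts128 where needed; with
 * N=128/W=256 it defines parts128_*(); the final N=256 include provides
 * only the add/sub helpers required as the widened counterpart of the
 * 128-bit variants.
 */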

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}
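
/*
 * Every public operation below follows the same pipeline (informal
 * summary): unpack the raw encoding into canonical parts, run the generic
 * parts_*() algorithm, then round and repack:
 *
 *   unpack_canonical -> parts_<op> -> round_pack_canonical
 */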

/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}
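
/*
 * Example (informal): 1e-38f - 1e-38f is exactly 0.0f, but the fast path
 * cannot cheaply tell an exact zero from an underflowed tiny result;
 * since the inputs are not both zero, the post() check above returns
 * true and the operation is redone in soft-float. For 0.0f + 0.0f no
 * underflow is possible, so the hardfloat result stands.
 */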

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
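
/*
 * Example of the flag handling above (informal):
 * float_muladd_negate_product computes -(a * b) + c by flipping the sign
 * of one factor before the fma, and float_muladd_negate_result negates
 * the final value; combined, they yield the fused negate variants that
 * guest ISAs (e.g. Arm's VFNMA/VFNMS family) expose.
 */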
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
            /* Possible underflow; defer to softfloat for correct flags. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Division
 */

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if
(QEMU_HARDFLOAT_2F64_USE_FP) { 2044 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 2045 fpclassify(b.h) == FP_NORMAL; 2046 } 2047 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 2048 } 2049 2050 static bool f32_div_post(union_float32 a, union_float32 b) 2051 { 2052 if (QEMU_HARDFLOAT_2F32_USE_FP) { 2053 return fpclassify(a.h) != FP_ZERO; 2054 } 2055 return !float32_is_zero(a.s); 2056 } 2057 2058 static bool f64_div_post(union_float64 a, union_float64 b) 2059 { 2060 if (QEMU_HARDFLOAT_2F64_USE_FP) { 2061 return fpclassify(a.h) != FP_ZERO; 2062 } 2063 return !float64_is_zero(a.s); 2064 } 2065 2066 float32 QEMU_FLATTEN 2067 float32_div(float32 a, float32 b, float_status *s) 2068 { 2069 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 2070 f32_div_pre, f32_div_post); 2071 } 2072 2073 float64 QEMU_FLATTEN 2074 float64_div(float64 a, float64 b, float_status *s) 2075 { 2076 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 2077 f64_div_pre, f64_div_post); 2078 } 2079 2080 bfloat16 QEMU_FLATTEN 2081 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status) 2082 { 2083 FloatParts64 pa, pb, *pr; 2084 2085 bfloat16_unpack_canonical(&pa, a, status); 2086 bfloat16_unpack_canonical(&pb, b, status); 2087 pr = parts_div(&pa, &pb, status); 2088 2089 return bfloat16_round_pack_canonical(pr, status); 2090 } 2091 2092 float128 QEMU_FLATTEN 2093 float128_div(float128 a, float128 b, float_status *status) 2094 { 2095 FloatParts128 pa, pb, *pr; 2096 2097 float128_unpack_canonical(&pa, a, status); 2098 float128_unpack_canonical(&pb, b, status); 2099 pr = parts_div(&pa, &pb, status); 2100 2101 return float128_round_pack_canonical(pr, status); 2102 } 2103 2104 /* 2105 * Float to Float conversions 2106 * 2107 * Returns the result of converting one float format to another. The 2108 * conversion is performed according to the IEC/IEEE Standard for 2109 * Binary Floating-Point Arithmetic. 2110 * 2111 * Usually this only needs to take care of raising invalid exceptions 2112 * and handling the conversion on NaNs. 2113 */ 2114 2115 static void parts_float_to_ahp(FloatParts64 *a, float_status *s) 2116 { 2117 switch (a->cls) { 2118 case float_class_qnan: 2119 case float_class_snan: 2120 /* 2121 * There is no NaN in the destination format. Raise Invalid 2122 * and return a zero with the sign of the input NaN. 2123 */ 2124 float_raise(float_flag_invalid, s); 2125 a->cls = float_class_zero; 2126 break; 2127 2128 case float_class_inf: 2129 /* 2130 * There is no Inf in the destination format. Raise Invalid 2131 * and return the maximum normal with the correct sign. 
2132 */ 2133 float_raise(float_flag_invalid, s); 2134 a->cls = float_class_normal; 2135 a->exp = float16_params_ahp.exp_max; 2136 a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift, 2137 float16_params_ahp.frac_size + 1); 2138 break; 2139 2140 case float_class_normal: 2141 case float_class_zero: 2142 break; 2143 2144 default: 2145 g_assert_not_reached(); 2146 } 2147 } 2148 2149 static void parts64_float_to_float(FloatParts64 *a, float_status *s) 2150 { 2151 if (is_nan(a->cls)) { 2152 parts_return_nan(a, s); 2153 } 2154 } 2155 2156 static void parts128_float_to_float(FloatParts128 *a, float_status *s) 2157 { 2158 if (is_nan(a->cls)) { 2159 parts_return_nan(a, s); 2160 } 2161 } 2162 2163 #define parts_float_to_float(P, S) \ 2164 PARTS_GENERIC_64_128(float_to_float, P)(P, S) 2165 2166 static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b, 2167 float_status *s) 2168 { 2169 a->cls = b->cls; 2170 a->sign = b->sign; 2171 a->exp = b->exp; 2172 2173 if (a->cls == float_class_normal) { 2174 frac_truncjam(a, b); 2175 } else if (is_nan(a->cls)) { 2176 /* Discard the low bits of the NaN. */ 2177 a->frac = b->frac_hi; 2178 parts_return_nan(a, s); 2179 } 2180 } 2181 2182 static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b, 2183 float_status *s) 2184 { 2185 a->cls = b->cls; 2186 a->sign = b->sign; 2187 a->exp = b->exp; 2188 frac_widen(a, b); 2189 2190 if (is_nan(a->cls)) { 2191 parts_return_nan(a, s); 2192 } 2193 } 2194 2195 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 2196 { 2197 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2198 FloatParts64 p; 2199 2200 float16a_unpack_canonical(&p, a, s, fmt16); 2201 parts_float_to_float(&p, s); 2202 return float32_round_pack_canonical(&p, s); 2203 } 2204 2205 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 2206 { 2207 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2208 FloatParts64 p; 2209 2210 float16a_unpack_canonical(&p, a, s, fmt16); 2211 parts_float_to_float(&p, s); 2212 return float64_round_pack_canonical(&p, s); 2213 } 2214 2215 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 2216 { 2217 FloatParts64 p; 2218 const FloatFmt *fmt; 2219 2220 float32_unpack_canonical(&p, a, s); 2221 if (ieee) { 2222 parts_float_to_float(&p, s); 2223 fmt = &float16_params; 2224 } else { 2225 parts_float_to_ahp(&p, s); 2226 fmt = &float16_params_ahp; 2227 } 2228 return float16a_round_pack_canonical(&p, s, fmt); 2229 } 2230 2231 static float64 QEMU_SOFTFLOAT_ATTR 2232 soft_float32_to_float64(float32 a, float_status *s) 2233 { 2234 FloatParts64 p; 2235 2236 float32_unpack_canonical(&p, a, s); 2237 parts_float_to_float(&p, s); 2238 return float64_round_pack_canonical(&p, s); 2239 } 2240 2241 float64 float32_to_float64(float32 a, float_status *s) 2242 { 2243 if (likely(float32_is_normal(a))) { 2244 /* Widening conversion can never produce inexact results. 
*/ 2245 union_float32 uf; 2246 union_float64 ud; 2247 uf.s = a; 2248 ud.h = uf.h; 2249 return ud.s; 2250 } else if (float32_is_zero(a)) { 2251 return float64_set_sign(float64_zero, float32_is_neg(a)); 2252 } else { 2253 return soft_float32_to_float64(a, s); 2254 } 2255 } 2256 2257 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 2258 { 2259 FloatParts64 p; 2260 const FloatFmt *fmt; 2261 2262 float64_unpack_canonical(&p, a, s); 2263 if (ieee) { 2264 parts_float_to_float(&p, s); 2265 fmt = &float16_params; 2266 } else { 2267 parts_float_to_ahp(&p, s); 2268 fmt = &float16_params_ahp; 2269 } 2270 return float16a_round_pack_canonical(&p, s, fmt); 2271 } 2272 2273 float32 float64_to_float32(float64 a, float_status *s) 2274 { 2275 FloatParts64 p; 2276 2277 float64_unpack_canonical(&p, a, s); 2278 parts_float_to_float(&p, s); 2279 return float32_round_pack_canonical(&p, s); 2280 } 2281 2282 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2283 { 2284 FloatParts64 p; 2285 2286 bfloat16_unpack_canonical(&p, a, s); 2287 parts_float_to_float(&p, s); 2288 return float32_round_pack_canonical(&p, s); 2289 } 2290 2291 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2292 { 2293 FloatParts64 p; 2294 2295 bfloat16_unpack_canonical(&p, a, s); 2296 parts_float_to_float(&p, s); 2297 return float64_round_pack_canonical(&p, s); 2298 } 2299 2300 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2301 { 2302 FloatParts64 p; 2303 2304 float32_unpack_canonical(&p, a, s); 2305 parts_float_to_float(&p, s); 2306 return bfloat16_round_pack_canonical(&p, s); 2307 } 2308 2309 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2310 { 2311 FloatParts64 p; 2312 2313 float64_unpack_canonical(&p, a, s); 2314 parts_float_to_float(&p, s); 2315 return bfloat16_round_pack_canonical(&p, s); 2316 } 2317 2318 float32 float128_to_float32(float128 a, float_status *s) 2319 { 2320 FloatParts64 p64; 2321 FloatParts128 p128; 2322 2323 float128_unpack_canonical(&p128, a, s); 2324 parts_float_to_float_narrow(&p64, &p128, s); 2325 return float32_round_pack_canonical(&p64, s); 2326 } 2327 2328 float64 float128_to_float64(float128 a, float_status *s) 2329 { 2330 FloatParts64 p64; 2331 FloatParts128 p128; 2332 2333 float128_unpack_canonical(&p128, a, s); 2334 parts_float_to_float_narrow(&p64, &p128, s); 2335 return float64_round_pack_canonical(&p64, s); 2336 } 2337 2338 float128 float32_to_float128(float32 a, float_status *s) 2339 { 2340 FloatParts64 p64; 2341 FloatParts128 p128; 2342 2343 float32_unpack_canonical(&p64, a, s); 2344 parts_float_to_float_widen(&p128, &p64, s); 2345 return float128_round_pack_canonical(&p128, s); 2346 } 2347 2348 float128 float64_to_float128(float64 a, float_status *s) 2349 { 2350 FloatParts64 p64; 2351 FloatParts128 p128; 2352 2353 float64_unpack_canonical(&p64, a, s); 2354 parts_float_to_float_widen(&p128, &p64, s); 2355 return float128_round_pack_canonical(&p128, s); 2356 } 2357 2358 /* 2359 * Round to integral value 2360 */ 2361 2362 float16 float16_round_to_int(float16 a, float_status *s) 2363 { 2364 FloatParts64 p; 2365 2366 float16_unpack_canonical(&p, a, s); 2367 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float16_params); 2368 return float16_round_pack_canonical(&p, s); 2369 } 2370 2371 float32 float32_round_to_int(float32 a, float_status *s) 2372 { 2373 FloatParts64 p; 2374 2375 float32_unpack_canonical(&p, a, s); 2376 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float32_params); 2377 return float32_round_pack_canonical(&p, s); 2378 } 2379 
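/*
 * Illustrative sketch, not part of the QEMU API: the *_round_to_int
 * family rounds to an integral value in the same float format,
 * honouring the current float_rounding_mode.  With
 * float_round_nearest_even, ties go to the even neighbour, so both
 * 1.5 and 2.5 round to 2.0.  The helper below is hypothetical and
 * exists only to show the calling convention.
 */
static inline float32 example_round_to_int_nearest(float32 x, float_status *st)
{
    st->float_rounding_mode = float_round_nearest_even;
    return float32_round_to_int(x, st);
}
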
2380 float64 float64_round_to_int(float64 a, float_status *s) 2381 { 2382 FloatParts64 p; 2383 2384 float64_unpack_canonical(&p, a, s); 2385 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float64_params); 2386 return float64_round_pack_canonical(&p, s); 2387 } 2388 2389 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2390 { 2391 FloatParts64 p; 2392 2393 bfloat16_unpack_canonical(&p, a, s); 2394 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &bfloat16_params); 2395 return bfloat16_round_pack_canonical(&p, s); 2396 } 2397 2398 float128 float128_round_to_int(float128 a, float_status *s) 2399 { 2400 FloatParts128 p; 2401 2402 float128_unpack_canonical(&p, a, s); 2403 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float128_params); 2404 return float128_round_pack_canonical(&p, s); 2405 } 2406 2407 /* 2408 * Floating-point to signed integer conversions 2409 */ 2410 2411 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2412 float_status *s) 2413 { 2414 FloatParts64 p; 2415 2416 float16_unpack_canonical(&p, a, s); 2417 return parts_float_to_sint(&p, rmode, scale, INT8_MIN, INT8_MAX, s); 2418 } 2419 2420 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2421 float_status *s) 2422 { 2423 FloatParts64 p; 2424 2425 float16_unpack_canonical(&p, a, s); 2426 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s); 2427 } 2428 2429 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2430 float_status *s) 2431 { 2432 FloatParts64 p; 2433 2434 float16_unpack_canonical(&p, a, s); 2435 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s); 2436 } 2437 2438 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2439 float_status *s) 2440 { 2441 FloatParts64 p; 2442 2443 float16_unpack_canonical(&p, a, s); 2444 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s); 2445 } 2446 2447 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2448 float_status *s) 2449 { 2450 FloatParts64 p; 2451 2452 float32_unpack_canonical(&p, a, s); 2453 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s); 2454 } 2455 2456 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2457 float_status *s) 2458 { 2459 FloatParts64 p; 2460 2461 float32_unpack_canonical(&p, a, s); 2462 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s); 2463 } 2464 2465 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2466 float_status *s) 2467 { 2468 FloatParts64 p; 2469 2470 float32_unpack_canonical(&p, a, s); 2471 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s); 2472 } 2473 2474 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2475 float_status *s) 2476 { 2477 FloatParts64 p; 2478 2479 float64_unpack_canonical(&p, a, s); 2480 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s); 2481 } 2482 2483 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2484 float_status *s) 2485 { 2486 FloatParts64 p; 2487 2488 float64_unpack_canonical(&p, a, s); 2489 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s); 2490 } 2491 2492 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2493 float_status *s) 2494 { 2495 FloatParts64 p; 2496 2497 float64_unpack_canonical(&p, a, s); 2498 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s); 2499 } 2500 
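/*
 * Illustrative sketch, not part of the QEMU API: the 'scale' argument
 * of the *_scalbn conversions is applied to the exponent before
 * rounding, so the result is round(x * 2^scale), saturated to the
 * target's range (with the invalid flag raised on overflow).  Guest
 * fixed-point conversions can be expressed this way; the helper name
 * below is made up for the example.
 */
static inline int32_t example_float64_to_q8(float64 x, float_status *st)
{
    /*
     * Round-to-zero conversion of x to a signed fixed-point value
     * with 8 fractional bits, i.e. trunc(x * 256).
     */
    return float64_to_int32_scalbn(x, float_round_to_zero, 8, st);
}
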
2501 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2502 float_status *s) 2503 { 2504 FloatParts64 p; 2505 2506 bfloat16_unpack_canonical(&p, a, s); 2507 return parts_float_to_sint(&p, rmode, scale, INT16_MIN, INT16_MAX, s); 2508 } 2509 2510 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2511 float_status *s) 2512 { 2513 FloatParts64 p; 2514 2515 bfloat16_unpack_canonical(&p, a, s); 2516 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s); 2517 } 2518 2519 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2520 float_status *s) 2521 { 2522 FloatParts64 p; 2523 2524 bfloat16_unpack_canonical(&p, a, s); 2525 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s); 2526 } 2527 2528 static int32_t float128_to_int32_scalbn(float128 a, FloatRoundMode rmode, 2529 int scale, float_status *s) 2530 { 2531 FloatParts128 p; 2532 2533 float128_unpack_canonical(&p, a, s); 2534 return parts_float_to_sint(&p, rmode, scale, INT32_MIN, INT32_MAX, s); 2535 } 2536 2537 static int64_t float128_to_int64_scalbn(float128 a, FloatRoundMode rmode, 2538 int scale, float_status *s) 2539 { 2540 FloatParts128 p; 2541 2542 float128_unpack_canonical(&p, a, s); 2543 return parts_float_to_sint(&p, rmode, scale, INT64_MIN, INT64_MAX, s); 2544 } 2545 2546 int8_t float16_to_int8(float16 a, float_status *s) 2547 { 2548 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2549 } 2550 2551 int16_t float16_to_int16(float16 a, float_status *s) 2552 { 2553 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2554 } 2555 2556 int32_t float16_to_int32(float16 a, float_status *s) 2557 { 2558 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2559 } 2560 2561 int64_t float16_to_int64(float16 a, float_status *s) 2562 { 2563 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2564 } 2565 2566 int16_t float32_to_int16(float32 a, float_status *s) 2567 { 2568 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2569 } 2570 2571 int32_t float32_to_int32(float32 a, float_status *s) 2572 { 2573 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2574 } 2575 2576 int64_t float32_to_int64(float32 a, float_status *s) 2577 { 2578 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2579 } 2580 2581 int16_t float64_to_int16(float64 a, float_status *s) 2582 { 2583 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2584 } 2585 2586 int32_t float64_to_int32(float64 a, float_status *s) 2587 { 2588 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2589 } 2590 2591 int64_t float64_to_int64(float64 a, float_status *s) 2592 { 2593 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2594 } 2595 2596 int32_t float128_to_int32(float128 a, float_status *s) 2597 { 2598 return float128_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2599 } 2600 2601 int64_t float128_to_int64(float128 a, float_status *s) 2602 { 2603 return float128_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2604 } 2605 2606 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2607 { 2608 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2609 } 2610 2611 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2612 { 2613 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2614 } 2615 2616 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2617 { 2618 return 
float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2619 } 2620 2621 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2622 { 2623 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2624 } 2625 2626 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2627 { 2628 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2629 } 2630 2631 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2632 { 2633 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2634 } 2635 2636 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2637 { 2638 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2639 } 2640 2641 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2642 { 2643 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2644 } 2645 2646 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2647 { 2648 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2649 } 2650 2651 int32_t float128_to_int32_round_to_zero(float128 a, float_status *s) 2652 { 2653 return float128_to_int32_scalbn(a, float_round_to_zero, 0, s); 2654 } 2655 2656 int64_t float128_to_int64_round_to_zero(float128 a, float_status *s) 2657 { 2658 return float128_to_int64_scalbn(a, float_round_to_zero, 0, s); 2659 } 2660 2661 int16_t bfloat16_to_int16(bfloat16 a, float_status *s) 2662 { 2663 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2664 } 2665 2666 int32_t bfloat16_to_int32(bfloat16 a, float_status *s) 2667 { 2668 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2669 } 2670 2671 int64_t bfloat16_to_int64(bfloat16 a, float_status *s) 2672 { 2673 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2674 } 2675 2676 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s) 2677 { 2678 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2679 } 2680 2681 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s) 2682 { 2683 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2684 } 2685 2686 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s) 2687 { 2688 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2689 } 2690 2691 /* 2692 * Floating-point to unsigned integer conversions 2693 */ 2694 2695 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2696 float_status *s) 2697 { 2698 FloatParts64 p; 2699 2700 float16_unpack_canonical(&p, a, s); 2701 return parts_float_to_uint(&p, rmode, scale, UINT8_MAX, s); 2702 } 2703 2704 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2705 float_status *s) 2706 { 2707 FloatParts64 p; 2708 2709 float16_unpack_canonical(&p, a, s); 2710 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s); 2711 } 2712 2713 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2714 float_status *s) 2715 { 2716 FloatParts64 p; 2717 2718 float16_unpack_canonical(&p, a, s); 2719 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s); 2720 } 2721 2722 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2723 float_status *s) 2724 { 2725 FloatParts64 p; 2726 2727 float16_unpack_canonical(&p, a, s); 2728 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s); 2729 } 2730 2731 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2732 float_status *s) 2733 { 2734 FloatParts64 p; 2735 2736 float32_unpack_canonical(&p, a, s); 2737 
return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s); 2738 } 2739 2740 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2741 float_status *s) 2742 { 2743 FloatParts64 p; 2744 2745 float32_unpack_canonical(&p, a, s); 2746 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s); 2747 } 2748 2749 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2750 float_status *s) 2751 { 2752 FloatParts64 p; 2753 2754 float32_unpack_canonical(&p, a, s); 2755 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s); 2756 } 2757 2758 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2759 float_status *s) 2760 { 2761 FloatParts64 p; 2762 2763 float64_unpack_canonical(&p, a, s); 2764 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s); 2765 } 2766 2767 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2768 float_status *s) 2769 { 2770 FloatParts64 p; 2771 2772 float64_unpack_canonical(&p, a, s); 2773 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s); 2774 } 2775 2776 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2777 float_status *s) 2778 { 2779 FloatParts64 p; 2780 2781 float64_unpack_canonical(&p, a, s); 2782 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s); 2783 } 2784 2785 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2786 int scale, float_status *s) 2787 { 2788 FloatParts64 p; 2789 2790 bfloat16_unpack_canonical(&p, a, s); 2791 return parts_float_to_uint(&p, rmode, scale, UINT16_MAX, s); 2792 } 2793 2794 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2795 int scale, float_status *s) 2796 { 2797 FloatParts64 p; 2798 2799 bfloat16_unpack_canonical(&p, a, s); 2800 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s); 2801 } 2802 2803 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2804 int scale, float_status *s) 2805 { 2806 FloatParts64 p; 2807 2808 bfloat16_unpack_canonical(&p, a, s); 2809 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s); 2810 } 2811 2812 static uint32_t float128_to_uint32_scalbn(float128 a, FloatRoundMode rmode, 2813 int scale, float_status *s) 2814 { 2815 FloatParts128 p; 2816 2817 float128_unpack_canonical(&p, a, s); 2818 return parts_float_to_uint(&p, rmode, scale, UINT32_MAX, s); 2819 } 2820 2821 static uint64_t float128_to_uint64_scalbn(float128 a, FloatRoundMode rmode, 2822 int scale, float_status *s) 2823 { 2824 FloatParts128 p; 2825 2826 float128_unpack_canonical(&p, a, s); 2827 return parts_float_to_uint(&p, rmode, scale, UINT64_MAX, s); 2828 } 2829 2830 uint8_t float16_to_uint8(float16 a, float_status *s) 2831 { 2832 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2833 } 2834 2835 uint16_t float16_to_uint16(float16 a, float_status *s) 2836 { 2837 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2838 } 2839 2840 uint32_t float16_to_uint32(float16 a, float_status *s) 2841 { 2842 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2843 } 2844 2845 uint64_t float16_to_uint64(float16 a, float_status *s) 2846 { 2847 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2848 } 2849 2850 uint16_t float32_to_uint16(float32 a, float_status *s) 2851 { 2852 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2853 } 2854 2855 uint32_t float32_to_uint32(float32 a, float_status *s) 2856 { 2857 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, 
s); 2858 } 2859 2860 uint64_t float32_to_uint64(float32 a, float_status *s) 2861 { 2862 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2863 } 2864 2865 uint16_t float64_to_uint16(float64 a, float_status *s) 2866 { 2867 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2868 } 2869 2870 uint32_t float64_to_uint32(float64 a, float_status *s) 2871 { 2872 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2873 } 2874 2875 uint64_t float64_to_uint64(float64 a, float_status *s) 2876 { 2877 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2878 } 2879 2880 uint32_t float128_to_uint32(float128 a, float_status *s) 2881 { 2882 return float128_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2883 } 2884 2885 uint64_t float128_to_uint64(float128 a, float_status *s) 2886 { 2887 return float128_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2888 } 2889 2890 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2891 { 2892 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2893 } 2894 2895 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2896 { 2897 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2898 } 2899 2900 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2901 { 2902 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2903 } 2904 2905 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2906 { 2907 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2908 } 2909 2910 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2911 { 2912 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2913 } 2914 2915 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2916 { 2917 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2918 } 2919 2920 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2921 { 2922 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2923 } 2924 2925 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2926 { 2927 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2928 } 2929 2930 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2931 { 2932 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2933 } 2934 2935 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *s) 2936 { 2937 return float128_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2938 } 2939 2940 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *s) 2941 { 2942 return float128_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2943 } 2944 2945 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2946 { 2947 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2948 } 2949 2950 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2951 { 2952 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2953 } 2954 2955 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2956 { 2957 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2958 } 2959 2960 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2961 { 2962 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2963 } 2964 2965 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2966 { 2967 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2968 } 2969 2970 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, 
                                          float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Signed integer to floating-point conversions
 */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return float16_round_pack_canonical(&p, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return float32_round_pack_canonical(&p, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return float64_round_pack_canonical(&p, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 p;

    parts_sint_to_float(&p, a, scale, status);
    return bfloat16_round_pack_canonical(&p, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16
int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status) 3097 { 3098 return int64_to_bfloat16_scalbn(a, scale, status); 3099 } 3100 3101 bfloat16 int64_to_bfloat16(int64_t a, float_status *status) 3102 { 3103 return int64_to_bfloat16_scalbn(a, 0, status); 3104 } 3105 3106 bfloat16 int32_to_bfloat16(int32_t a, float_status *status) 3107 { 3108 return int64_to_bfloat16_scalbn(a, 0, status); 3109 } 3110 3111 bfloat16 int16_to_bfloat16(int16_t a, float_status *status) 3112 { 3113 return int64_to_bfloat16_scalbn(a, 0, status); 3114 } 3115 3116 float128 int64_to_float128(int64_t a, float_status *status) 3117 { 3118 FloatParts128 p; 3119 3120 parts_sint_to_float(&p, a, 0, status); 3121 return float128_round_pack_canonical(&p, status); 3122 } 3123 3124 float128 int32_to_float128(int32_t a, float_status *status) 3125 { 3126 return int64_to_float128(a, status); 3127 } 3128 3129 /* 3130 * Unsigned Integer to floating-point conversions 3131 */ 3132 3133 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3134 { 3135 FloatParts64 p; 3136 3137 parts_uint_to_float(&p, a, scale, status); 3138 return float16_round_pack_canonical(&p, status); 3139 } 3140 3141 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3142 { 3143 return uint64_to_float16_scalbn(a, scale, status); 3144 } 3145 3146 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3147 { 3148 return uint64_to_float16_scalbn(a, scale, status); 3149 } 3150 3151 float16 uint64_to_float16(uint64_t a, float_status *status) 3152 { 3153 return uint64_to_float16_scalbn(a, 0, status); 3154 } 3155 3156 float16 uint32_to_float16(uint32_t a, float_status *status) 3157 { 3158 return uint64_to_float16_scalbn(a, 0, status); 3159 } 3160 3161 float16 uint16_to_float16(uint16_t a, float_status *status) 3162 { 3163 return uint64_to_float16_scalbn(a, 0, status); 3164 } 3165 3166 float16 uint8_to_float16(uint8_t a, float_status *status) 3167 { 3168 return uint64_to_float16_scalbn(a, 0, status); 3169 } 3170 3171 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3172 { 3173 FloatParts64 p; 3174 3175 parts_uint_to_float(&p, a, scale, status); 3176 return float32_round_pack_canonical(&p, status); 3177 } 3178 3179 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3180 { 3181 return uint64_to_float32_scalbn(a, scale, status); 3182 } 3183 3184 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3185 { 3186 return uint64_to_float32_scalbn(a, scale, status); 3187 } 3188 3189 float32 uint64_to_float32(uint64_t a, float_status *status) 3190 { 3191 return uint64_to_float32_scalbn(a, 0, status); 3192 } 3193 3194 float32 uint32_to_float32(uint32_t a, float_status *status) 3195 { 3196 return uint64_to_float32_scalbn(a, 0, status); 3197 } 3198 3199 float32 uint16_to_float32(uint16_t a, float_status *status) 3200 { 3201 return uint64_to_float32_scalbn(a, 0, status); 3202 } 3203 3204 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3205 { 3206 FloatParts64 p; 3207 3208 parts_uint_to_float(&p, a, scale, status); 3209 return float64_round_pack_canonical(&p, status); 3210 } 3211 3212 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3213 { 3214 return uint64_to_float64_scalbn(a, scale, status); 3215 } 3216 3217 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3218 { 3219 return uint64_to_float64_scalbn(a, scale, status); 3220 } 3221 
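/*
 * Illustrative sketch, not part of the QEMU API: integer-to-float
 * conversions are only exact while the integer fits in the target
 * significand.  float32 has a 24-bit significand, so 0xffffffff is
 * not representable; under nearest-even rounding it converts to
 * 4294967296.0f and float_flag_inexact is raised.  The helper below
 * is hypothetical, shown only as a usage example.
 */
static inline float32 example_uint32_to_float32_inexact(float_status *st)
{
    st->float_rounding_mode = float_round_nearest_even;
    return uint32_to_float32(0xffffffffu, st);  /* 2^32 as a float32 */
}
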
3222 float64 uint64_to_float64(uint64_t a, float_status *status) 3223 { 3224 return uint64_to_float64_scalbn(a, 0, status); 3225 } 3226 3227 float64 uint32_to_float64(uint32_t a, float_status *status) 3228 { 3229 return uint64_to_float64_scalbn(a, 0, status); 3230 } 3231 3232 float64 uint16_to_float64(uint16_t a, float_status *status) 3233 { 3234 return uint64_to_float64_scalbn(a, 0, status); 3235 } 3236 3237 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3238 { 3239 FloatParts64 p; 3240 3241 parts_uint_to_float(&p, a, scale, status); 3242 return bfloat16_round_pack_canonical(&p, status); 3243 } 3244 3245 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3246 { 3247 return uint64_to_bfloat16_scalbn(a, scale, status); 3248 } 3249 3250 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3251 { 3252 return uint64_to_bfloat16_scalbn(a, scale, status); 3253 } 3254 3255 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3256 { 3257 return uint64_to_bfloat16_scalbn(a, 0, status); 3258 } 3259 3260 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3261 { 3262 return uint64_to_bfloat16_scalbn(a, 0, status); 3263 } 3264 3265 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3266 { 3267 return uint64_to_bfloat16_scalbn(a, 0, status); 3268 } 3269 3270 float128 uint64_to_float128(uint64_t a, float_status *status) 3271 { 3272 FloatParts128 p; 3273 3274 parts_uint_to_float(&p, a, 0, status); 3275 return float128_round_pack_canonical(&p, status); 3276 } 3277 3278 /* 3279 * Minimum and maximum 3280 */ 3281 3282 static float16 float16_minmax(float16 a, float16 b, float_status *s, int flags) 3283 { 3284 FloatParts64 pa, pb, *pr; 3285 3286 float16_unpack_canonical(&pa, a, s); 3287 float16_unpack_canonical(&pb, b, s); 3288 pr = parts_minmax(&pa, &pb, s, flags); 3289 3290 return float16_round_pack_canonical(pr, s); 3291 } 3292 3293 static bfloat16 bfloat16_minmax(bfloat16 a, bfloat16 b, 3294 float_status *s, int flags) 3295 { 3296 FloatParts64 pa, pb, *pr; 3297 3298 bfloat16_unpack_canonical(&pa, a, s); 3299 bfloat16_unpack_canonical(&pb, b, s); 3300 pr = parts_minmax(&pa, &pb, s, flags); 3301 3302 return bfloat16_round_pack_canonical(pr, s); 3303 } 3304 3305 static float32 float32_minmax(float32 a, float32 b, float_status *s, int flags) 3306 { 3307 FloatParts64 pa, pb, *pr; 3308 3309 float32_unpack_canonical(&pa, a, s); 3310 float32_unpack_canonical(&pb, b, s); 3311 pr = parts_minmax(&pa, &pb, s, flags); 3312 3313 return float32_round_pack_canonical(pr, s); 3314 } 3315 3316 static float64 float64_minmax(float64 a, float64 b, float_status *s, int flags) 3317 { 3318 FloatParts64 pa, pb, *pr; 3319 3320 float64_unpack_canonical(&pa, a, s); 3321 float64_unpack_canonical(&pb, b, s); 3322 pr = parts_minmax(&pa, &pb, s, flags); 3323 3324 return float64_round_pack_canonical(pr, s); 3325 } 3326 3327 #define MINMAX_1(type, name, flags) \ 3328 type type##_##name(type a, type b, float_status *s) \ 3329 { return type##_minmax(a, b, s, flags); } 3330 3331 #define MINMAX_2(type) \ 3332 MINMAX_1(type, max, 0) \ 3333 MINMAX_1(type, maxnum, minmax_isnum) \ 3334 MINMAX_1(type, maxnummag, minmax_isnum | minmax_ismag) \ 3335 MINMAX_1(type, min, minmax_ismin) \ 3336 MINMAX_1(type, minnum, minmax_ismin | minmax_isnum) \ 3337 MINMAX_1(type, minnummag, minmax_ismin | minmax_isnum | minmax_ismag) 3338 3339 MINMAX_2(float16) 3340 MINMAX_2(bfloat16) 3341 MINMAX_2(float32) 3342 MINMAX_2(float64) 3343 3344 #undef 
MINMAX_1 3345 #undef MINMAX_2 3346 3347 /* Floating point compare */ 3348 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet, 3349 float_status *s) 3350 { 3351 if (is_nan(a.cls) || is_nan(b.cls)) { 3352 if (!is_quiet || 3353 a.cls == float_class_snan || 3354 b.cls == float_class_snan) { 3355 float_raise(float_flag_invalid, s); 3356 } 3357 return float_relation_unordered; 3358 } 3359 3360 if (a.cls == float_class_zero) { 3361 if (b.cls == float_class_zero) { 3362 return float_relation_equal; 3363 } 3364 return b.sign ? float_relation_greater : float_relation_less; 3365 } else if (b.cls == float_class_zero) { 3366 return a.sign ? float_relation_less : float_relation_greater; 3367 } 3368 3369 /* The only really important thing about infinity is its sign. If 3370 * both are infinities the sign marks the smallest of the two. 3371 */ 3372 if (a.cls == float_class_inf) { 3373 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 3374 return float_relation_equal; 3375 } 3376 return a.sign ? float_relation_less : float_relation_greater; 3377 } else if (b.cls == float_class_inf) { 3378 return b.sign ? float_relation_greater : float_relation_less; 3379 } 3380 3381 if (a.sign != b.sign) { 3382 return a.sign ? float_relation_less : float_relation_greater; 3383 } 3384 3385 if (a.exp == b.exp) { 3386 if (a.frac == b.frac) { 3387 return float_relation_equal; 3388 } 3389 if (a.sign) { 3390 return a.frac > b.frac ? 3391 float_relation_less : float_relation_greater; 3392 } else { 3393 return a.frac > b.frac ? 3394 float_relation_greater : float_relation_less; 3395 } 3396 } else { 3397 if (a.sign) { 3398 return a.exp > b.exp ? float_relation_less : float_relation_greater; 3399 } else { 3400 return a.exp > b.exp ? float_relation_greater : float_relation_less; 3401 } 3402 } 3403 } 3404 3405 #define COMPARE(name, attr, sz) \ 3406 static int attr \ 3407 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 3408 { \ 3409 FloatParts64 pa, pb; \ 3410 float ## sz ## _unpack_canonical(&pa, a, s); \ 3411 float ## sz ## _unpack_canonical(&pb, b, s); \ 3412 return compare_floats(pa, pb, is_quiet, s); \ 3413 } 3414 3415 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 3416 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 3417 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 3418 3419 #undef COMPARE 3420 3421 FloatRelation float16_compare(float16 a, float16 b, float_status *s) 3422 { 3423 return soft_f16_compare(a, b, false, s); 3424 } 3425 3426 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s) 3427 { 3428 return soft_f16_compare(a, b, true, s); 3429 } 3430 3431 static FloatRelation QEMU_FLATTEN 3432 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 3433 { 3434 union_float32 ua, ub; 3435 3436 ua.s = xa; 3437 ub.s = xb; 3438 3439 if (QEMU_NO_HARDFLOAT) { 3440 goto soft; 3441 } 3442 3443 float32_input_flush2(&ua.s, &ub.s, s); 3444 if (isgreaterequal(ua.h, ub.h)) { 3445 if (isgreater(ua.h, ub.h)) { 3446 return float_relation_greater; 3447 } 3448 return float_relation_equal; 3449 } 3450 if (likely(isless(ua.h, ub.h))) { 3451 return float_relation_less; 3452 } 3453 /* The only condition remaining is unordered. 3454 * Fall through to set flags. 
3455 */ 3456 soft: 3457 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 3458 } 3459 3460 FloatRelation float32_compare(float32 a, float32 b, float_status *s) 3461 { 3462 return f32_compare(a, b, false, s); 3463 } 3464 3465 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s) 3466 { 3467 return f32_compare(a, b, true, s); 3468 } 3469 3470 static FloatRelation QEMU_FLATTEN 3471 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 3472 { 3473 union_float64 ua, ub; 3474 3475 ua.s = xa; 3476 ub.s = xb; 3477 3478 if (QEMU_NO_HARDFLOAT) { 3479 goto soft; 3480 } 3481 3482 float64_input_flush2(&ua.s, &ub.s, s); 3483 if (isgreaterequal(ua.h, ub.h)) { 3484 if (isgreater(ua.h, ub.h)) { 3485 return float_relation_greater; 3486 } 3487 return float_relation_equal; 3488 } 3489 if (likely(isless(ua.h, ub.h))) { 3490 return float_relation_less; 3491 } 3492 /* The only condition remaining is unordered. 3493 * Fall through to set flags. 3494 */ 3495 soft: 3496 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3497 } 3498 3499 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3500 { 3501 return f64_compare(a, b, false, s); 3502 } 3503 3504 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3505 { 3506 return f64_compare(a, b, true, s); 3507 } 3508 3509 static FloatRelation QEMU_FLATTEN 3510 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s) 3511 { 3512 FloatParts64 pa, pb; 3513 3514 bfloat16_unpack_canonical(&pa, a, s); 3515 bfloat16_unpack_canonical(&pb, b, s); 3516 return compare_floats(pa, pb, is_quiet, s); 3517 } 3518 3519 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s) 3520 { 3521 return soft_bf16_compare(a, b, false, s); 3522 } 3523 3524 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s) 3525 { 3526 return soft_bf16_compare(a, b, true, s); 3527 } 3528 3529 /* Multiply A by 2 raised to the power N. */ 3530 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3531 { 3532 if (unlikely(is_nan(a.cls))) { 3533 parts_return_nan(&a, s); 3534 } 3535 if (a.cls == float_class_normal) { 3536 /* The largest float type (even though not supported by FloatParts64) 3537 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3538 * still allows rounding to infinity, without allowing overflow 3539 * within the int32_t that backs FloatParts64.exp. 
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result.  However, for simplicity, we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean, however, that the calculation is slower than before,
 * especially for 64-bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
3649 */ 3650 a.frac = (r_frac << 2) + (a_frac != 0); 3651 3652 return a; 3653 } 3654 3655 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3656 { 3657 FloatParts64 pa, pr; 3658 3659 float16_unpack_canonical(&pa, a, status); 3660 pr = sqrt_float(pa, status, &float16_params); 3661 return float16_round_pack_canonical(&pr, status); 3662 } 3663 3664 static float32 QEMU_SOFTFLOAT_ATTR 3665 soft_f32_sqrt(float32 a, float_status *status) 3666 { 3667 FloatParts64 pa, pr; 3668 3669 float32_unpack_canonical(&pa, a, status); 3670 pr = sqrt_float(pa, status, &float32_params); 3671 return float32_round_pack_canonical(&pr, status); 3672 } 3673 3674 static float64 QEMU_SOFTFLOAT_ATTR 3675 soft_f64_sqrt(float64 a, float_status *status) 3676 { 3677 FloatParts64 pa, pr; 3678 3679 float64_unpack_canonical(&pa, a, status); 3680 pr = sqrt_float(pa, status, &float64_params); 3681 return float64_round_pack_canonical(&pr, status); 3682 } 3683 3684 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3685 { 3686 union_float32 ua, ur; 3687 3688 ua.s = xa; 3689 if (unlikely(!can_use_fpu(s))) { 3690 goto soft; 3691 } 3692 3693 float32_input_flush1(&ua.s, s); 3694 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3695 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3696 fpclassify(ua.h) == FP_ZERO) || 3697 signbit(ua.h))) { 3698 goto soft; 3699 } 3700 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3701 float32_is_neg(ua.s))) { 3702 goto soft; 3703 } 3704 ur.h = sqrtf(ua.h); 3705 return ur.s; 3706 3707 soft: 3708 return soft_f32_sqrt(ua.s, s); 3709 } 3710 3711 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3712 { 3713 union_float64 ua, ur; 3714 3715 ua.s = xa; 3716 if (unlikely(!can_use_fpu(s))) { 3717 goto soft; 3718 } 3719 3720 float64_input_flush1(&ua.s, s); 3721 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3722 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3723 fpclassify(ua.h) == FP_ZERO) || 3724 signbit(ua.h))) { 3725 goto soft; 3726 } 3727 } else if (unlikely(!float64_is_zero_or_normal(ua.s) || 3728 float64_is_neg(ua.s))) { 3729 goto soft; 3730 } 3731 ur.h = sqrt(ua.h); 3732 return ur.s; 3733 3734 soft: 3735 return soft_f64_sqrt(ua.s, s); 3736 } 3737 3738 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3739 { 3740 FloatParts64 pa, pr; 3741 3742 bfloat16_unpack_canonical(&pa, a, status); 3743 pr = sqrt_float(pa, status, &bfloat16_params); 3744 return bfloat16_round_pack_canonical(&pr, status); 3745 } 3746 3747 /*---------------------------------------------------------------------------- 3748 | The pattern for a default generated NaN. 
3749 *----------------------------------------------------------------------------*/ 3750 3751 float16 float16_default_nan(float_status *status) 3752 { 3753 FloatParts64 p; 3754 3755 parts_default_nan(&p, status); 3756 p.frac >>= float16_params.frac_shift; 3757 return float16_pack_raw(&p); 3758 } 3759 3760 float32 float32_default_nan(float_status *status) 3761 { 3762 FloatParts64 p; 3763 3764 parts_default_nan(&p, status); 3765 p.frac >>= float32_params.frac_shift; 3766 return float32_pack_raw(&p); 3767 } 3768 3769 float64 float64_default_nan(float_status *status) 3770 { 3771 FloatParts64 p; 3772 3773 parts_default_nan(&p, status); 3774 p.frac >>= float64_params.frac_shift; 3775 return float64_pack_raw(&p); 3776 } 3777 3778 float128 float128_default_nan(float_status *status) 3779 { 3780 FloatParts128 p; 3781 3782 parts_default_nan(&p, status); 3783 frac_shr(&p, float128_params.frac_shift); 3784 return float128_pack_raw(&p); 3785 } 3786 3787 bfloat16 bfloat16_default_nan(float_status *status) 3788 { 3789 FloatParts64 p; 3790 3791 parts_default_nan(&p, status); 3792 p.frac >>= bfloat16_params.frac_shift; 3793 return bfloat16_pack_raw(&p); 3794 } 3795 3796 /*---------------------------------------------------------------------------- 3797 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 3798 *----------------------------------------------------------------------------*/ 3799 3800 float16 float16_silence_nan(float16 a, float_status *status) 3801 { 3802 FloatParts64 p; 3803 3804 float16_unpack_raw(&p, a); 3805 p.frac <<= float16_params.frac_shift; 3806 parts_silence_nan(&p, status); 3807 p.frac >>= float16_params.frac_shift; 3808 return float16_pack_raw(&p); 3809 } 3810 3811 float32 float32_silence_nan(float32 a, float_status *status) 3812 { 3813 FloatParts64 p; 3814 3815 float32_unpack_raw(&p, a); 3816 p.frac <<= float32_params.frac_shift; 3817 parts_silence_nan(&p, status); 3818 p.frac >>= float32_params.frac_shift; 3819 return float32_pack_raw(&p); 3820 } 3821 3822 float64 float64_silence_nan(float64 a, float_status *status) 3823 { 3824 FloatParts64 p; 3825 3826 float64_unpack_raw(&p, a); 3827 p.frac <<= float64_params.frac_shift; 3828 parts_silence_nan(&p, status); 3829 p.frac >>= float64_params.frac_shift; 3830 return float64_pack_raw(&p); 3831 } 3832 3833 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3834 { 3835 FloatParts64 p; 3836 3837 bfloat16_unpack_raw(&p, a); 3838 p.frac <<= bfloat16_params.frac_shift; 3839 parts_silence_nan(&p, status); 3840 p.frac >>= bfloat16_params.frac_shift; 3841 return bfloat16_pack_raw(&p); 3842 } 3843 3844 float128 float128_silence_nan(float128 a, float_status *status) 3845 { 3846 FloatParts128 p; 3847 3848 float128_unpack_raw(&p, a); 3849 frac_shl(&p, float128_params.frac_shift); 3850 parts_silence_nan(&p, status); 3851 frac_shr(&p, float128_params.frac_shift); 3852 return float128_pack_raw(&p); 3853 } 3854 3855 /*---------------------------------------------------------------------------- 3856 | If `a' is denormal and we are in flush-to-zero mode then set the 3857 | input-denormal exception and return zero. Otherwise just return the value. 
3858 *----------------------------------------------------------------------------*/ 3859 3860 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3861 { 3862 if (p.exp == 0 && p.frac != 0) { 3863 float_raise(float_flag_input_denormal, status); 3864 return true; 3865 } 3866 3867 return false; 3868 } 3869 3870 float16 float16_squash_input_denormal(float16 a, float_status *status) 3871 { 3872 if (status->flush_inputs_to_zero) { 3873 FloatParts64 p; 3874 3875 float16_unpack_raw(&p, a); 3876 if (parts_squash_denormal(p, status)) { 3877 return float16_set_sign(float16_zero, p.sign); 3878 } 3879 } 3880 return a; 3881 } 3882 3883 float32 float32_squash_input_denormal(float32 a, float_status *status) 3884 { 3885 if (status->flush_inputs_to_zero) { 3886 FloatParts64 p; 3887 3888 float32_unpack_raw(&p, a); 3889 if (parts_squash_denormal(p, status)) { 3890 return float32_set_sign(float32_zero, p.sign); 3891 } 3892 } 3893 return a; 3894 } 3895 3896 float64 float64_squash_input_denormal(float64 a, float_status *status) 3897 { 3898 if (status->flush_inputs_to_zero) { 3899 FloatParts64 p; 3900 3901 float64_unpack_raw(&p, a); 3902 if (parts_squash_denormal(p, status)) { 3903 return float64_set_sign(float64_zero, p.sign); 3904 } 3905 } 3906 return a; 3907 } 3908 3909 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3910 { 3911 if (status->flush_inputs_to_zero) { 3912 FloatParts64 p; 3913 3914 bfloat16_unpack_raw(&p, a); 3915 if (parts_squash_denormal(p, status)) { 3916 return bfloat16_set_sign(bfloat16_zero, p.sign); 3917 } 3918 } 3919 return a; 3920 } 3921 3922 /*---------------------------------------------------------------------------- 3923 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3924 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3925 | input. If `zSign' is 1, the input is negated before being converted to an 3926 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 3927 | is simply rounded to an integer, with the inexact exception raised if the 3928 | input cannot be represented exactly as an integer. However, if the fixed- 3929 | point input is too large, the invalid exception is raised and the largest 3930 | positive or negative integer is returned. 3931 *----------------------------------------------------------------------------*/ 3932 3933 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3934 float_status *status) 3935 { 3936 int8_t roundingMode; 3937 bool roundNearestEven; 3938 int8_t roundIncrement, roundBits; 3939 int32_t z; 3940 3941 roundingMode = status->float_rounding_mode; 3942 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3943 switch (roundingMode) { 3944 case float_round_nearest_even: 3945 case float_round_ties_away: 3946 roundIncrement = 0x40; 3947 break; 3948 case float_round_to_zero: 3949 roundIncrement = 0; 3950 break; 3951 case float_round_up: 3952 roundIncrement = zSign ? 0 : 0x7f; 3953 break; 3954 case float_round_down: 3955 roundIncrement = zSign ? 0x7f : 0; 3956 break; 3957 case float_round_to_odd: 3958 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3959 break; 3960 default: 3961 abort(); 3962 } 3963 roundBits = absZ & 0x7F; 3964 absZ = ( absZ + roundIncrement )>>7; 3965 if (!(roundBits ^ 0x40) && roundNearestEven) { 3966 absZ &= ~1; 3967 } 3968 z = absZ; 3969 if ( zSign ) z = - z; 3970 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3971 float_raise(float_flag_invalid, status); 3972 return zSign ? 
/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
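
/*----------------------------------------------------------------------------
| Added illustration (an editor's sketch, not original SoftFloat code): for a
| subnormal input aSig == 0x0003F000, clz32() returns 14, so shiftCount is 6;
| the normalized significand is 0x0003F000 << 6 == 0x00FC0000, whose leading
| bit sits in bit 23 as required, and the exponent 1 - 6 == -5 compensates
| for the shift so the represented value is unchanged.
*----------------------------------------------------------------------------*/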
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    if ( 0xFD <= (uint16_t) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
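
/*----------------------------------------------------------------------------
| Added illustration (editor's sketch): `zSig' carries the 23 fraction bits
| plus 7 guard bits, so a tie shows up as roundBits == 0x40. Rounding the
| halfway value 1 + 2^-24, whose fixed-point image is 0x40000040:
*----------------------------------------------------------------------------*/
#if 0
uint32_t zSig = 0x40000040;            /* 1 + 2^-24: integer bit 30, guard 0x40 */
uint32_t rounded = (zSig + 0x40) >> 7; /* 0x00800001 */
rounded &= ~1;                         /* tie to even: 0x00800000 */
/* The surviving integer bit (bit 23) spills into the exponent field when
 * packed, which is why zExp is passed in 1 less than the true exponent. */
#endif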
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}
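
/*----------------------------------------------------------------------------
| Added illustration (editor's sketch) of the field addition in packFloat64():
| packing 1.0 from a complete normalized significand.
*----------------------------------------------------------------------------*/
#if 0
/* zExp is passed as 0x3FE, one less than the true biased exponent 0x3FF,
 * because the significand's integer bit carries into the exponent field: */
uint64_t one = (0x3FEULL << 52) + UINT64_C(0x0010000000000000);
/* one == 0x3FF0000000000000, the float64 encoding of 1.0 */
#endif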
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
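
/*----------------------------------------------------------------------------
| Added note (editor's sketch) on the overflow path above, which returns
| packFloat64(zSign, 0x7FF, -(!overflow_to_inf)). Because packFloat64() adds
| its fields, the two possible third arguments give exactly the right bits:
*----------------------------------------------------------------------------*/
#if 0
uint64_t to_inf = (0x7FFULL << 52) + 0;            /* 0x7FF0000000000000: infinity */
uint64_t to_max = (0x7FFULL << 52) + (uint64_t)-1; /* 0x7FEFFFFFFFFFFFFF: DBL_MAX */
#endif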
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'. The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    shiftCount = clz64(aSig);
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;
}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively. Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
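
/*----------------------------------------------------------------------------
| Added note (editor's sketch): the rounding-precision masks used above, for
| a floatx80 significand with an explicit integer bit at bit 63 of `zSig0'.
| With roundingPrecision == 64, roundMask == 0x7FF discards 11 low bits and
| keeps 64 - 11 == 53 significant bits, matching float64 (52 fraction bits
| plus the integer bit); with roundingPrecision == 32, roundMask covers 40
| bits and keeps 64 - 40 == 24, matching float32.
*----------------------------------------------------------------------------*/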
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac0( float128 a )
{

    return a.high & UINT64_C(0x0000FFFFFFFFFFFF);

}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the quadruple-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*/

static inline int32_t extractFloat128Exp( float128 a )
{

    return ( a.high>>48 ) & 0x7FFF;

}

/*----------------------------------------------------------------------------
| Returns the sign bit of the quadruple-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat128Sign(float128 a)
{
    return a.high >> 63;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'. The normalized exponent is stored at the location
| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result. After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 64 bits of the result. This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float128
packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
{
    float128 z;

    z.low = zSig1;
    z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
    return z;
}
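
/*----------------------------------------------------------------------------
| Added illustration (editor's sketch): packFloat128() relies on the same
| field addition as packFloat64(). Packing 1.0 from a complete normalized
| significand whose integer bit sits at bit 48 of `zSig0':
*----------------------------------------------------------------------------*/
#if 0
uint64_t high = ((uint64_t)0x3FFE << 48) + UINT64_C(0x0001000000000000);
/* high == 0x3FFF000000000000 and low == 0: the float128 encoding of 1.0,
 * with zExp again passed in 1 less than the true biased exponent. */
#endif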
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input. Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent. The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input. This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*/

static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;
    uint64_t zSig2;

    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0) - 15;
    if ( 0 <= shiftCount ) {
        zSig2 = 0;
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    }
    else {
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
    }
    zExp -= shiftCount;
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}


/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int32_to_floatx80(int32_t a, float_status *status)
{
    bool zSign;
    uint32_t absA;
    int8_t shiftCount;
    uint64_t zSig;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    shiftCount = clz32(absA) + 32;
    zSig = absA;
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int64_to_floatx80(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    shiftCount = clz64(absA);
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );

}
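
/*----------------------------------------------------------------------------
| Added illustration (editor's sketch): int32_to_floatx80(1). clz32(1) is 31,
| so shiftCount == 63; the result has exponent 0x403E - 63 == 0x3FFF (the
| bias, i.e. 2^0) and significand 1ULL << 63: an explicit integer bit with a
| zero fraction, which is exactly 1.0 in extended double precision.
*----------------------------------------------------------------------------*/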
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;

    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}
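
/*----------------------------------------------------------------------------
| Added note (editor's sketch): the table above stores 1/n! as float64
| constants; 0x3fe0000000000000 is 0.5 == 1/2! and 0x3fc5555555555555 is
| approximately 0.1667 == 1/3!. For a == 1.0 the loop therefore evaluates
| r = 1 + x/1! + x^2/2! + ... + x^15/15! with x == ln(2), i.e. e^ln(2) == 2,
| in float64 before the final conversion back to float32.
*----------------------------------------------------------------------------*/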
/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}
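
/*----------------------------------------------------------------------------
| Added note (editor's sketch): the loop above extracts one fraction bit of
| log2 per iteration by repeated squaring. For a significand 1 <= m < 2, the
| next bit of log2(m) is 1 exactly when m*m >= 2, in which case the squared
| significand is halved before continuing. For example m == 1.5: since
| 1.5^2 == 2.25 >= 2, the first fraction bit is 1, consistent with
| log2(1.5) == 0.10010101...b (about 0.585).
*----------------------------------------------------------------------------*/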
/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;

    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;
    }

    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode. If `a' is a NaN, the
| largest positive integer is returned. Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    if ( shiftCount <= 0 ) shiftCount = 1;
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero. If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( 0x401E < aExp ) {
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        if (aExp || aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;
    z = aSig;
    if ( aSign ) z = - z;
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode. If `a' is a NaN,
| the largest positive integer is returned. Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        aSigExtra = 0;
    }
    else {
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero. If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
5656 *----------------------------------------------------------------------------*/ 5657 5658 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5659 { 5660 bool aSign; 5661 int32_t aExp; 5662 uint64_t lastBitMask, roundBitsMask; 5663 floatx80 z; 5664 5665 if (floatx80_invalid_encoding(a)) { 5666 float_raise(float_flag_invalid, status); 5667 return floatx80_default_nan(status); 5668 } 5669 aExp = extractFloatx80Exp( a ); 5670 if ( 0x403E <= aExp ) { 5671 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5672 return propagateFloatx80NaN(a, a, status); 5673 } 5674 return a; 5675 } 5676 if ( aExp < 0x3FFF ) { 5677 if ( ( aExp == 0 ) 5678 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5679 return a; 5680 } 5681 float_raise(float_flag_inexact, status); 5682 aSign = extractFloatx80Sign( a ); 5683 switch (status->float_rounding_mode) { 5684 case float_round_nearest_even: 5685 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5686 ) { 5687 return 5688 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5689 } 5690 break; 5691 case float_round_ties_away: 5692 if (aExp == 0x3FFE) { 5693 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5694 } 5695 break; 5696 case float_round_down: 5697 return 5698 aSign ? 5699 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5700 : packFloatx80( 0, 0, 0 ); 5701 case float_round_up: 5702 return 5703 aSign ? packFloatx80( 1, 0, 0 ) 5704 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5705 5706 case float_round_to_zero: 5707 break; 5708 default: 5709 g_assert_not_reached(); 5710 } 5711 return packFloatx80( aSign, 0, 0 ); 5712 } 5713 lastBitMask = 1; 5714 lastBitMask <<= 0x403E - aExp; 5715 roundBitsMask = lastBitMask - 1; 5716 z = a; 5717 switch (status->float_rounding_mode) { 5718 case float_round_nearest_even: 5719 z.low += lastBitMask>>1; 5720 if ((z.low & roundBitsMask) == 0) { 5721 z.low &= ~lastBitMask; 5722 } 5723 break; 5724 case float_round_ties_away: 5725 z.low += lastBitMask >> 1; 5726 break; 5727 case float_round_to_zero: 5728 break; 5729 case float_round_up: 5730 if (!extractFloatx80Sign(z)) { 5731 z.low += roundBitsMask; 5732 } 5733 break; 5734 case float_round_down: 5735 if (extractFloatx80Sign(z)) { 5736 z.low += roundBitsMask; 5737 } 5738 break; 5739 default: 5740 abort(); 5741 } 5742 z.low &= ~ roundBitsMask; 5743 if ( z.low == 0 ) { 5744 ++z.high; 5745 z.low = UINT64_C(0x8000000000000000); 5746 } 5747 if (z.low != a.low) { 5748 float_raise(float_flag_inexact, status); 5749 } 5750 return z; 5751 5752 } 5753 5754 /*---------------------------------------------------------------------------- 5755 | Returns the result of adding the absolute values of the extended double- 5756 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5757 | negated before being returned. `zSign' is ignored if the result is a NaN. 5758 | The addition is performed according to the IEC/IEEE Standard for Binary 5759 | Floating-Point Arithmetic. 
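|
| Added note: this helper computes the signed-magnitude sum |a| + |b|;
| floatx80_add() and floatx80_sub() below dispatch to it whenever the
| effective signs of the two operands agree, e.g.:
|
|     (+x) + (+y)   ->   addFloatx80Sigs(x, y, 0, status)
|     (-x) - (+y)   ->   addFloatx80Sigs(x, y, 1, status)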
5760 *----------------------------------------------------------------------------*/ 5761 5762 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5763 float_status *status) 5764 { 5765 int32_t aExp, bExp, zExp; 5766 uint64_t aSig, bSig, zSig0, zSig1; 5767 int32_t expDiff; 5768 5769 aSig = extractFloatx80Frac( a ); 5770 aExp = extractFloatx80Exp( a ); 5771 bSig = extractFloatx80Frac( b ); 5772 bExp = extractFloatx80Exp( b ); 5773 expDiff = aExp - bExp; 5774 if ( 0 < expDiff ) { 5775 if ( aExp == 0x7FFF ) { 5776 if ((uint64_t)(aSig << 1)) { 5777 return propagateFloatx80NaN(a, b, status); 5778 } 5779 return a; 5780 } 5781 if ( bExp == 0 ) --expDiff; 5782 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5783 zExp = aExp; 5784 } 5785 else if ( expDiff < 0 ) { 5786 if ( bExp == 0x7FFF ) { 5787 if ((uint64_t)(bSig << 1)) { 5788 return propagateFloatx80NaN(a, b, status); 5789 } 5790 return packFloatx80(zSign, 5791 floatx80_infinity_high, 5792 floatx80_infinity_low); 5793 } 5794 if ( aExp == 0 ) ++expDiff; 5795 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5796 zExp = bExp; 5797 } 5798 else { 5799 if ( aExp == 0x7FFF ) { 5800 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5801 return propagateFloatx80NaN(a, b, status); 5802 } 5803 return a; 5804 } 5805 zSig1 = 0; 5806 zSig0 = aSig + bSig; 5807 if ( aExp == 0 ) { 5808 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) { 5809 /* At least one of the values is a pseudo-denormal, 5810 * and there is a carry out of the result. */ 5811 zExp = 1; 5812 goto shiftRight1; 5813 } 5814 if (zSig0 == 0) { 5815 return packFloatx80(zSign, 0, 0); 5816 } 5817 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5818 goto roundAndPack; 5819 } 5820 zExp = aExp; 5821 goto shiftRight1; 5822 } 5823 zSig0 = aSig + bSig; 5824 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5825 shiftRight1: 5826 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5827 zSig0 |= UINT64_C(0x8000000000000000); 5828 ++zExp; 5829 roundAndPack: 5830 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5831 zSign, zExp, zSig0, zSig1, status); 5832 } 5833 5834 /*---------------------------------------------------------------------------- 5835 | Returns the result of subtracting the absolute values of the extended 5836 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5837 | difference is negated before being returned. `zSign' is ignored if the 5838 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5839 | Standard for Binary Floating-Point Arithmetic. 
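|
| Added note: when the operands have equal magnitude, the exact zero result
| takes its sign from the rounding mode, as IEEE 754 requires: minus zero
| under float_round_down, plus zero under every other mode (see the first
| packFloatx80() call below).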
5840 *----------------------------------------------------------------------------*/ 5841 5842 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5843 float_status *status) 5844 { 5845 int32_t aExp, bExp, zExp; 5846 uint64_t aSig, bSig, zSig0, zSig1; 5847 int32_t expDiff; 5848 5849 aSig = extractFloatx80Frac( a ); 5850 aExp = extractFloatx80Exp( a ); 5851 bSig = extractFloatx80Frac( b ); 5852 bExp = extractFloatx80Exp( b ); 5853 expDiff = aExp - bExp; 5854 if ( 0 < expDiff ) goto aExpBigger; 5855 if ( expDiff < 0 ) goto bExpBigger; 5856 if ( aExp == 0x7FFF ) { 5857 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5858 return propagateFloatx80NaN(a, b, status); 5859 } 5860 float_raise(float_flag_invalid, status); 5861 return floatx80_default_nan(status); 5862 } 5863 if ( aExp == 0 ) { 5864 aExp = 1; 5865 bExp = 1; 5866 } 5867 zSig1 = 0; 5868 if ( bSig < aSig ) goto aBigger; 5869 if ( aSig < bSig ) goto bBigger; 5870 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5871 bExpBigger: 5872 if ( bExp == 0x7FFF ) { 5873 if ((uint64_t)(bSig << 1)) { 5874 return propagateFloatx80NaN(a, b, status); 5875 } 5876 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5877 floatx80_infinity_low); 5878 } 5879 if ( aExp == 0 ) ++expDiff; 5880 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5881 bBigger: 5882 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5883 zExp = bExp; 5884 zSign ^= 1; 5885 goto normalizeRoundAndPack; 5886 aExpBigger: 5887 if ( aExp == 0x7FFF ) { 5888 if ((uint64_t)(aSig << 1)) { 5889 return propagateFloatx80NaN(a, b, status); 5890 } 5891 return a; 5892 } 5893 if ( bExp == 0 ) --expDiff; 5894 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5895 aBigger: 5896 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5897 zExp = aExp; 5898 normalizeRoundAndPack: 5899 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5900 zSign, zExp, zSig0, zSig1, status); 5901 } 5902 5903 /*---------------------------------------------------------------------------- 5904 | Returns the result of adding the extended double-precision floating-point 5905 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5906 | Standard for Binary Floating-Point Arithmetic. 5907 *----------------------------------------------------------------------------*/ 5908 5909 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5910 { 5911 bool aSign, bSign; 5912 5913 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5914 float_raise(float_flag_invalid, status); 5915 return floatx80_default_nan(status); 5916 } 5917 aSign = extractFloatx80Sign( a ); 5918 bSign = extractFloatx80Sign( b ); 5919 if ( aSign == bSign ) { 5920 return addFloatx80Sigs(a, b, aSign, status); 5921 } 5922 else { 5923 return subFloatx80Sigs(a, b, aSign, status); 5924 } 5925 5926 } 5927 5928 /*---------------------------------------------------------------------------- 5929 | Returns the result of subtracting the extended double-precision floating- 5930 | point values `a' and `b'. The operation is performed according to the 5931 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
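|
| Added usage sketch (hypothetical operands 'x' and 'y'; 'status' is
| assumed to be a caller-initialized float_status):
|
|     floatx80 sum  = floatx80_add(x, y, &status);
|     floatx80 diff = floatx80_sub(x, y, &status);   /* x - y */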
5932 *----------------------------------------------------------------------------*/ 5933 5934 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5935 { 5936 bool aSign, bSign; 5937 5938 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5939 float_raise(float_flag_invalid, status); 5940 return floatx80_default_nan(status); 5941 } 5942 aSign = extractFloatx80Sign( a ); 5943 bSign = extractFloatx80Sign( b ); 5944 if ( aSign == bSign ) { 5945 return subFloatx80Sigs(a, b, aSign, status); 5946 } 5947 else { 5948 return addFloatx80Sigs(a, b, aSign, status); 5949 } 5950 5951 } 5952 5953 /*---------------------------------------------------------------------------- 5954 | Returns the result of multiplying the extended double-precision floating- 5955 | point values `a' and `b'. The operation is performed according to the 5956 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5957 *----------------------------------------------------------------------------*/ 5958 5959 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5960 { 5961 bool aSign, bSign, zSign; 5962 int32_t aExp, bExp, zExp; 5963 uint64_t aSig, bSig, zSig0, zSig1; 5964 5965 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5966 float_raise(float_flag_invalid, status); 5967 return floatx80_default_nan(status); 5968 } 5969 aSig = extractFloatx80Frac( a ); 5970 aExp = extractFloatx80Exp( a ); 5971 aSign = extractFloatx80Sign( a ); 5972 bSig = extractFloatx80Frac( b ); 5973 bExp = extractFloatx80Exp( b ); 5974 bSign = extractFloatx80Sign( b ); 5975 zSign = aSign ^ bSign; 5976 if ( aExp == 0x7FFF ) { 5977 if ( (uint64_t) ( aSig<<1 ) 5978 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5979 return propagateFloatx80NaN(a, b, status); 5980 } 5981 if ( ( bExp | bSig ) == 0 ) goto invalid; 5982 return packFloatx80(zSign, floatx80_infinity_high, 5983 floatx80_infinity_low); 5984 } 5985 if ( bExp == 0x7FFF ) { 5986 if ((uint64_t)(bSig << 1)) { 5987 return propagateFloatx80NaN(a, b, status); 5988 } 5989 if ( ( aExp | aSig ) == 0 ) { 5990 invalid: 5991 float_raise(float_flag_invalid, status); 5992 return floatx80_default_nan(status); 5993 } 5994 return packFloatx80(zSign, floatx80_infinity_high, 5995 floatx80_infinity_low); 5996 } 5997 if ( aExp == 0 ) { 5998 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5999 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6000 } 6001 if ( bExp == 0 ) { 6002 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6003 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6004 } 6005 zExp = aExp + bExp - 0x3FFE; 6006 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6007 if ( 0 < (int64_t) zSig0 ) { 6008 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6009 --zExp; 6010 } 6011 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6012 zSign, zExp, zSig0, zSig1, status); 6013 } 6014 6015 /*---------------------------------------------------------------------------- 6016 | Returns the result of dividing the extended double-precision floating-point 6017 | value `a' by the corresponding value `b'. The operation is performed 6018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
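|
| Added note: 0/0 and inf/inf raise float_flag_invalid and return the
| default NaN; a finite nonzero value divided by zero raises
| float_flag_divbyzero and returns a correctly signed infinity (see the
| bExp == 0 branch below).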
6019 *----------------------------------------------------------------------------*/ 6020 6021 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6022 { 6023 bool aSign, bSign, zSign; 6024 int32_t aExp, bExp, zExp; 6025 uint64_t aSig, bSig, zSig0, zSig1; 6026 uint64_t rem0, rem1, rem2, term0, term1, term2; 6027 6028 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6029 float_raise(float_flag_invalid, status); 6030 return floatx80_default_nan(status); 6031 } 6032 aSig = extractFloatx80Frac( a ); 6033 aExp = extractFloatx80Exp( a ); 6034 aSign = extractFloatx80Sign( a ); 6035 bSig = extractFloatx80Frac( b ); 6036 bExp = extractFloatx80Exp( b ); 6037 bSign = extractFloatx80Sign( b ); 6038 zSign = aSign ^ bSign; 6039 if ( aExp == 0x7FFF ) { 6040 if ((uint64_t)(aSig << 1)) { 6041 return propagateFloatx80NaN(a, b, status); 6042 } 6043 if ( bExp == 0x7FFF ) { 6044 if ((uint64_t)(bSig << 1)) { 6045 return propagateFloatx80NaN(a, b, status); 6046 } 6047 goto invalid; 6048 } 6049 return packFloatx80(zSign, floatx80_infinity_high, 6050 floatx80_infinity_low); 6051 } 6052 if ( bExp == 0x7FFF ) { 6053 if ((uint64_t)(bSig << 1)) { 6054 return propagateFloatx80NaN(a, b, status); 6055 } 6056 return packFloatx80( zSign, 0, 0 ); 6057 } 6058 if ( bExp == 0 ) { 6059 if ( bSig == 0 ) { 6060 if ( ( aExp | aSig ) == 0 ) { 6061 invalid: 6062 float_raise(float_flag_invalid, status); 6063 return floatx80_default_nan(status); 6064 } 6065 float_raise(float_flag_divbyzero, status); 6066 return packFloatx80(zSign, floatx80_infinity_high, 6067 floatx80_infinity_low); 6068 } 6069 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6070 } 6071 if ( aExp == 0 ) { 6072 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6073 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6074 } 6075 zExp = aExp - bExp + 0x3FFE; 6076 rem1 = 0; 6077 if ( bSig <= aSig ) { 6078 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6079 ++zExp; 6080 } 6081 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 6082 mul64To128( bSig, zSig0, &term0, &term1 ); 6083 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6084 while ( (int64_t) rem0 < 0 ) { 6085 --zSig0; 6086 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6087 } 6088 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6089 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6090 mul64To128( bSig, zSig1, &term1, &term2 ); 6091 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6092 while ( (int64_t) rem1 < 0 ) { 6093 --zSig1; 6094 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6095 } 6096 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6097 } 6098 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6099 zSign, zExp, zSig0, zSig1, status); 6100 } 6101 6102 /*---------------------------------------------------------------------------- 6103 | Returns the remainder of the extended double-precision floating-point value 6104 | `a' with respect to the corresponding value `b'. The operation is performed 6105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic, 6106 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating 6107 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of 6108 | the absolute value of the integer quotient. 
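|
| Added usage sketch (mirrors the floatx80_rem()/floatx80_mod() wrappers
| defined below; 'status' is a caller-initialized float_status):
|
|     uint64_t q;
|     floatx80 r = floatx80_modrem(a, b, false, &q, &status); /* IEEE rem */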
6109 *----------------------------------------------------------------------------*/ 6110 6111 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient, 6112 float_status *status) 6113 { 6114 bool aSign, zSign; 6115 int32_t aExp, bExp, expDiff, aExpOrig; 6116 uint64_t aSig0, aSig1, bSig; 6117 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6118 6119 *quotient = 0; 6120 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6121 float_raise(float_flag_invalid, status); 6122 return floatx80_default_nan(status); 6123 } 6124 aSig0 = extractFloatx80Frac( a ); 6125 aExpOrig = aExp = extractFloatx80Exp( a ); 6126 aSign = extractFloatx80Sign( a ); 6127 bSig = extractFloatx80Frac( b ); 6128 bExp = extractFloatx80Exp( b ); 6129 if ( aExp == 0x7FFF ) { 6130 if ( (uint64_t) ( aSig0<<1 ) 6131 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6132 return propagateFloatx80NaN(a, b, status); 6133 } 6134 goto invalid; 6135 } 6136 if ( bExp == 0x7FFF ) { 6137 if ((uint64_t)(bSig << 1)) { 6138 return propagateFloatx80NaN(a, b, status); 6139 } 6140 if (aExp == 0 && aSig0 >> 63) { 6141 /* 6142 * Pseudo-denormal argument must be returned in normalized 6143 * form. 6144 */ 6145 return packFloatx80(aSign, 1, aSig0); 6146 } 6147 return a; 6148 } 6149 if ( bExp == 0 ) { 6150 if ( bSig == 0 ) { 6151 invalid: 6152 float_raise(float_flag_invalid, status); 6153 return floatx80_default_nan(status); 6154 } 6155 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6156 } 6157 if ( aExp == 0 ) { 6158 if ( aSig0 == 0 ) return a; 6159 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6160 } 6161 zSign = aSign; 6162 expDiff = aExp - bExp; 6163 aSig1 = 0; 6164 if ( expDiff < 0 ) { 6165 if ( mod || expDiff < -1 ) { 6166 if (aExp == 1 && aExpOrig == 0) { 6167 /* 6168 * Pseudo-denormal argument must be returned in 6169 * normalized form. 6170 */ 6171 return packFloatx80(aSign, aExp, aSig0); 6172 } 6173 return a; 6174 } 6175 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6176 expDiff = 0; 6177 } 6178 *quotient = q = ( bSig <= aSig0 ); 6179 if ( q ) aSig0 -= bSig; 6180 expDiff -= 64; 6181 while ( 0 < expDiff ) { 6182 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6183 q = ( 2 < q ) ? q - 2 : 0; 6184 mul64To128( bSig, q, &term0, &term1 ); 6185 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6186 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6187 expDiff -= 62; 6188 *quotient <<= 62; 6189 *quotient += q; 6190 } 6191 expDiff += 64; 6192 if ( 0 < expDiff ) { 6193 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6194 q = ( 2 < q ) ? q - 2 : 0; 6195 q >>= 64 - expDiff; 6196 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6197 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6198 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6199 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6200 ++q; 6201 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6202 } 6203 if (expDiff < 64) { 6204 *quotient <<= expDiff; 6205 } else { 6206 *quotient = 0; 6207 } 6208 *quotient += q; 6209 } 6210 else { 6211 term1 = 0; 6212 term0 = bSig; 6213 } 6214 if (!mod) { 6215 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6216 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6217 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6218 && ( q & 1 ) ) 6219 ) { 6220 aSig0 = alternateASig0; 6221 aSig1 = alternateASig1; 6222 zSign = ! 
zSign; 6223 ++*quotient; 6224 } 6225 } 6226 return 6227 normalizeRoundAndPackFloatx80( 6228 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6229 6230 } 6231 6232 /*---------------------------------------------------------------------------- 6233 | Returns the remainder of the extended double-precision floating-point value 6234 | `a' with respect to the corresponding value `b'. The operation is performed 6235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6236 *----------------------------------------------------------------------------*/ 6237 6238 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6239 { 6240 uint64_t quotient; 6241 return floatx80_modrem(a, b, false, &quotient, status); 6242 } 6243 6244 /*---------------------------------------------------------------------------- 6245 | Returns the remainder of the extended double-precision floating-point value 6246 | `a' with respect to the corresponding value `b', with the quotient truncated 6247 | toward zero. 6248 *----------------------------------------------------------------------------*/ 6249 6250 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6251 { 6252 uint64_t quotient; 6253 return floatx80_modrem(a, b, true, &quotient, status); 6254 } 6255 6256 /*---------------------------------------------------------------------------- 6257 | Returns the square root of the extended double-precision floating-point 6258 | value `a'. The operation is performed according to the IEC/IEEE Standard 6259 | for Binary Floating-Point Arithmetic. 6260 *----------------------------------------------------------------------------*/ 6261 6262 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6263 { 6264 bool aSign; 6265 int32_t aExp, zExp; 6266 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6267 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6268 6269 if (floatx80_invalid_encoding(a)) { 6270 float_raise(float_flag_invalid, status); 6271 return floatx80_default_nan(status); 6272 } 6273 aSig0 = extractFloatx80Frac( a ); 6274 aExp = extractFloatx80Exp( a ); 6275 aSign = extractFloatx80Sign( a ); 6276 if ( aExp == 0x7FFF ) { 6277 if ((uint64_t)(aSig0 << 1)) { 6278 return propagateFloatx80NaN(a, a, status); 6279 } 6280 if ( ! 
aSign ) return a; 6281 goto invalid; 6282 } 6283 if ( aSign ) { 6284 if ( ( aExp | aSig0 ) == 0 ) return a; 6285 invalid: 6286 float_raise(float_flag_invalid, status); 6287 return floatx80_default_nan(status); 6288 } 6289 if ( aExp == 0 ) { 6290 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6291 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6292 } 6293 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6294 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6295 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6296 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6297 doubleZSig0 = zSig0<<1; 6298 mul64To128( zSig0, zSig0, &term0, &term1 ); 6299 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6300 while ( (int64_t) rem0 < 0 ) { 6301 --zSig0; 6302 doubleZSig0 -= 2; 6303 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6304 } 6305 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6306 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6307 if ( zSig1 == 0 ) zSig1 = 1; 6308 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6309 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6310 mul64To128( zSig1, zSig1, &term2, &term3 ); 6311 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6312 while ( (int64_t) rem1 < 0 ) { 6313 --zSig1; 6314 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6315 term3 |= 1; 6316 term2 |= doubleZSig0; 6317 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6318 } 6319 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6320 } 6321 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6322 zSig0 |= doubleZSig0; 6323 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6324 0, zExp, zSig0, zSig1, status); 6325 } 6326 6327 /*---------------------------------------------------------------------------- 6328 | Returns the result of converting the quadruple-precision floating-point 6329 | value `a' to the extended double-precision floating-point format. The 6330 | conversion is performed according to the IEC/IEEE Standard for Binary 6331 | Floating-Point Arithmetic. 6332 *----------------------------------------------------------------------------*/ 6333 6334 floatx80 float128_to_floatx80(float128 a, float_status *status) 6335 { 6336 bool aSign; 6337 int32_t aExp; 6338 uint64_t aSig0, aSig1; 6339 6340 aSig1 = extractFloat128Frac1( a ); 6341 aSig0 = extractFloat128Frac0( a ); 6342 aExp = extractFloat128Exp( a ); 6343 aSign = extractFloat128Sign( a ); 6344 if ( aExp == 0x7FFF ) { 6345 if ( aSig0 | aSig1 ) { 6346 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6347 status); 6348 return floatx80_silence_nan(res, status); 6349 } 6350 return packFloatx80(aSign, floatx80_infinity_high, 6351 floatx80_infinity_low); 6352 } 6353 if ( aExp == 0 ) { 6354 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6355 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6356 } 6357 else { 6358 aSig0 |= UINT64_C(0x0001000000000000); 6359 } 6360 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6361 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6362 6363 } 6364 6365 /*---------------------------------------------------------------------------- 6366 | Returns the remainder of the quadruple-precision floating-point value `a' 6367 | with respect to the corresponding value `b'. The operation is performed 6368 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
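|
| Added note: as in floatx80_modrem() above, the quotient is built up a
| chunk at a time with estimateDiv128To64() while the remainder is refined
| by multiply-and-subtract; the final loop then picks whichever of the two
| candidate remainders is nearer, resolving ties toward an even quotient.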
6369 *----------------------------------------------------------------------------*/ 6370 6371 float128 float128_rem(float128 a, float128 b, float_status *status) 6372 { 6373 bool aSign, zSign; 6374 int32_t aExp, bExp, expDiff; 6375 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6376 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6377 int64_t sigMean0; 6378 6379 aSig1 = extractFloat128Frac1( a ); 6380 aSig0 = extractFloat128Frac0( a ); 6381 aExp = extractFloat128Exp( a ); 6382 aSign = extractFloat128Sign( a ); 6383 bSig1 = extractFloat128Frac1( b ); 6384 bSig0 = extractFloat128Frac0( b ); 6385 bExp = extractFloat128Exp( b ); 6386 if ( aExp == 0x7FFF ) { 6387 if ( ( aSig0 | aSig1 ) 6388 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6389 return propagateFloat128NaN(a, b, status); 6390 } 6391 goto invalid; 6392 } 6393 if ( bExp == 0x7FFF ) { 6394 if (bSig0 | bSig1) { 6395 return propagateFloat128NaN(a, b, status); 6396 } 6397 return a; 6398 } 6399 if ( bExp == 0 ) { 6400 if ( ( bSig0 | bSig1 ) == 0 ) { 6401 invalid: 6402 float_raise(float_flag_invalid, status); 6403 return float128_default_nan(status); 6404 } 6405 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6406 } 6407 if ( aExp == 0 ) { 6408 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6409 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6410 } 6411 expDiff = aExp - bExp; 6412 if ( expDiff < -1 ) return a; 6413 shortShift128Left( 6414 aSig0 | UINT64_C(0x0001000000000000), 6415 aSig1, 6416 15 - ( expDiff < 0 ), 6417 &aSig0, 6418 &aSig1 6419 ); 6420 shortShift128Left( 6421 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 6422 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6423 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6424 expDiff -= 64; 6425 while ( 0 < expDiff ) { 6426 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6427 q = ( 4 < q ) ? q - 4 : 0; 6428 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6429 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6430 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6431 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6432 expDiff -= 61; 6433 } 6434 if ( -64 < expDiff ) { 6435 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6436 q = ( 4 < q ) ? 
q - 4 : 0; 6437 q >>= - expDiff; 6438 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6439 expDiff += 52; 6440 if ( expDiff < 0 ) { 6441 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6442 } 6443 else { 6444 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6445 } 6446 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6447 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6448 } 6449 else { 6450 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6451 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6452 } 6453 do { 6454 alternateASig0 = aSig0; 6455 alternateASig1 = aSig1; 6456 ++q; 6457 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6458 } while ( 0 <= (int64_t) aSig0 ); 6459 add128( 6460 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6461 if ( ( sigMean0 < 0 ) 6462 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6463 aSig0 = alternateASig0; 6464 aSig1 = alternateASig1; 6465 } 6466 zSign = ( (int64_t) aSig0 < 0 ); 6467 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6468 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6469 status); 6470 } 6471 6472 /*---------------------------------------------------------------------------- 6473 | Returns the square root of the quadruple-precision floating-point value `a'. 6474 | The operation is performed according to the IEC/IEEE Standard for Binary 6475 | Floating-Point Arithmetic. 6476 *----------------------------------------------------------------------------*/ 6477 6478 float128 float128_sqrt(float128 a, float_status *status) 6479 { 6480 bool aSign; 6481 int32_t aExp, zExp; 6482 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6483 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6484 6485 aSig1 = extractFloat128Frac1( a ); 6486 aSig0 = extractFloat128Frac0( a ); 6487 aExp = extractFloat128Exp( a ); 6488 aSign = extractFloat128Sign( a ); 6489 if ( aExp == 0x7FFF ) { 6490 if (aSig0 | aSig1) { 6491 return propagateFloat128NaN(a, a, status); 6492 } 6493 if ( ! 
aSign ) return a; 6494 goto invalid; 6495 } 6496 if ( aSign ) { 6497 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6498 invalid: 6499 float_raise(float_flag_invalid, status); 6500 return float128_default_nan(status); 6501 } 6502 if ( aExp == 0 ) { 6503 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6504 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6505 } 6506 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6507 aSig0 |= UINT64_C(0x0001000000000000); 6508 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6509 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6510 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6511 doubleZSig0 = zSig0<<1; 6512 mul64To128( zSig0, zSig0, &term0, &term1 ); 6513 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6514 while ( (int64_t) rem0 < 0 ) { 6515 --zSig0; 6516 doubleZSig0 -= 2; 6517 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6518 } 6519 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6520 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6521 if ( zSig1 == 0 ) zSig1 = 1; 6522 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6523 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6524 mul64To128( zSig1, zSig1, &term2, &term3 ); 6525 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6526 while ( (int64_t) rem1 < 0 ) { 6527 --zSig1; 6528 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6529 term3 |= 1; 6530 term2 |= doubleZSig0; 6531 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6532 } 6533 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6534 } 6535 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6536 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6537 6538 } 6539 6540 static inline FloatRelation 6541 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 6542 float_status *status) 6543 { 6544 bool aSign, bSign; 6545 6546 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6547 float_raise(float_flag_invalid, status); 6548 return float_relation_unordered; 6549 } 6550 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6551 ( extractFloatx80Frac( a )<<1 ) ) || 6552 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6553 ( extractFloatx80Frac( b )<<1 ) )) { 6554 if (!is_quiet || 6555 floatx80_is_signaling_nan(a, status) || 6556 floatx80_is_signaling_nan(b, status)) { 6557 float_raise(float_flag_invalid, status); 6558 } 6559 return float_relation_unordered; 6560 } 6561 aSign = extractFloatx80Sign( a ); 6562 bSign = extractFloatx80Sign( b ); 6563 if ( aSign != bSign ) { 6564 6565 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6566 ( ( a.low | b.low ) == 0 ) ) { 6567 /* zero case */ 6568 return float_relation_equal; 6569 } else { 6570 return 1 - (2 * aSign); 6571 } 6572 } else { 6573 /* Normalize pseudo-denormals before comparison. 
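 * A pseudo-denormal here encodes a zero exponent field with the
         * explicit integer bit (bit 63) set; bumping the exponent field
         * to 1 produces the same value's canonical encoding, so the
         * bit-for-bit equality test below remains valid (added
         * explanatory note).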
*/ 6574 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 6575 ++a.high; 6576 } 6577 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 6578 ++b.high; 6579 } 6580 if (a.low == b.low && a.high == b.high) { 6581 return float_relation_equal; 6582 } else { 6583 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6584 } 6585 } 6586 } 6587 6588 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6589 { 6590 return floatx80_compare_internal(a, b, 0, status); 6591 } 6592 6593 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 6594 float_status *status) 6595 { 6596 return floatx80_compare_internal(a, b, 1, status); 6597 } 6598 6599 static inline FloatRelation 6600 float128_compare_internal(float128 a, float128 b, bool is_quiet, 6601 float_status *status) 6602 { 6603 bool aSign, bSign; 6604 6605 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6606 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6607 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6608 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6609 if (!is_quiet || 6610 float128_is_signaling_nan(a, status) || 6611 float128_is_signaling_nan(b, status)) { 6612 float_raise(float_flag_invalid, status); 6613 } 6614 return float_relation_unordered; 6615 } 6616 aSign = extractFloat128Sign( a ); 6617 bSign = extractFloat128Sign( b ); 6618 if ( aSign != bSign ) { 6619 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6620 /* zero case */ 6621 return float_relation_equal; 6622 } else { 6623 return 1 - (2 * aSign); 6624 } 6625 } else { 6626 if (a.low == b.low && a.high == b.high) { 6627 return float_relation_equal; 6628 } else { 6629 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6630 } 6631 } 6632 } 6633 6634 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 6635 { 6636 return float128_compare_internal(a, b, 0, status); 6637 } 6638 6639 FloatRelation float128_compare_quiet(float128 a, float128 b, 6640 float_status *status) 6641 { 6642 return float128_compare_internal(a, b, 1, status); 6643 } 6644 6645 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 6646 { 6647 bool aSign; 6648 int32_t aExp; 6649 uint64_t aSig; 6650 6651 if (floatx80_invalid_encoding(a)) { 6652 float_raise(float_flag_invalid, status); 6653 return floatx80_default_nan(status); 6654 } 6655 aSig = extractFloatx80Frac( a ); 6656 aExp = extractFloatx80Exp( a ); 6657 aSign = extractFloatx80Sign( a ); 6658 6659 if ( aExp == 0x7FFF ) { 6660 if ( aSig<<1 ) { 6661 return propagateFloatx80NaN(a, a, status); 6662 } 6663 return a; 6664 } 6665 6666 if (aExp == 0) { 6667 if (aSig == 0) { 6668 return a; 6669 } 6670 aExp++; 6671 } 6672 6673 if (n > 0x10000) { 6674 n = 0x10000; 6675 } else if (n < -0x10000) { 6676 n = -0x10000; 6677 } 6678 6679 aExp += n; 6680 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6681 aSign, aExp, aSig, 0, status); 6682 } 6683 6684 float128 float128_scalbn(float128 a, int n, float_status *status) 6685 { 6686 bool aSign; 6687 int32_t aExp; 6688 uint64_t aSig0, aSig1; 6689 6690 aSig1 = extractFloat128Frac1( a ); 6691 aSig0 = extractFloat128Frac0( a ); 6692 aExp = extractFloat128Exp( a ); 6693 aSign = extractFloat128Sign( a ); 6694 if ( aExp == 0x7FFF ) { 6695 if ( aSig0 | aSig1 ) { 6696 return propagateFloat128NaN(a, a, status); 6697 } 6698 return a; 6699 } 6700 if (aExp != 0) { 6701 aSig0 |= UINT64_C(0x0001000000000000); 6702 } else if (aSig0 == 0 && aSig1 == 0) { 
6703 return a; 6704 } else { 6705 aExp++; 6706 } 6707 6708 if (n > 0x10000) { 6709 n = 0x10000; 6710 } else if (n < -0x10000) { 6711 n = -0x10000; 6712 } 6713 6714 aExp += n - 1; 6715 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 6716 , status); 6717 6718 } 6719 6720 static void __attribute__((constructor)) softfloat_init(void) 6721 { 6722 union_float64 ua, ub, uc, ur; 6723 6724 if (QEMU_NO_HARDFLOAT) { 6725 return; 6726 } 6727 /* 6728 * Test that the host's FMA is not obviously broken. For example, 6729 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 6730 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 6731 */ 6732 ua.s = 0x0020000000000001ULL; 6733 ub.s = 0x3ca0000000000000ULL; 6734 uc.s = 0x0020000000000000ULL; 6735 ur.h = fma(ua.h, ub.h, uc.h); 6736 if (ur.s != 0x0020000000000001ULL) { 6737 force_soft_fma = true; 6738 } 6739 } 6740
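/*
 * Added note on the constructor above: the probe asks the host for
 * fma((1 + 2^-52) * 2^-1021, 2^-53, 2^-1021).  The exact result,
 * 2^-1021 + 2^-1074 + 2^-1126, lies just above the halfway point between
 * two adjacent doubles and so must round (to nearest-even) up to
 * 0x0020000000000001.  An fma() that loses the low-order sticky bit sees
 * an exact tie instead and rounds to even, i.e. down to
 * 0x0020000000000000; on such hosts we set force_soft_fma and emulate
 * FMA in soft-fp.
 */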