1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Flush a denormal input to a correctly-signed zero and raise the
 * input-denormal flag.  Does NOT consult s->flush_inputs_to_zero;
 * callers below wrap it with that check.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* As above, for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* As above, for three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * True if the hardfloat fast path may be used under status @s: hardfloat
 * is usable only when the inexact flag is already set (so it need not be
 * recomputed) and rounding is to nearest-even.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
 * float32_is_zero_or_normal) for most condition checks, or native ones (e.g.
 * fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* Alias of the softfloat (s) and host (h) views of a 32-bit float. */
typedef union {
    float32 s;
    float h;
} union_float32;

/* Alias of the softfloat (s) and host (h) views of a 64-bit float. */
typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates evaluated before (pre) and after (post) the host FP op. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat and host implementations of a 2-input operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input float32 operation: compute with the host FPU when the
 * status (can_use_fpu) and operands (@pre) allow it, otherwise fall back
 * to the softfloat implementation @soft.  After the host op, an infinite
 * result from zero-or-normal operands means overflow; a tiny result is
 * re-done in softfloat when @post says exception detection could differ.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* result is at or below the normal range; redo in softfloat */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* As float32_gen2, for float64. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        /* result is at or below the normal range; redo in softfloat */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Bit mask selecting a single FloatClass, for set-membership tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                   \
    .exp_size       = E,                                     \
    .exp_bias       = ((1 << E) - 1) >> 1,                   \
    .exp_max        = (1 << E) - 1,                          \
    .frac_size      = F,                                     \
    .frac_shift     = (-F - 1) & 63,                         \
    .frac_lsb       = 1ull << ((-F - 1) & 63),               \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),               \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,         \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize.  */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

/* As unpack_raw64, for the two 64-bit words of a float128. */
static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    /* the fraction's low 64 bits live entirely in f.low */
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.
*/ 662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt) 663 { 664 const int f_size = fmt->frac_size; 665 const int e_size = fmt->exp_size; 666 uint64_t ret; 667 668 ret = (uint64_t)p->sign << (f_size + e_size); 669 ret = deposit64(ret, f_size, e_size, p->exp); 670 ret = deposit64(ret, 0, f_size, p->frac); 671 return ret; 672 } 673 674 static inline float16 float16_pack_raw(const FloatParts64 *p) 675 { 676 return make_float16(pack_raw64(p, &float16_params)); 677 } 678 679 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p) 680 { 681 return pack_raw64(p, &bfloat16_params); 682 } 683 684 static inline float32 float32_pack_raw(const FloatParts64 *p) 685 { 686 return make_float32(pack_raw64(p, &float32_params)); 687 } 688 689 static inline float64 float64_pack_raw(const FloatParts64 *p) 690 { 691 return make_float64(pack_raw64(p, &float64_params)); 692 } 693 694 static float128 float128_pack_raw(const FloatParts128 *p) 695 { 696 const int f_size = float128_params.frac_size - 64; 697 const int e_size = float128_params.exp_size; 698 uint64_t hi; 699 700 hi = (uint64_t)p->sign << (f_size + e_size); 701 hi = deposit64(hi, f_size, e_size, p->exp); 702 hi = deposit64(hi, 0, f_size, p->frac_hi); 703 return make_float128(hi, p->frac_lo); 704 } 705 706 /*---------------------------------------------------------------------------- 707 | Functions and definitions to determine: (1) whether tininess for underflow 708 | is detected before or after rounding by default, (2) what (if anything) 709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 711 | are propagated from function inputs to output. These details are target- 712 | specific. 
 *----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/* Dispatch on the parts pointer type: FloatParts128* vs FloatParts64*. */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

/* As above, additionally dispatching FloatParts256*. */
#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)

static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)

static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_div(A, B, S) \
    PARTS_GENERIC_64_128(div, A)(A, B, S)

/* Round the fraction to an integral value; returns true on inexact. */
static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,
                                        int scale, int frac_size);
static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,
                                         int scale, int frac_size);

#define parts_round_to_int_normal(A, R, C, F) \
    PARTS_GENERIC_64_128(round_to_int_normal, A)(A, R, C, F)

static void parts64_round_to_int(FloatParts64 *a, FloatRoundMode rm,
                                 int scale, float_status *s,
                                 const FloatFmt *fmt);
static void parts128_round_to_int(FloatParts128 *a, FloatRoundMode r,
                                  int scale, float_status *s,
                                  const FloatFmt *fmt);

#define parts_round_to_int(A, R, C, S, F) \
    PARTS_GENERIC_64_128(round_to_int, A)(A, R, C, S, F)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 *
 * These operate only on the fraction words of a FloatParts{64,128,256},
 * treating them as a single 64/128/256-bit unsigned integer
 * (frac_hi is the most significant word, frac_lo the least).
 * As above, the frac_* macros dispatch on the pointer type.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* R = A + B; returns the carry out of the top word. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    /* Propagate the carry from the low word into the high word. */
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

/* R = A + C for a 64-bit immediate C; returns the carry out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    /* The low-word carry (0 or 1) is re-used as the high-word addend. */
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

/* Set every fraction bit. */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

/* Unsigned comparison of fractions: -1, 0, or 1 for <, ==, >. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

/*
 * A.frac /= B.frac, for fractions with the msb set (normalized).
 * The quotient's lsb is "jammed" (or'd with 1) if there is any
 * remainder, so that rounding later sees the result as inexact.
 * Returns true if the result had to be shifted left by one bit,
 * i.e. the caller must decrement the exponent.
 */
static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
{
    uint64_t n1, n0, r, q;
    bool ret;

    /*
     * We want a 2*N / N-bit division to produce exactly an N-bit
     * result, so that we do not lose any precision and so that we
     * do not have to renormalize afterward.  If A.frac < B.frac,
     * then division would produce an (N-1)-bit result; shift A left
     * by one to produce the an N-bit result, and return true to
     * decrement the exponent to match.
     *
     * The udiv_qrnnd algorithm that we're using requires normalization,
     * i.e. the msb of the denominator must be set, which is already true.
     */
    ret = a->frac < b->frac;
    if (ret) {
        n0 = a->frac;
        n1 = 0;
    } else {
        n0 = a->frac >> 1;
        n1 = a->frac << 63;
    }
    q = udiv_qrnnd(&r, n0, n1, b->frac);

    /* Set lsb if there is a remainder, to set inexact. */
    a->frac = q | (r != 0);

    return ret;
}

static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t q0, q1, a0, a1, b0, b1;
    uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
    bool ret = false;

    a0 = a->frac_hi, a1 = a->frac_lo;
    b0 = b->frac_hi, b1 = b->frac_lo;

    /* As for frac64_div: pre-shift so the quotient is exactly 128 bits. */
    ret = lt128(a0, a1, b0, b1);
    if (!ret) {
        a1 = shr_double(a0, a1, 1);
        a0 = a0 >> 1;
    }

    /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
    q0 = estimateDiv128To64(a0, a1, b0);

    /*
     * Estimate is high because B1 was not included (unless B1 == 0).
     * Reduce quotient and increase remainder until remainder is non-negative.
     * This loop will execute 0 to 2 times.
     */
    mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
    sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
    while (r0 != 0) {
        q0--;
        add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
    }

    /* Repeat using the remainder, producing a second word of quotient. */
    q1 = estimateDiv128To64(r1, r2, b0);
    mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
    sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
    while (r1 != 0) {
        q1--;
        add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
    }

    /* Any remainder indicates inexact; set sticky bit. */
    q1 |= (r2 | r3) != 0;

    a->frac_hi = q0;
    a->frac_lo = q1;
    return ret;
}

#define frac_div(A, B)  FRAC_GENERIC_64_128(div, A)(A, B)

/* True if the fraction is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

/* Widening multiply: R (double width) = A * B. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

/* Two's-complement negate the fraction in place. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128_256(neg, A)(A)

/*
 * Shift the fraction left until its msb is set; returns the shift
 * count applied, or the full width if the fraction was zero.
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl);
        a->frac_lo <<= shl;
        return shl;
    } else if (a->frac_lo) {
        /* High word empty: the low word moves up wholesale. */
        int shl = clz64(a->frac_lo);
        a->frac_hi = a->frac_lo << shl;
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            return 0;
        }
        ret = shl;
    } else {
        /* Shift by whole words first, then by the residual bit count. */
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            goto done;
        }
        ret += shl;
    }

    a0 = shl_double(a0, a1, shl);
    a1 = shl_double(a1, a2, shl);
    a2 = shl_double(a2, a3, shl);
    a3 <<= shl;

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128_256(normalize, A)(A)

/* Logical shift left of the fraction by c bits (0 <= c < width). */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    /* A shift of 64+ moves the low word into the high word. */
    if (c & 64) {
        a0 = a1, a1 = 0;
    }

    c &= 63;
    if (c) {
        a0 = shl_double(a0, a1, c);
        a1 = a1 << c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

/* Logical shift right of the fraction by c bits; shifted-out bits lost. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;

    if (c & 64) {
        a1 = a0, a0 = 0;
    }

    c &= 63;
    if (c) {
        a1 = shr_double(a0, a1, c);
        a0 = a0 >> c;
    }

    a->frac_hi = a0;
    a->frac_lo = a1;
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

/*
 * Shift right with "jamming": any bits shifted out are or'd into the
 * lsb of the result, preserving inexactness for later rounding.
 */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    uint64_t a0 = a->frac;

    if (likely(c != 0)) {
        if (likely(c < 64)) {
            /* shr_double(a0, 0, c) recovers the bits shifted out. */
            a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0);
        } else {
            /* Everything shifted out; result is the sticky bit alone. */
            a0 = a0 != 0;
        }
        a->frac = a0;
    }
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 128)) {
        /* Word-sized shift: the low word is entirely sticky. */
        sticky = a1;
        a1 = a0;
        a0 = 0;
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        /* Whole fraction shifted out. */
        sticky = a0 | a1;
        a0 = a1 = 0;
        goto done;
    }

    sticky |= shr_double(a1, 0, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a1 | (sticky != 0);
    a->frac_hi = a0;
}

static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        /* Handle 128- and 64-bit word moves separately, collecting sticky. */
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    sticky |= shr_double(a3, 0, c);
    a3 = shr_double(a2, a3, c);
    a2 = shr_double(a1, a2, c);
    a1 = shr_double(a0, a1, c);
    a0 = a0 >> c;

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

/* R = A - B; returns the borrow out of the top word. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

/* Narrow to half width, jamming discarded low bits into the lsb. */
static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)

/* Widen to double width, zero-filling the new low bits. */
static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

#define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)

/*
 * Instantiate the size-generic "parts" templates.  N is the fraction
 * width of FloatPartsN, W the width of the double-wide FloatPartsW
 * used for intermediate products.
 */
#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)
#define FloatPartsW    glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  W
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef  N
#undef  W
#undef  partsN
#undef  FloatPartsN
#undef  FloatPartsW

/*
 * Pack/unpack routines with a specific FloatFmt.
 *
 * The *_unpack_canonical helpers unpack the raw bits of one format and
 * canonicalize them into FloatParts; the *_round_pack_canonical helpers
 * round per the status rounding mode and repack into raw bits.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}

/*
 * Addition and subtraction
 */

static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

/* Soft-float fallback for the float32/float64 hardfloat wrappers below. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}

static float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, false);
}

static float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, status, true);
}

/* Hardfloat fast paths: perform the operation on the host FPU. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/*
 * Post-condition for the hardfloat add/sub/mul fast path: reject the
 * host result when both inputs were zero (the sign of a zero result
 * then depends on the rounding mode, which the host may not match).
 */
static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

static bfloat16 QEMU_FLATTEN
bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return bfloat16_round_pack_canonical(pr, status);
}

bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, false);
}

bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    return bfloat16_addsub(a, b, status, true);
}

static float128 QEMU_FLATTEN
float128_addsub(float128 a, float128 b, float_status *status, bool subtract)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float128_round_pack_canonical(pr, status);
}

float128 float128_add(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, false);
}

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    return float128_addsub(a, b, status, true);
}

/*
 * Multiplication
 */

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Fused multiply-add
 */

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float64_round_pack_canonical(pr, status);
}

/* Debug/testing knob: when set, always take the soft-float fma path. */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* The host FPU cannot be used unless its state matches float_status. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* Halving the result is not expressible as a host fma. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path requires all inputs zero-or-normal. */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* The product is a (possibly negated) correctly-signed zero. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result may need underflow handling: redo in soft-float
             * with the original (un-negated) operands. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /*
             * NOTE(review): threshold is FLT_MIN, not DBL_MIN.  This is
             * safe (it only sends more cases to the soft-float fallback,
             * which computes the exact result) but looks over-broad for
             * double underflow detection -- confirm intent upstream.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c,
                                      int flags, float_status *status)
{
    FloatParts128 pa, pb, pc, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    float128_unpack_canonical(&pc, c, status);
    pr = parts_muladd(&pa, &pb, &pc, flags, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Division
 */

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float16_round_pack_canonical(pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float32_round_pack_canonical(pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float64_round_pack_canonical(pr, status);
}

static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * Pre-condition for the hardfloat division fast path: the dividend must
 * be zero or normal and the divisor normal (rules out inf/NaN/subnormal
 * operands and division by zero).
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-condition: accept the host quotient only if it is non-zero. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

bfloat16 QEMU_FLATTEN
bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, *pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return bfloat16_round_pack_canonical(pr, status);
}

float128 QEMU_FLATTEN
float128_div(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_div(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * Usually this only needs to take care of raising invalid exceptions
 * and handling the conversion on NaNs.
 */

/* Convert canonical parts for the ARM alternative-hp format (no NaN/Inf). */
static void parts_float_to_ahp(FloatParts64 *a, float_status *s)
{
    switch (a->cls) {
    case float_class_qnan:
    case float_class_snan:
        /*
         * There is no NaN in the destination format.  Raise Invalid
         * and return a zero with the sign of the input NaN.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_zero;
        break;

    case float_class_inf:
        /*
         * There is no Inf in the destination format.  Raise Invalid
         * and return the maximum normal with the correct sign.
         */
        float_raise(float_flag_invalid, s);
        a->cls = float_class_normal;
        a->exp = float16_params_ahp.exp_max;
        a->frac = MAKE_64BIT_MASK(float16_params_ahp.frac_shift,
                                  float16_params_ahp.frac_size + 1);
        break;

    case float_class_normal:
    case float_class_zero:
        break;

    default:
        g_assert_not_reached();
    }
}

/* Same-width conversion: only NaNs need any work. */
static void parts64_float_to_float(FloatParts64 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

static void parts128_float_to_float(FloatParts128 *a, float_status *s)
{
    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

#define parts_float_to_float(P, S) \
    PARTS_GENERIC_64_128(float_to_float, P)(P, S)

/* 128-bit parts -> 64-bit parts, jamming truncated fraction bits. */
static void parts_float_to_float_narrow(FloatParts64 *a, FloatParts128 *b,
                                        float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;

    if (a->cls == float_class_normal) {
        frac_truncjam(a, b);
    } else if (is_nan(a->cls)) {
        /* Discard the low bits of the NaN. */
        a->frac = b->frac_hi;
        parts_return_nan(a, s);
    }
}

/* 64-bit parts -> 128-bit parts, zero-extending the fraction. */
static void parts_float_to_float_widen(FloatParts128 *a, FloatParts64 *b,
                                       float_status *s)
{
    a->cls = b->cls;
    a->sign = b->sign;
    a->exp = b->exp;
    frac_widen(a, b);

    if (is_nan(a->cls)) {
        parts_return_nan(a, s);
    }
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* !ieee selects the ARM alternative half-precision format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float32_round_pack_canonical(&p, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 p;

    float16a_unpack_canonical(&p, a, s, fmt16);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    FloatParts64 p;
    const FloatFmt *fmt;

    float32_unpack_canonical(&p, a, s);
    if (ieee) {
        parts_float_to_float(&p, s);
        fmt = &float16_params;
    } else {
        parts_float_to_ahp(&p, s);
        fmt = &float16_params_ahp;
    }
    return float16a_round_pack_canonical(&p, s, fmt);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    parts_float_to_float(&p, s);
    return float64_round_pack_canonical(&p, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.
*/ 2192 union_float32 uf; 2193 union_float64 ud; 2194 uf.s = a; 2195 ud.h = uf.h; 2196 return ud.s; 2197 } else if (float32_is_zero(a)) { 2198 return float64_set_sign(float64_zero, float32_is_neg(a)); 2199 } else { 2200 return soft_float32_to_float64(a, s); 2201 } 2202 } 2203 2204 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 2205 { 2206 FloatParts64 p; 2207 const FloatFmt *fmt; 2208 2209 float64_unpack_canonical(&p, a, s); 2210 if (ieee) { 2211 parts_float_to_float(&p, s); 2212 fmt = &float16_params; 2213 } else { 2214 parts_float_to_ahp(&p, s); 2215 fmt = &float16_params_ahp; 2216 } 2217 return float16a_round_pack_canonical(&p, s, fmt); 2218 } 2219 2220 float32 float64_to_float32(float64 a, float_status *s) 2221 { 2222 FloatParts64 p; 2223 2224 float64_unpack_canonical(&p, a, s); 2225 parts_float_to_float(&p, s); 2226 return float32_round_pack_canonical(&p, s); 2227 } 2228 2229 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2230 { 2231 FloatParts64 p; 2232 2233 bfloat16_unpack_canonical(&p, a, s); 2234 parts_float_to_float(&p, s); 2235 return float32_round_pack_canonical(&p, s); 2236 } 2237 2238 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2239 { 2240 FloatParts64 p; 2241 2242 bfloat16_unpack_canonical(&p, a, s); 2243 parts_float_to_float(&p, s); 2244 return float64_round_pack_canonical(&p, s); 2245 } 2246 2247 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2248 { 2249 FloatParts64 p; 2250 2251 float32_unpack_canonical(&p, a, s); 2252 parts_float_to_float(&p, s); 2253 return bfloat16_round_pack_canonical(&p, s); 2254 } 2255 2256 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2257 { 2258 FloatParts64 p; 2259 2260 float64_unpack_canonical(&p, a, s); 2261 parts_float_to_float(&p, s); 2262 return bfloat16_round_pack_canonical(&p, s); 2263 } 2264 2265 float32 float128_to_float32(float128 a, float_status *s) 2266 { 2267 FloatParts64 p64; 2268 FloatParts128 p128; 2269 2270 float128_unpack_canonical(&p128, a, 
s); 2271 parts_float_to_float_narrow(&p64, &p128, s); 2272 return float32_round_pack_canonical(&p64, s); 2273 } 2274 2275 float64 float128_to_float64(float128 a, float_status *s) 2276 { 2277 FloatParts64 p64; 2278 FloatParts128 p128; 2279 2280 float128_unpack_canonical(&p128, a, s); 2281 parts_float_to_float_narrow(&p64, &p128, s); 2282 return float64_round_pack_canonical(&p64, s); 2283 } 2284 2285 float128 float32_to_float128(float32 a, float_status *s) 2286 { 2287 FloatParts64 p64; 2288 FloatParts128 p128; 2289 2290 float32_unpack_canonical(&p64, a, s); 2291 parts_float_to_float_widen(&p128, &p64, s); 2292 return float128_round_pack_canonical(&p128, s); 2293 } 2294 2295 float128 float64_to_float128(float64 a, float_status *s) 2296 { 2297 FloatParts64 p64; 2298 FloatParts128 p128; 2299 2300 float64_unpack_canonical(&p64, a, s); 2301 parts_float_to_float_widen(&p128, &p64, s); 2302 return float128_round_pack_canonical(&p128, s); 2303 } 2304 2305 /* 2306 * Round to integral value 2307 */ 2308 2309 float16 float16_round_to_int(float16 a, float_status *s) 2310 { 2311 FloatParts64 p; 2312 2313 float16_unpack_canonical(&p, a, s); 2314 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float16_params); 2315 return float16_round_pack_canonical(&p, s); 2316 } 2317 2318 float32 float32_round_to_int(float32 a, float_status *s) 2319 { 2320 FloatParts64 p; 2321 2322 float32_unpack_canonical(&p, a, s); 2323 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float32_params); 2324 return float32_round_pack_canonical(&p, s); 2325 } 2326 2327 float64 float64_round_to_int(float64 a, float_status *s) 2328 { 2329 FloatParts64 p; 2330 2331 float64_unpack_canonical(&p, a, s); 2332 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float64_params); 2333 return float64_round_pack_canonical(&p, s); 2334 } 2335 2336 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2337 { 2338 FloatParts64 p; 2339 2340 bfloat16_unpack_canonical(&p, a, s); 2341 parts_round_to_int(&p, 
s->float_rounding_mode, 0, s, &bfloat16_params); 2342 return bfloat16_round_pack_canonical(&p, s); 2343 } 2344 2345 float128 float128_round_to_int(float128 a, float_status *s) 2346 { 2347 FloatParts128 p; 2348 2349 float128_unpack_canonical(&p, a, s); 2350 parts_round_to_int(&p, s->float_rounding_mode, 0, s, &float128_params); 2351 return float128_round_pack_canonical(&p, s); 2352 } 2353 2354 /* 2355 * Returns the result of converting the floating-point value `a' to 2356 * the two's complement integer format. The conversion is performed 2357 * according to the IEC/IEEE Standard for Binary Floating-Point 2358 * Arithmetic---which means in particular that the conversion is 2359 * rounded according to the current rounding mode. If `a' is a NaN, 2360 * the largest positive integer is returned. Otherwise, if the 2361 * conversion overflows, the largest integer with the same sign as `a' 2362 * is returned. 2363 */ 2364 2365 static int64_t round_to_int_and_pack(FloatParts64 p, FloatRoundMode rmode, 2366 int scale, int64_t min, int64_t max, 2367 float_status *s) 2368 { 2369 int flags = 0; 2370 uint64_t r; 2371 2372 switch (p.cls) { 2373 case float_class_snan: 2374 case float_class_qnan: 2375 flags = float_flag_invalid; 2376 r = max; 2377 break; 2378 2379 case float_class_inf: 2380 flags = float_flag_invalid; 2381 r = p.sign ? 
min : max; 2382 break; 2383 2384 case float_class_zero: 2385 return 0; 2386 2387 case float_class_normal: 2388 /* TODO: 62 = N - 2, frac_size for rounding */ 2389 if (parts_round_to_int_normal(&p, rmode, scale, 62)) { 2390 flags = float_flag_inexact; 2391 } 2392 2393 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2394 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2395 } else { 2396 r = UINT64_MAX; 2397 } 2398 if (p.sign) { 2399 if (r <= -(uint64_t)min) { 2400 r = -r; 2401 } else { 2402 flags = float_flag_invalid; 2403 r = min; 2404 } 2405 } else if (r > max) { 2406 flags = float_flag_invalid; 2407 r = max; 2408 } 2409 break; 2410 2411 default: 2412 g_assert_not_reached(); 2413 } 2414 2415 float_raise(flags, s); 2416 return r; 2417 } 2418 2419 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2420 float_status *s) 2421 { 2422 FloatParts64 p; 2423 2424 float16_unpack_canonical(&p, a, s); 2425 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s); 2426 } 2427 2428 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2429 float_status *s) 2430 { 2431 FloatParts64 p; 2432 2433 float16_unpack_canonical(&p, a, s); 2434 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2435 } 2436 2437 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2438 float_status *s) 2439 { 2440 FloatParts64 p; 2441 2442 float16_unpack_canonical(&p, a, s); 2443 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2444 } 2445 2446 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2447 float_status *s) 2448 { 2449 FloatParts64 p; 2450 2451 float16_unpack_canonical(&p, a, s); 2452 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2453 } 2454 2455 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2456 float_status *s) 2457 { 2458 FloatParts64 p; 2459 2460 float32_unpack_canonical(&p, a, s); 2461 return 
round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2462 } 2463 2464 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2465 float_status *s) 2466 { 2467 FloatParts64 p; 2468 2469 float32_unpack_canonical(&p, a, s); 2470 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2471 } 2472 2473 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2474 float_status *s) 2475 { 2476 FloatParts64 p; 2477 2478 float32_unpack_canonical(&p, a, s); 2479 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2480 } 2481 2482 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2483 float_status *s) 2484 { 2485 FloatParts64 p; 2486 2487 float64_unpack_canonical(&p, a, s); 2488 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2489 } 2490 2491 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2492 float_status *s) 2493 { 2494 FloatParts64 p; 2495 2496 float64_unpack_canonical(&p, a, s); 2497 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2498 } 2499 2500 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2501 float_status *s) 2502 { 2503 FloatParts64 p; 2504 2505 float64_unpack_canonical(&p, a, s); 2506 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2507 } 2508 2509 int8_t float16_to_int8(float16 a, float_status *s) 2510 { 2511 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2512 } 2513 2514 int16_t float16_to_int16(float16 a, float_status *s) 2515 { 2516 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2517 } 2518 2519 int32_t float16_to_int32(float16 a, float_status *s) 2520 { 2521 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2522 } 2523 2524 int64_t float16_to_int64(float16 a, float_status *s) 2525 { 2526 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2527 } 2528 2529 
/* float32/float64 to signed integer, rounding mode from float_status. */
int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Variants forcing round-towards-zero (C-style truncation). */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, zero is returned; negative values that round
 * to zero raise the inexact exception flag, while those that do not
 * round to zero raise the invalid exception flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 p, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int flags = 0;
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        flags = float_flag_invalid;
        r = max;
        break;

    case float_class_inf:
        flags = float_flag_invalid;
        r = p.sign ? 0 : max;
        break;

    case float_class_zero:
        return 0;

    case float_class_normal:
        /* TODO: 62 = N - 2, frac_size for rounding */
        if (parts_round_to_int_normal(&p, rmode, scale, 62)) {
            flags = float_flag_inexact;
            /* A negative value may round up to zero: return it with
             * inexact only, not invalid. */
            if (p.cls == float_class_zero) {
                r = 0;
                break;
            }
        }

        if (p.sign) {
            flags = float_flag_invalid;
            r = 0;
        } else if (p.exp > DECOMPOSED_BINARY_POINT) {
            flags = float_flag_invalid;
            r = max;
        } else {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
            if (r > max) {
                flags = float_flag_invalid;
                r = max;
            }
        }
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    return r;
}

/* float16/float32/float64 to unsigned integer, with an explicit
 * rounding mode and a power-of-2 scale applied before conversion. */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* Wrappers using the rounding mode from float_status and no scaling. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Variants forcing round-towards-zero (C-style truncation). */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* Negate in unsigned arithmetic to get the magnitude;
             * this is well-defined even for INT64_MIN. */
            f = -f;
            r.sign = true;
        }
        /* Normalize so the MSB of the fraction is set, and clamp the
         * scale so the exponent sum below cannot overflow. */
        shift = clz64(f);
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp the scale so the exponent sum below cannot overflow,
         * then normalize so the MSB of the fraction is set. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
3323 */ 3324 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin, 3325 bool ieee, bool ismag, float_status *s) 3326 { 3327 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 3328 if (ieee) { 3329 /* Takes two floating-point values `a' and `b', one of 3330 * which is a NaN, and returns the appropriate NaN 3331 * result. If either `a' or `b' is a signaling NaN, 3332 * the invalid exception is raised. 3333 */ 3334 if (is_snan(a.cls) || is_snan(b.cls)) { 3335 return *parts_pick_nan(&a, &b, s); 3336 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 3337 return b; 3338 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 3339 return a; 3340 } 3341 } 3342 return *parts_pick_nan(&a, &b, s); 3343 } else { 3344 int a_exp, b_exp; 3345 3346 switch (a.cls) { 3347 case float_class_normal: 3348 a_exp = a.exp; 3349 break; 3350 case float_class_inf: 3351 a_exp = INT_MAX; 3352 break; 3353 case float_class_zero: 3354 a_exp = INT_MIN; 3355 break; 3356 default: 3357 g_assert_not_reached(); 3358 break; 3359 } 3360 switch (b.cls) { 3361 case float_class_normal: 3362 b_exp = b.exp; 3363 break; 3364 case float_class_inf: 3365 b_exp = INT_MAX; 3366 break; 3367 case float_class_zero: 3368 b_exp = INT_MIN; 3369 break; 3370 default: 3371 g_assert_not_reached(); 3372 break; 3373 } 3374 3375 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 3376 bool a_less = a_exp < b_exp; 3377 if (a_exp == b_exp) { 3378 a_less = a.frac < b.frac; 3379 } 3380 return a_less ^ ismin ? b : a; 3381 } 3382 3383 if (a.sign == b.sign) { 3384 bool a_less = a_exp < b_exp; 3385 if (a_exp == b_exp) { 3386 a_less = a.frac < b.frac; 3387 } 3388 return a.sign ^ a_less ^ ismin ? b : a; 3389 } else { 3390 return a.sign ^ ismin ? 
b : a; 3391 } 3392 } 3393 } 3394 3395 #define MINMAX(sz, name, ismin, isiee, ismag) \ 3396 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 3397 float_status *s) \ 3398 { \ 3399 FloatParts64 pa, pb, pr; \ 3400 float ## sz ## _unpack_canonical(&pa, a, s); \ 3401 float ## sz ## _unpack_canonical(&pb, b, s); \ 3402 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3403 return float ## sz ## _round_pack_canonical(&pr, s); \ 3404 } 3405 3406 MINMAX(16, min, true, false, false) 3407 MINMAX(16, minnum, true, true, false) 3408 MINMAX(16, minnummag, true, true, true) 3409 MINMAX(16, max, false, false, false) 3410 MINMAX(16, maxnum, false, true, false) 3411 MINMAX(16, maxnummag, false, true, true) 3412 3413 MINMAX(32, min, true, false, false) 3414 MINMAX(32, minnum, true, true, false) 3415 MINMAX(32, minnummag, true, true, true) 3416 MINMAX(32, max, false, false, false) 3417 MINMAX(32, maxnum, false, true, false) 3418 MINMAX(32, maxnummag, false, true, true) 3419 3420 MINMAX(64, min, true, false, false) 3421 MINMAX(64, minnum, true, true, false) 3422 MINMAX(64, minnummag, true, true, true) 3423 MINMAX(64, max, false, false, false) 3424 MINMAX(64, maxnum, false, true, false) 3425 MINMAX(64, maxnummag, false, true, true) 3426 3427 #undef MINMAX 3428 3429 #define BF16_MINMAX(name, ismin, isiee, ismag) \ 3430 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \ 3431 { \ 3432 FloatParts64 pa, pb, pr; \ 3433 bfloat16_unpack_canonical(&pa, a, s); \ 3434 bfloat16_unpack_canonical(&pb, b, s); \ 3435 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3436 return bfloat16_round_pack_canonical(&pr, s); \ 3437 } 3438 3439 BF16_MINMAX(min, true, false, false) 3440 BF16_MINMAX(minnum, true, true, false) 3441 BF16_MINMAX(minnummag, true, true, true) 3442 BF16_MINMAX(max, false, false, false) 3443 BF16_MINMAX(maxnum, false, true, false) 3444 BF16_MINMAX(maxnummag, false, true, true) 3445 3446 #undef BF16_MINMAX 3447 3448 /* Floating point compare 
 */
/*
 * Core comparison of two canonicalized values.
 *
 * Returns float_relation_{less,equal,greater,unordered}.  Any NaN
 * operand yields "unordered"; the invalid flag is raised unless this
 * is a quiet comparison and neither operand is a signaling NaN.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        /* Signaling NaNs raise invalid even for quiet comparisons. */
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both finite and nonzero: differing signs decide immediately. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponent then fraction; for negative values
     * the larger magnitude is the smaller value, hence the inversion.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Instantiate the softfloat compare path for a given width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

/* float16 has no hardfloat fast path; go straight to softfloat. */
FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/*
 * Hardfloat fast path for float32 comparison: the host's ISO C
 * comparison macros (isgreater &c.) never raise for quiet NaNs, so
 * the ordered cases can be answered directly; the unordered case
 * falls through to softfloat so the flags are set correctly.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* As f32_compare, for float64. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always use softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
*/ 3631 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3632 { 3633 if (unlikely(is_nan(a.cls))) { 3634 parts_return_nan(&a, s); 3635 } 3636 if (a.cls == float_class_normal) { 3637 /* The largest float type (even though not supported by FloatParts64) 3638 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3639 * still allows rounding to infinity, without allowing overflow 3640 * within the int32_t that backs FloatParts64.exp. 3641 */ 3642 n = MIN(MAX(n, -0x10000), 0x10000); 3643 a.exp += n; 3644 } 3645 return a; 3646 } 3647 3648 float16 float16_scalbn(float16 a, int n, float_status *status) 3649 { 3650 FloatParts64 pa, pr; 3651 3652 float16_unpack_canonical(&pa, a, status); 3653 pr = scalbn_decomposed(pa, n, status); 3654 return float16_round_pack_canonical(&pr, status); 3655 } 3656 3657 float32 float32_scalbn(float32 a, int n, float_status *status) 3658 { 3659 FloatParts64 pa, pr; 3660 3661 float32_unpack_canonical(&pa, a, status); 3662 pr = scalbn_decomposed(pa, n, status); 3663 return float32_round_pack_canonical(&pr, status); 3664 } 3665 3666 float64 float64_scalbn(float64 a, int n, float_status *status) 3667 { 3668 FloatParts64 pa, pr; 3669 3670 float64_unpack_canonical(&pa, a, status); 3671 pr = scalbn_decomposed(pa, n, status); 3672 return float64_round_pack_canonical(&pr, status); 3673 } 3674 3675 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3676 { 3677 FloatParts64 pa, pr; 3678 3679 bfloat16_unpack_canonical(&pa, a, status); 3680 pr = scalbn_decomposed(pa, n, status); 3681 return bfloat16_round_pack_canonical(&pr, status); 3682 } 3683 3684 /* 3685 * Square Root 3686 * 3687 * The old softfloat code did an approximation step before zeroing in 3688 * on the final result. However for simpleness we just compute the 3689 * square root by iterating down from the implicit bit to enough extra 3690 * bits to ensure we get a correctly rounded result. 
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

/*
 * Bit-by-bit square root on a decomposed value.
 *
 * Special cases: NaN propagates (with the usual signaling handling),
 * sqrt(+-0) = +-0, sqrt of a negative raises invalid and returns the
 * default NaN, sqrt(+inf) = +inf.  `p' supplies the target format's
 * frac_shift so we know how many extra precision bits to compute.
 */
static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        /* If the trial subtraction fits, this result bit is 1. */
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32_sqrt below. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64_sqrt below. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

/*
 * Hardfloat fast path: use the host sqrtf() for nonnegative zero or
 * normal inputs; any other class (and any negative) goes to softfloat
 * so flags and NaN handling match the softfloat semantics.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

/* As float32_sqrt, using the host sqrt() for float64. */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* Move the canonical fraction down into the format's field. */
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    /* 128-bit fraction lives in two words; use the helper to shift. */
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3899 *----------------------------------------------------------------------------*/ 3900 3901 float16 float16_silence_nan(float16 a, float_status *status) 3902 { 3903 FloatParts64 p; 3904 3905 float16_unpack_raw(&p, a); 3906 p.frac <<= float16_params.frac_shift; 3907 parts_silence_nan(&p, status); 3908 p.frac >>= float16_params.frac_shift; 3909 return float16_pack_raw(&p); 3910 } 3911 3912 float32 float32_silence_nan(float32 a, float_status *status) 3913 { 3914 FloatParts64 p; 3915 3916 float32_unpack_raw(&p, a); 3917 p.frac <<= float32_params.frac_shift; 3918 parts_silence_nan(&p, status); 3919 p.frac >>= float32_params.frac_shift; 3920 return float32_pack_raw(&p); 3921 } 3922 3923 float64 float64_silence_nan(float64 a, float_status *status) 3924 { 3925 FloatParts64 p; 3926 3927 float64_unpack_raw(&p, a); 3928 p.frac <<= float64_params.frac_shift; 3929 parts_silence_nan(&p, status); 3930 p.frac >>= float64_params.frac_shift; 3931 return float64_pack_raw(&p); 3932 } 3933 3934 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3935 { 3936 FloatParts64 p; 3937 3938 bfloat16_unpack_raw(&p, a); 3939 p.frac <<= bfloat16_params.frac_shift; 3940 parts_silence_nan(&p, status); 3941 p.frac >>= bfloat16_params.frac_shift; 3942 return bfloat16_pack_raw(&p); 3943 } 3944 3945 float128 float128_silence_nan(float128 a, float_status *status) 3946 { 3947 FloatParts128 p; 3948 3949 float128_unpack_raw(&p, a); 3950 frac_shl(&p, float128_params.frac_shift); 3951 parts_silence_nan(&p, status); 3952 frac_shr(&p, float128_params.frac_shift); 3953 return float128_pack_raw(&p); 3954 } 3955 3956 /*---------------------------------------------------------------------------- 3957 | If `a' is denormal and we are in flush-to-zero mode then set the 3958 | input-denormal exception and return zero. Otherwise just return the value. 
3959 *----------------------------------------------------------------------------*/ 3960 3961 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3962 { 3963 if (p.exp == 0 && p.frac != 0) { 3964 float_raise(float_flag_input_denormal, status); 3965 return true; 3966 } 3967 3968 return false; 3969 } 3970 3971 float16 float16_squash_input_denormal(float16 a, float_status *status) 3972 { 3973 if (status->flush_inputs_to_zero) { 3974 FloatParts64 p; 3975 3976 float16_unpack_raw(&p, a); 3977 if (parts_squash_denormal(p, status)) { 3978 return float16_set_sign(float16_zero, p.sign); 3979 } 3980 } 3981 return a; 3982 } 3983 3984 float32 float32_squash_input_denormal(float32 a, float_status *status) 3985 { 3986 if (status->flush_inputs_to_zero) { 3987 FloatParts64 p; 3988 3989 float32_unpack_raw(&p, a); 3990 if (parts_squash_denormal(p, status)) { 3991 return float32_set_sign(float32_zero, p.sign); 3992 } 3993 } 3994 return a; 3995 } 3996 3997 float64 float64_squash_input_denormal(float64 a, float_status *status) 3998 { 3999 if (status->flush_inputs_to_zero) { 4000 FloatParts64 p; 4001 4002 float64_unpack_raw(&p, a); 4003 if (parts_squash_denormal(p, status)) { 4004 return float64_set_sign(float64_zero, p.sign); 4005 } 4006 } 4007 return a; 4008 } 4009 4010 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 4011 { 4012 if (status->flush_inputs_to_zero) { 4013 FloatParts64 p; 4014 4015 bfloat16_unpack_raw(&p, a); 4016 if (parts_squash_denormal(p, status)) { 4017 return bfloat16_set_sign(bfloat16_zero, p.sign); 4018 } 4019 } 4020 return a; 4021 } 4022 4023 /*---------------------------------------------------------------------------- 4024 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 4025 | and 7, and returns the properly rounded 32-bit integer corresponding to the 4026 | input. If `zSign' is 1, the input is negated before being converted to an 4027 | integer. Bit 63 of `absZ' must be zero. 
| Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer.  However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* Select the increment to add below the binary point (bits 0-6). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;   /* half */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the integer part is even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = (absZ + roundIncrement) >> 7;
    /* An exact tie under nearest-even clears the LSB (round to even). */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if (zSign) z = -z;
    /* Overflow: magnitude no longer fits, or sign flipped on negation. */
    if ((absZ >> 32) || (z && ((z < 0) ^ zSign))) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* absZ1 holds the entire fractional part; decide whether to round up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);   /* fraction >= 1/2 */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Carry out of 64 bits: magnitude too large. */
        if (absZ0 == 0) goto overflow;
        /* Exact tie under nearest-even: clear LSB (round to even). */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if (zSign) z = -z;
    /* Sign mismatch after negation means the value overflowed int64_t. */
    if (z && ((z < 0) ^ zSign)) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer.  However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/* NOTE(review): returns the unsigned result through int64_t; callers
 * presumably reinterpret the bits as uint64_t -- confirm at call sites. */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);   /* fraction >= 1/2 */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Carry out of 64 bits: saturate to the maximum unsigned value. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        /* Exact tie under nearest-even: clear LSB (round to even). */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A nonzero negative value cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 32 bits - 24-bit significand: shift the MSB to bit 23. */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig << shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.
| If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* Select the increment to add below the 7 round bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;   /* half */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the result LSB (bit 7) is clear. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* Exponent at or beyond the limits of the format (the unsigned
     * cast also catches negative zExp).
     */
    if (0xFD <= (uint16_t)zExp) {
        if ((0xFD < zExp)
            || ((zExp == 0xFD)
                && ((int32_t)(zSig + roundIncrement) < 0))) {
            /* Round-to-odd, and any mode that would not round this
             * value up, yield the maximal finite value instead of
             * infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if (zExp < 0) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess: detected before rounding, or clearly below the
             * smallest normal, or the rounded value still subnormal.
             */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Shift into subnormal position, jamming lost bits into
             * the sticky bit.
             */
            shift32RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = (zSig + roundIncrement) >> 7;
    /* Exact tie under nearest-even: clear the LSB (round to even). */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if (zSig == 0) zExp = 0;
    return packFloat32(zSign, zExp, zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
4328 *----------------------------------------------------------------------------*/ 4329 4330 static float32 4331 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4332 float_status *status) 4333 { 4334 int8_t shiftCount; 4335 4336 shiftCount = clz32(zSig) - 1; 4337 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4338 status); 4339 4340 } 4341 4342 /*---------------------------------------------------------------------------- 4343 | Normalizes the subnormal double-precision floating-point value represented 4344 | by the denormalized significand `aSig'. The normalized exponent and 4345 | significand are stored at the locations pointed to by `zExpPtr' and 4346 | `zSigPtr', respectively. 4347 *----------------------------------------------------------------------------*/ 4348 4349 static void 4350 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4351 { 4352 int8_t shiftCount; 4353 4354 shiftCount = clz64(aSig) - 11; 4355 *zSigPtr = aSig<<shiftCount; 4356 *zExpPtr = 1 - shiftCount; 4357 4358 } 4359 4360 /*---------------------------------------------------------------------------- 4361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4362 | double-precision floating-point value, returning the result. After being 4363 | shifted into the proper positions, the three fields are simply added 4364 | together to form the result. This means that any integer portion of `zSig' 4365 | will be added into the exponent. Since a properly normalized significand 4366 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4367 | than the desired result exponent whenever `zSig' is a complete, normalized 4368 | significand. 
*----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    /* Fields are ADDED, not OR'd: a carry out of the significand bumps the
     * exponent, which roundAndPackFloat64 relies on when rounding overflows
     * the significand. */
    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* roundIncrement is added to the 10 guard/sticky bits below the LSB. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result LSB would otherwise be even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    if ( 0x7FD <= (uint16_t) zExp ) {
        /* Exponent outside the normal range: overflow or subnormal path. */
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -(!overflow_to_inf) is 0 (infinity) or all-ones (largest
             * finite significand), chosen by the rounding mode. */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: clear the LSB when the discarded bits were exactly
     * halfway (0x200). */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Shift the leading set bit to bit 62 (bit 63 is known clear). */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* The x80 significand has an explicit integer bit at bit 63, so the
     * leading set bit is shifted all the way up. */
    shiftCount = clz64(aSig);
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;
}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision rounding: pick the increment/mask that round the
     * 64-bit zSig0 at the double- or single-precision bit position. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low significand word into zSig0's sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Exponent outside [1, 0x7FFD]: overflow or subnormal path. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may carry into the integer bit: result is normal. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* Carry out of bit 63: renormalize to 1.0 * 2^(zExp+1). */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 80-bit precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                /* These modes never overflow to infinity: return the
                 * largest finite value at this rounding precision. */
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the increment: zSig1 changed when denormalizing. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even when the discarded half was exactly 0.5. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    if ( zSig0 == 0 ) {
        /* High word empty: promote the low word and adjust the exponent. */
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac0( float128 a )
{

    return a.high & UINT64_C(0x0000FFFFFFFFFFFF);

}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the quadruple-precision floating-point value
| `a'.
*----------------------------------------------------------------------------*/

static inline int32_t extractFloat128Exp( float128 a )
{

    return ( a.high>>48 ) & 0x7FFF;

}

/*----------------------------------------------------------------------------
| Returns the sign bit of the quadruple-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat128Sign(float128 a)
{
    return a.high >> 63;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value
| represented by the denormalized significand formed by the concatenation of
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
| significand are stored at the location pointed to by `zSig0Ptr', and the
| least significant 64 bits of the normalized significand are stored at the
| location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* Leading set bit is in the low word.  15 = 63 - 48: target the
         * implicit-bit position of the high fraction word. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* Low word already overflows the high fraction field: split it
             * across both output words. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float128
packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
{
    float128 z;

    z.low = zSig1;
    /* Fields are ADDED so a carry out of zSig0 bumps the exponent. */
    z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
    return z;
}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the round/sticky bits below the 113-bit significand. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round away only when the result LSB would otherwise be even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Exponent outside the normal range: overflow or subnormal path. */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                /* These modes never overflow to infinity: return the
                 * largest finite quadruple-precision value. */
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            /* Denormalize, jamming shifted-out bits into zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the increment: zSig1/zSig2 changed above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even when the discarded half was exactly 0.5
         * (zSig2 == 0x80...0, so zSig2 + zSig2 wraps to 0). */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*/

static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;
    uint64_t zSig2;

    if ( zSig0 == 0 ) {
        /* High word empty: promote the low word and adjust the exponent. */
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* 15 = 63 - 48: target the implicit-bit position of the high word. */
    shiftCount = clz64(zSig0) - 15;
    if ( 0 <= shiftCount ) {
        zSig2 = 0;
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    }
    else {
        /* Too many significand bits: shift right, catching the excess in
         * zSig2 for rounding. */
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
    }
    zExp -= shiftCount;
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}


/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int32_to_floatx80(int32_t a, float_status *status)
{
    bool zSign;
    uint32_t absA;
    int8_t shiftCount;
    uint64_t zSig;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    /* Negation happens on the unsigned type, so INT32_MIN is safe. */
    absA = zSign ? - a : a;
    shiftCount = clz32(absA) + 32;
    zSig = absA;
    /* Exact conversion: 32 bits always fit the 64-bit significand. */
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 int32_to_float128(int32_t a, float_status *status)
{
    bool zSign;
    uint32_t absA;
    int8_t shiftCount;
    uint64_t zSig0;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    /* 17 = 48 - 31: align the leading bit with the implicit-bit slot. */
    shiftCount = clz32(absA) + 17;
    zSig0 = absA;
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int64_to_floatx80(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ?
- a : a;
    shiftCount = clz64(absA);
    /* Exact conversion: the 64-bit value fills the x80 significand. */
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 int64_to_float128(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;
    int32_t zExp;
    uint64_t zSig0, zSig1;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    /* Negation happens on the unsigned type, so INT64_MIN is safe. */
    absA = zSign ? - a : a;
    /* 49 = 112 - 63: distance from bit 63 to the quad implicit-bit slot. */
    shiftCount = clz64(absA) + 49;
    zExp = 0x406E - shiftCount;
    if ( 64 <= shiftCount ) {
        /* Value is small enough to start in the high word. */
        zSig1 = 0;
        zSig0 = absA;
        shiftCount -= 64;
    }
    else {
        zSig1 = absA;
        zSig0 = 0;
    }
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit unsigned integer `a'
| to the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* Delegate normalization of the arbitrary leading-bit position. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* Input NaN: convert and return it as a quiet NaN. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the implicit integer bit explicit (x80 stores it). */
    aSig |= 0x00800000;
    /* 0x3F80 rebiases from float32 (127) to floatx80 (16383); the 24-bit
     * significand is left-aligned in 64 bits (40 = 64 - 24). */
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* NaNs propagate; rem(Inf, y) is an invalid operation. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;                           /* rem(x, Inf) = x */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is an invalid operation. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;          /* rem(0, y) = 0, sign preserved */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;                     /* make integer bits explicit */
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: a single division produces the
         * whole quotient. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;   /* |a| < |b|/2: a is the remainder */
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: peel off up to 62 quotient bits per
         * iteration using the estimated 128/64-bit division. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;    /* bias low: never overshoot */
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step the partial remainder down until it turns negative, then keep
     * whichever of the last two candidates is nearer to zero (ties go to
     * the even quotient) — this yields the IEEE round-to-nearest remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor coefficients 1/n! for n = 1..15, given as float64 bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;

    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = +0, 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        if (aSig == 0) return float32_one;  /* 2^(+/-0) = 1, exactly */
    }

    /* All remaining cases are inexact. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);    /* 2^a = e^(a*ln2) */

    /* r = 1 + sum_{n>=1} x^n/n!, with xn tracking successive powers of x. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );  /* log2(0) = -Inf */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative (nonzero) value is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;                           /* log2(+Inf) = +Inf */
    }

    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;                      /* integer part of the log */

    /* Produce fraction bits one at a time: squaring the mantissa doubles
     * its log2; a carry into bit 24 means the current result bit is 1. */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert the payload via the common format and quiet it. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias the exponent and left-justify the 53-bit significand
     * (integer bit made explicit) in 64 bits; conversion is exact. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/ 5462 5463 float64 float64_rem(float64 a, float64 b, float_status *status) 5464 { 5465 bool aSign, zSign; 5466 int aExp, bExp, expDiff; 5467 uint64_t aSig, bSig; 5468 uint64_t q, alternateASig; 5469 int64_t sigMean; 5470 5471 a = float64_squash_input_denormal(a, status); 5472 b = float64_squash_input_denormal(b, status); 5473 aSig = extractFloat64Frac( a ); 5474 aExp = extractFloat64Exp( a ); 5475 aSign = extractFloat64Sign( a ); 5476 bSig = extractFloat64Frac( b ); 5477 bExp = extractFloat64Exp( b ); 5478 if ( aExp == 0x7FF ) { 5479 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5480 return propagateFloat64NaN(a, b, status); 5481 } 5482 float_raise(float_flag_invalid, status); 5483 return float64_default_nan(status); 5484 } 5485 if ( bExp == 0x7FF ) { 5486 if (bSig) { 5487 return propagateFloat64NaN(a, b, status); 5488 } 5489 return a; 5490 } 5491 if ( bExp == 0 ) { 5492 if ( bSig == 0 ) { 5493 float_raise(float_flag_invalid, status); 5494 return float64_default_nan(status); 5495 } 5496 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5497 } 5498 if ( aExp == 0 ) { 5499 if ( aSig == 0 ) return a; 5500 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5501 } 5502 expDiff = aExp - bExp; 5503 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5504 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5505 if ( expDiff < 0 ) { 5506 if ( expDiff < -1 ) return a; 5507 aSig >>= 1; 5508 } 5509 q = ( bSig <= aSig ); 5510 if ( q ) aSig -= bSig; 5511 expDiff -= 64; 5512 while ( 0 < expDiff ) { 5513 q = estimateDiv128To64( aSig, 0, bSig ); 5514 q = ( 2 < q ) ? q - 2 : 0; 5515 aSig = - ( ( bSig>>2 ) * q ); 5516 expDiff -= 62; 5517 } 5518 expDiff += 64; 5519 if ( 0 < expDiff ) { 5520 q = estimateDiv128To64( aSig, 0, bSig ); 5521 q = ( 2 < q ) ? 
q - 2 : 0; 5522 q >>= 64 - expDiff; 5523 bSig >>= 2; 5524 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5525 } 5526 else { 5527 aSig >>= 2; 5528 bSig >>= 2; 5529 } 5530 do { 5531 alternateASig = aSig; 5532 ++q; 5533 aSig -= bSig; 5534 } while ( 0 <= (int64_t) aSig ); 5535 sigMean = aSig + alternateASig; 5536 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5537 aSig = alternateASig; 5538 } 5539 zSign = ( (int64_t) aSig < 0 ); 5540 if ( zSign ) aSig = - aSig; 5541 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5542 5543 } 5544 5545 /*---------------------------------------------------------------------------- 5546 | Returns the binary log of the double-precision floating-point value `a'. 5547 | The operation is performed according to the IEC/IEEE Standard for Binary 5548 | Floating-Point Arithmetic. 5549 *----------------------------------------------------------------------------*/ 5550 float64 float64_log2(float64 a, float_status *status) 5551 { 5552 bool aSign, zSign; 5553 int aExp; 5554 uint64_t aSig, aSig0, aSig1, zSig, i; 5555 a = float64_squash_input_denormal(a, status); 5556 5557 aSig = extractFloat64Frac( a ); 5558 aExp = extractFloat64Exp( a ); 5559 aSign = extractFloat64Sign( a ); 5560 5561 if ( aExp == 0 ) { 5562 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5563 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5564 } 5565 if ( aSign ) { 5566 float_raise(float_flag_invalid, status); 5567 return float64_default_nan(status); 5568 } 5569 if ( aExp == 0x7FF ) { 5570 if (aSig) { 5571 return propagateFloat64NaN(a, float64_zero, status); 5572 } 5573 return a; 5574 } 5575 5576 aExp -= 0x3FF; 5577 aSig |= UINT64_C(0x0010000000000000); 5578 zSign = aExp < 0; 5579 zSig = (uint64_t)aExp << 52; 5580 for (i = 1LL << 51; i > 0; i >>= 1) { 5581 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5582 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5583 if ( aSig & UINT64_C(0x0020000000000000) ) { 5584 aSig >>= 1; 5585 zSig |= i; 5586 } 
5587 } 5588 5589 if ( zSign ) 5590 zSig = -zSig; 5591 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5592 } 5593 5594 /*---------------------------------------------------------------------------- 5595 | Returns the result of converting the extended double-precision floating- 5596 | point value `a' to the 32-bit two's complement integer format. The 5597 | conversion is performed according to the IEC/IEEE Standard for Binary 5598 | Floating-Point Arithmetic---which means in particular that the conversion 5599 | is rounded according to the current rounding mode. If `a' is a NaN, the 5600 | largest positive integer is returned. Otherwise, if the conversion 5601 | overflows, the largest integer with the same sign as `a' is returned. 5602 *----------------------------------------------------------------------------*/ 5603 5604 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5605 { 5606 bool aSign; 5607 int32_t aExp, shiftCount; 5608 uint64_t aSig; 5609 5610 if (floatx80_invalid_encoding(a)) { 5611 float_raise(float_flag_invalid, status); 5612 return 1 << 31; 5613 } 5614 aSig = extractFloatx80Frac( a ); 5615 aExp = extractFloatx80Exp( a ); 5616 aSign = extractFloatx80Sign( a ); 5617 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5618 shiftCount = 0x4037 - aExp; 5619 if ( shiftCount <= 0 ) shiftCount = 1; 5620 shift64RightJamming( aSig, shiftCount, &aSig ); 5621 return roundAndPackInt32(aSign, aSig, status); 5622 5623 } 5624 5625 /*---------------------------------------------------------------------------- 5626 | Returns the result of converting the extended double-precision floating- 5627 | point value `a' to the 32-bit two's complement integer format. The 5628 | conversion is performed according to the IEC/IEEE Standard for Binary 5629 | Floating-Point Arithmetic, except that the conversion is always rounded 5630 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5631 | Otherwise, if the conversion overflows, the largest integer with the same 5632 | sign as `a' is returned. 5633 *----------------------------------------------------------------------------*/ 5634 5635 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5636 { 5637 bool aSign; 5638 int32_t aExp, shiftCount; 5639 uint64_t aSig, savedASig; 5640 int32_t z; 5641 5642 if (floatx80_invalid_encoding(a)) { 5643 float_raise(float_flag_invalid, status); 5644 return 1 << 31; 5645 } 5646 aSig = extractFloatx80Frac( a ); 5647 aExp = extractFloatx80Exp( a ); 5648 aSign = extractFloatx80Sign( a ); 5649 if ( 0x401E < aExp ) { 5650 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5651 goto invalid; 5652 } 5653 else if ( aExp < 0x3FFF ) { 5654 if (aExp || aSig) { 5655 float_raise(float_flag_inexact, status); 5656 } 5657 return 0; 5658 } 5659 shiftCount = 0x403E - aExp; 5660 savedASig = aSig; 5661 aSig >>= shiftCount; 5662 z = aSig; 5663 if ( aSign ) z = - z; 5664 if ( ( z < 0 ) ^ aSign ) { 5665 invalid: 5666 float_raise(float_flag_invalid, status); 5667 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5668 } 5669 if ( ( aSig<<shiftCount ) != savedASig ) { 5670 float_raise(float_flag_inexact, status); 5671 } 5672 return z; 5673 5674 } 5675 5676 /*---------------------------------------------------------------------------- 5677 | Returns the result of converting the extended double-precision floating- 5678 | point value `a' to the 64-bit two's complement integer format. The 5679 | conversion is performed according to the IEC/IEEE Standard for Binary 5680 | Floating-Point Arithmetic---which means in particular that the conversion 5681 | is rounded according to the current rounding mode. If `a' is a NaN, 5682 | the largest positive integer is returned. Otherwise, if the conversion 5683 | overflows, the largest integer with the same sign as `a' is returned. 
5684 *----------------------------------------------------------------------------*/ 5685 5686 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5687 { 5688 bool aSign; 5689 int32_t aExp, shiftCount; 5690 uint64_t aSig, aSigExtra; 5691 5692 if (floatx80_invalid_encoding(a)) { 5693 float_raise(float_flag_invalid, status); 5694 return 1ULL << 63; 5695 } 5696 aSig = extractFloatx80Frac( a ); 5697 aExp = extractFloatx80Exp( a ); 5698 aSign = extractFloatx80Sign( a ); 5699 shiftCount = 0x403E - aExp; 5700 if ( shiftCount <= 0 ) { 5701 if ( shiftCount ) { 5702 float_raise(float_flag_invalid, status); 5703 if (!aSign || floatx80_is_any_nan(a)) { 5704 return INT64_MAX; 5705 } 5706 return INT64_MIN; 5707 } 5708 aSigExtra = 0; 5709 } 5710 else { 5711 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5712 } 5713 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5714 5715 } 5716 5717 /*---------------------------------------------------------------------------- 5718 | Returns the result of converting the extended double-precision floating- 5719 | point value `a' to the 64-bit two's complement integer format. The 5720 | conversion is performed according to the IEC/IEEE Standard for Binary 5721 | Floating-Point Arithmetic, except that the conversion is always rounded 5722 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5723 | Otherwise, if the conversion overflows, the largest integer with the same 5724 | sign as `a' is returned. 
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* |a| >= 2^63: the only representable value is -2^63 itself,
         * i.e. sign set, exponent 0x403E, significand exactly 1.0
         * (a.high == 0xC03E with no fraction bits). */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to zero; inexact unless a is exactly zero. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );             /* truncate toward zero */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        /* Bits were discarded by the truncation. */
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
5772 *----------------------------------------------------------------------------*/ 5773 5774 float32 floatx80_to_float32(floatx80 a, float_status *status) 5775 { 5776 bool aSign; 5777 int32_t aExp; 5778 uint64_t aSig; 5779 5780 if (floatx80_invalid_encoding(a)) { 5781 float_raise(float_flag_invalid, status); 5782 return float32_default_nan(status); 5783 } 5784 aSig = extractFloatx80Frac( a ); 5785 aExp = extractFloatx80Exp( a ); 5786 aSign = extractFloatx80Sign( a ); 5787 if ( aExp == 0x7FFF ) { 5788 if ( (uint64_t) ( aSig<<1 ) ) { 5789 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5790 status); 5791 return float32_silence_nan(res, status); 5792 } 5793 return packFloat32( aSign, 0xFF, 0 ); 5794 } 5795 shift64RightJamming( aSig, 33, &aSig ); 5796 if ( aExp || aSig ) aExp -= 0x3F81; 5797 return roundAndPackFloat32(aSign, aExp, aSig, status); 5798 5799 } 5800 5801 /*---------------------------------------------------------------------------- 5802 | Returns the result of converting the extended double-precision floating- 5803 | point value `a' to the double-precision floating-point format. The 5804 | conversion is performed according to the IEC/IEEE Standard for Binary 5805 | Floating-Point Arithmetic. 
5806 *----------------------------------------------------------------------------*/ 5807 5808 float64 floatx80_to_float64(floatx80 a, float_status *status) 5809 { 5810 bool aSign; 5811 int32_t aExp; 5812 uint64_t aSig, zSig; 5813 5814 if (floatx80_invalid_encoding(a)) { 5815 float_raise(float_flag_invalid, status); 5816 return float64_default_nan(status); 5817 } 5818 aSig = extractFloatx80Frac( a ); 5819 aExp = extractFloatx80Exp( a ); 5820 aSign = extractFloatx80Sign( a ); 5821 if ( aExp == 0x7FFF ) { 5822 if ( (uint64_t) ( aSig<<1 ) ) { 5823 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5824 status); 5825 return float64_silence_nan(res, status); 5826 } 5827 return packFloat64( aSign, 0x7FF, 0 ); 5828 } 5829 shift64RightJamming( aSig, 1, &zSig ); 5830 if ( aExp || aSig ) aExp -= 0x3C01; 5831 return roundAndPackFloat64(aSign, aExp, zSig, status); 5832 5833 } 5834 5835 /*---------------------------------------------------------------------------- 5836 | Returns the result of converting the extended double-precision floating- 5837 | point value `a' to the quadruple-precision floating-point format. The 5838 | conversion is performed according to the IEC/IEEE Standard for Binary 5839 | Floating-Point Arithmetic. 
5840 *----------------------------------------------------------------------------*/ 5841 5842 float128 floatx80_to_float128(floatx80 a, float_status *status) 5843 { 5844 bool aSign; 5845 int aExp; 5846 uint64_t aSig, zSig0, zSig1; 5847 5848 if (floatx80_invalid_encoding(a)) { 5849 float_raise(float_flag_invalid, status); 5850 return float128_default_nan(status); 5851 } 5852 aSig = extractFloatx80Frac( a ); 5853 aExp = extractFloatx80Exp( a ); 5854 aSign = extractFloatx80Sign( a ); 5855 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5856 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5857 status); 5858 return float128_silence_nan(res, status); 5859 } 5860 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5861 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5862 5863 } 5864 5865 /*---------------------------------------------------------------------------- 5866 | Rounds the extended double-precision floating-point value `a' 5867 | to the precision provided by floatx80_rounding_precision and returns the 5868 | result as an extended double-precision floating-point value. 5869 | The operation is performed according to the IEC/IEEE Standard for Binary 5870 | Floating-Point Arithmetic. 5871 *----------------------------------------------------------------------------*/ 5872 5873 floatx80 floatx80_round(floatx80 a, float_status *status) 5874 { 5875 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5876 extractFloatx80Sign(a), 5877 extractFloatx80Exp(a), 5878 extractFloatx80Frac(a), 0, status); 5879 } 5880 5881 /*---------------------------------------------------------------------------- 5882 | Rounds the extended double-precision floating-point value `a' to an integer, 5883 | and returns the result as an extended quadruple-precision floating-point 5884 | value. The operation is performed according to the IEC/IEEE Standard for 5885 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* |a| >= 2^63 (or Inf/NaN): already integral. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: result is 0 or +/-1, chosen by the rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Rounds up to +/-1 only when |a| > 1/2 (fraction bits set
             * alongside exponent 0x3FFE). */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* |a| >= 1/2 rounds away from zero. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* 1 <= |a| < 2^63: round away the fraction bits below lastBitMask. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        if ((z.low & roundBitsMask) == 0) {
            /* Exactly halfway: clear the last kept bit (ties to even). */
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        /* The rounding increment carried out of the significand:
         * renormalize to 1.0 with the exponent bumped. */
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* Subnormals have biased exponent 0 but the same scale as
         * exponent 1, so reduce the alignment shift by one. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit still set means the add did not carry out. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the significand add: shift right one, restore the
     * integer bit, and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Treat subnormals with their effective exponent of 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only in round-down mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;                         /* result carries b's effective sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*---------------------------------------------------------------------------- 6134 | Returns the result of adding the extended double-precision floating-point 6135 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6136 | Standard for Binary Floating-Point Arithmetic. 6137 *----------------------------------------------------------------------------*/ 6138 6139 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6140 { 6141 bool aSign, bSign; 6142 6143 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6144 float_raise(float_flag_invalid, status); 6145 return floatx80_default_nan(status); 6146 } 6147 aSign = extractFloatx80Sign( a ); 6148 bSign = extractFloatx80Sign( b ); 6149 if ( aSign == bSign ) { 6150 return addFloatx80Sigs(a, b, aSign, status); 6151 } 6152 else { 6153 return subFloatx80Sigs(a, b, aSign, status); 6154 } 6155 6156 } 6157 6158 /*---------------------------------------------------------------------------- 6159 | Returns the result of subtracting the extended double-precision floating- 6160 | point values `a' and `b'. The operation is performed according to the 6161 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6162 *----------------------------------------------------------------------------*/ 6163 6164 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 6165 { 6166 bool aSign, bSign; 6167 6168 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6169 float_raise(float_flag_invalid, status); 6170 return floatx80_default_nan(status); 6171 } 6172 aSign = extractFloatx80Sign( a ); 6173 bSign = extractFloatx80Sign( b ); 6174 if ( aSign == bSign ) { 6175 return subFloatx80Sigs(a, b, aSign, status); 6176 } 6177 else { 6178 return addFloatx80Sigs(a, b, aSign, status); 6179 } 6180 6181 } 6182 6183 /*---------------------------------------------------------------------------- 6184 | Returns the result of multiplying the extended double-precision floating- 6185 | point values `a' and `b'. The operation is performed according to the 6186 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6187 *----------------------------------------------------------------------------*/ 6188 6189 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 6190 { 6191 bool aSign, bSign, zSign; 6192 int32_t aExp, bExp, zExp; 6193 uint64_t aSig, bSig, zSig0, zSig1; 6194 6195 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6196 float_raise(float_flag_invalid, status); 6197 return floatx80_default_nan(status); 6198 } 6199 aSig = extractFloatx80Frac( a ); 6200 aExp = extractFloatx80Exp( a ); 6201 aSign = extractFloatx80Sign( a ); 6202 bSig = extractFloatx80Frac( b ); 6203 bExp = extractFloatx80Exp( b ); 6204 bSign = extractFloatx80Sign( b ); 6205 zSign = aSign ^ bSign; 6206 if ( aExp == 0x7FFF ) { 6207 if ( (uint64_t) ( aSig<<1 ) 6208 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6209 return propagateFloatx80NaN(a, b, status); 6210 } 6211 if ( ( bExp | bSig ) == 0 ) goto invalid; 6212 return packFloatx80(zSign, floatx80_infinity_high, 6213 floatx80_infinity_low); 6214 } 6215 if ( bExp == 0x7FFF ) { 6216 if ((uint64_t)(bSig << 
1)) { 6217 return propagateFloatx80NaN(a, b, status); 6218 } 6219 if ( ( aExp | aSig ) == 0 ) { 6220 invalid: 6221 float_raise(float_flag_invalid, status); 6222 return floatx80_default_nan(status); 6223 } 6224 return packFloatx80(zSign, floatx80_infinity_high, 6225 floatx80_infinity_low); 6226 } 6227 if ( aExp == 0 ) { 6228 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6229 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6230 } 6231 if ( bExp == 0 ) { 6232 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6233 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6234 } 6235 zExp = aExp + bExp - 0x3FFE; 6236 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6237 if ( 0 < (int64_t) zSig0 ) { 6238 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6239 --zExp; 6240 } 6241 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6242 zSign, zExp, zSig0, zSig1, status); 6243 } 6244 6245 /*---------------------------------------------------------------------------- 6246 | Returns the result of dividing the extended double-precision floating-point 6247 | value `a' by the corresponding value `b'. The operation is performed 6248 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is an invalid operation. */
            goto invalid;
        }
        /* Inf / finite yields infinity. */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf yields zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
                /* 0 / 0 is an invalid operation. */
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            /* Nonzero / 0: raise divide-by-zero, return signed infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    if ( bSig <= aSig ) {
        /* Ensure the dividend is smaller than the divisor so each quotient
           digit estimate stays below 2^64. */
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate, then correct downward while the
       partial remainder is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits; refine only if close to a rounding boundary,
       and fold any nonzero remainder into the sticky bit. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf rem anything is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* finite rem Inf is `a' unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* anything rem 0 is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: quotient is 0 (mod) or 0/±1 (rem); for mod or a much
           smaller `a', the remainder is `a' itself. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Produce quotient digits 62 bits at a time; the estimate is kept two
       low so the partial remainder stays non-negative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial digit of expDiff bits, corrected upward until the
           remainder is smaller than the (shifted) divisor. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: round the quotient to nearest-even by switching
           to the complementary remainder when it is smaller (or on ties
           with an odd quotient), flipping the result sign. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6466 *----------------------------------------------------------------------------*/ 6467 6468 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6469 { 6470 uint64_t quotient; 6471 return floatx80_modrem(a, b, false, "ient, status); 6472 } 6473 6474 /*---------------------------------------------------------------------------- 6475 | Returns the remainder of the extended double-precision floating-point value 6476 | `a' with respect to the corresponding value `b', with the quotient truncated 6477 | toward zero. 6478 *----------------------------------------------------------------------------*/ 6479 6480 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6481 { 6482 uint64_t quotient; 6483 return floatx80_modrem(a, b, true, "ient, status); 6484 } 6485 6486 /*---------------------------------------------------------------------------- 6487 | Returns the square root of the extended double-precision floating-point 6488 | value `a'. The operation is performed according to the IEC/IEEE Standard 6489 | for Binary Floating-Point Arithmetic. 6490 *----------------------------------------------------------------------------*/ 6491 6492 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6493 { 6494 bool aSign; 6495 int32_t aExp, zExp; 6496 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6497 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6498 6499 if (floatx80_invalid_encoding(a)) { 6500 float_raise(float_flag_invalid, status); 6501 return floatx80_default_nan(status); 6502 } 6503 aSig0 = extractFloatx80Frac( a ); 6504 aExp = extractFloatx80Exp( a ); 6505 aSign = extractFloatx80Sign( a ); 6506 if ( aExp == 0x7FFF ) { 6507 if ((uint64_t)(aSig0 << 1)) { 6508 return propagateFloatx80NaN(a, a, status); 6509 } 6510 if ( ! 
aSign ) return a; 6511 goto invalid; 6512 } 6513 if ( aSign ) { 6514 if ( ( aExp | aSig0 ) == 0 ) return a; 6515 invalid: 6516 float_raise(float_flag_invalid, status); 6517 return floatx80_default_nan(status); 6518 } 6519 if ( aExp == 0 ) { 6520 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6521 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6522 } 6523 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6524 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6525 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6526 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6527 doubleZSig0 = zSig0<<1; 6528 mul64To128( zSig0, zSig0, &term0, &term1 ); 6529 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6530 while ( (int64_t) rem0 < 0 ) { 6531 --zSig0; 6532 doubleZSig0 -= 2; 6533 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6534 } 6535 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6536 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6537 if ( zSig1 == 0 ) zSig1 = 1; 6538 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6539 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6540 mul64To128( zSig1, zSig1, &term2, &term3 ); 6541 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6542 while ( (int64_t) rem1 < 0 ) { 6543 --zSig1; 6544 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6545 term3 |= 1; 6546 term2 |= doubleZSig0; 6547 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6548 } 6549 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6550 } 6551 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6552 zSig0 |= doubleZSig0; 6553 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6554 0, zExp, zSig0, zSig1, status); 6555 } 6556 6557 /*---------------------------------------------------------------------------- 6558 | Returns the result of converting the quadruple-precision floating-point 6559 | value `a' to the 32-bit two's complement integer format. 
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* A NaN converts to the largest positive integer, so force the sign
       positive here. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Collapse the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Collapse the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude is at least 2^31: overflow (NaNs go positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude below 1 truncates to 0; inexact if nonzero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the truncation was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude is at least 2^63: overflow.  Only exactly
               -2^63 (and negative non-NaN overflows) map to INT64_MIN. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* -2^63 (possibly with fraction bits below the integer part)
               is representable; anything else this large overflows. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1 truncates to 0; inexact if nonzero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    if (aSign && (aExp > 0x3FFE)) {
        /* Negative with magnitude >= 1 cannot convert; NaN also lands
           here (sign bit set) and maps to UINT64_MAX. */
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        /* Make the implicit integer bit explicit for normal numbers. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

/* Same as float128_to_uint64, but always rounded toward zero. */
uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    /* Temporarily force round-to-zero, then restore the caller's mode. */
    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6802 | Returns the result of converting the quadruple-precision floating-point 6803 | value `a' to the 32-bit unsigned integer format. The conversion 6804 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6805 | Arithmetic except that the conversion is always rounded toward zero. 6806 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6807 | if the conversion overflows, the largest unsigned integer is returned. 6808 | If 'a' is negative, the value is rounded and zero is returned; negative 6809 | values that do not round to zero will raise the inexact exception. 6810 *----------------------------------------------------------------------------*/ 6811 6812 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6813 { 6814 uint64_t v; 6815 uint32_t res; 6816 int old_exc_flags = get_float_exception_flags(status); 6817 6818 v = float128_to_uint64_round_to_zero(a, status); 6819 if (v > 0xffffffff) { 6820 res = 0xffffffff; 6821 } else { 6822 return v; 6823 } 6824 set_float_exception_flags(old_exc_flags, status); 6825 float_raise(float_flag_invalid, status); 6826 return res; 6827 } 6828 6829 /*---------------------------------------------------------------------------- 6830 | Returns the result of converting the quadruple-precision floating-point value 6831 | `a' to the 32-bit unsigned integer format. The conversion is 6832 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6833 | Arithmetic---which means in particular that the conversion is rounded 6834 | according to the current rounding mode. If `a' is a NaN, the largest 6835 | positive integer is returned. If the conversion overflows, the 6836 | largest unsigned integer is returned. If 'a' is negative, the value is 6837 | rounded and zero is returned; negative values that do not round to zero 6838 | will raise the inexact exception. 
*----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Result does not fit in 32 bits: discard whatever flags the 64-bit
       conversion raised and report invalid instead, saturating. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN: convert the payload and return it quieted. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Reposition the 113-bit significand for the 64-bit floatx80 format. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf rem anything is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite rem Inf is `a' unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* anything rem 0 is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: `a' is already the remainder. */
    if ( expDiff < -1 ) return a;
    /* Put the explicit integer bit in place and left-align both
       significands (one bit less for `a' when its exponent is smaller). */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Produce quotient digits 61 bits at a time; the estimate is kept a
       little low so the partial remainder stays non-negative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial quotient digit. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the remainder goes negative, remembering the
       last non-negative value. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Pick whichever of the two candidates is nearer to zero (ties go to
       the even quotient), giving the round-to-nearest-even remainder. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) is +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number (other than -0) is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the (unbiased) input exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial root estimate plus one refinement; the loops below correct
       the estimate downward until the remainder is non-negative. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low bits of the root; refine only near a rounding boundary and fold
       any nonzero remainder into the sticky bit. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/* Compares `a' and `b' as extended double-precision values; when
   `is_quiet' is set, only signaling NaNs raise the invalid flag. */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Any NaN operand makes the comparison unordered. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Opposite signs: equal only if both are zeros (+0 == -0). */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: magnitude order, inverted for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

/* Compares `a' and `b' as quadruple-precision values; when `is_quiet' is
   set, only signaling NaNs raise the invalid flag. */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Any NaN operand makes the comparison unordered. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: equal only if both are zeros (+0 == -0). */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: magnitude order, inverted for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

FloatRelation
float128_compare_quiet(float128 a, float128 b, 7171 float_status *status) 7172 { 7173 return float128_compare_internal(a, b, 1, status); 7174 } 7175 7176 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7177 { 7178 bool aSign; 7179 int32_t aExp; 7180 uint64_t aSig; 7181 7182 if (floatx80_invalid_encoding(a)) { 7183 float_raise(float_flag_invalid, status); 7184 return floatx80_default_nan(status); 7185 } 7186 aSig = extractFloatx80Frac( a ); 7187 aExp = extractFloatx80Exp( a ); 7188 aSign = extractFloatx80Sign( a ); 7189 7190 if ( aExp == 0x7FFF ) { 7191 if ( aSig<<1 ) { 7192 return propagateFloatx80NaN(a, a, status); 7193 } 7194 return a; 7195 } 7196 7197 if (aExp == 0) { 7198 if (aSig == 0) { 7199 return a; 7200 } 7201 aExp++; 7202 } 7203 7204 if (n > 0x10000) { 7205 n = 0x10000; 7206 } else if (n < -0x10000) { 7207 n = -0x10000; 7208 } 7209 7210 aExp += n; 7211 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7212 aSign, aExp, aSig, 0, status); 7213 } 7214 7215 float128 float128_scalbn(float128 a, int n, float_status *status) 7216 { 7217 bool aSign; 7218 int32_t aExp; 7219 uint64_t aSig0, aSig1; 7220 7221 aSig1 = extractFloat128Frac1( a ); 7222 aSig0 = extractFloat128Frac0( a ); 7223 aExp = extractFloat128Exp( a ); 7224 aSign = extractFloat128Sign( a ); 7225 if ( aExp == 0x7FFF ) { 7226 if ( aSig0 | aSig1 ) { 7227 return propagateFloat128NaN(a, a, status); 7228 } 7229 return a; 7230 } 7231 if (aExp != 0) { 7232 aSig0 |= UINT64_C(0x0001000000000000); 7233 } else if (aSig0 == 0 && aSig1 == 0) { 7234 return a; 7235 } else { 7236 aExp++; 7237 } 7238 7239 if (n > 0x10000) { 7240 n = 0x10000; 7241 } else if (n < -0x10000) { 7242 n = -0x10000; 7243 } 7244 7245 aExp += n - 1; 7246 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7247 , status); 7248 7249 } 7250 7251 static void __attribute__((constructor)) softfloat_init(void) 7252 { 7253 union_float64 ua, ub, uc, ur; 7254 7255 if (QEMU_NO_HARDFLOAT) { 7256 return; 
7257 } 7258 /* 7259 * Test that the host's FMA is not obviously broken. For example, 7260 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7261 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7262 */ 7263 ua.s = 0x0020000000000001ULL; 7264 ub.s = 0x3ca0000000000000ULL; 7265 uc.s = 0x0020000000000000ULL; 7266 ur.h = fma(ua.h, ub.h, uc.h); 7267 if (ur.s != 0x0020000000000001ULL) { 7268 force_soft_fma = true; 7269 } 7270 } 7271