1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */
/*
 * Flush-to-zero helpers for the hardfloat paths.
 *
 * GEN_INPUT_FLUSH__NOCHECK generates a helper that unconditionally replaces
 * a denormal input with a correctly-signed zero and raises the
 * input_denormal flag.  The GEN_INPUT_FLUSH{1,2,3} variants first test
 * s->flush_inputs_to_zero, then flush 1, 2 or 3 operands in place via the
 * __nocheck helper.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush a single input operand, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* As above, for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* As above, for three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
/* NOTE(review): these per-size/arity values look benchmark-tuned; confirm
 * with fp-bench before changing them. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
/* Hardfloat disabled: softfloat entry points may be flattened and inlined. */
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Hardfloat is usable only when the inexact flag is already set (see the
 * comment above: hardfloat relies on that flag not needing to be detected)
 * and the rounding mode is round-to-nearest-even.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* Access the same bits either as softfloat type or as host FP type. */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates run before (pre) and after (post) the host FP operation. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-operand hardfloat wrapper.  Try the host FPU, deferring to the
 * softfloat implementation ('soft') when hardfloat preconditions fail:
 *
 * pre:  predicate on the (flushed) inputs that must hold for the host
 *       result to be trusted.
 * post: consulted when |result| <= FLT_MIN; if it holds, recompute in
 *       softfloat, since exception detection near the subnormal range is
 *       deferred to soft-fp (see the hardfloat comment near the top of
 *       this file).
 * An infinite host result raises the overflow flag (inexact is already
 * set, per can_use_fpu).
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* Double-precision counterpart of float32_gen2. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* One-hot mask for a FloatClass value, for testing several classes at once. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero = float_cmask(float_class_zero),
    float_cmask_normal = float_cmask(float_class_normal),
    float_cmask_inf = float_cmask(float_class_inf),
    float_cmask_qnan = float_cmask(float_class_qnan),
    float_cmask_snan = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan and snan being the last two enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_hm;  /* high-middle */
    uint64_t frac_lm;  /* low-middle */
    uint64_t frac_lo;
} FloatParts256;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
/* (-F - 1) & 63 places the fraction msb at DECOMPOSED_BINARY_POINT. */
#define FLOAT_PARAMS(E, F)                                \
    .exp_size       = E,                                  \
    .exp_bias       = ((1 << E) - 1) >> 1,                \
    .exp_max        = (1 << E) - 1,                       \
    .frac_size      = F,                                  \
    .frac_shift     = (-F - 1) & 63,                      \
    .frac_lsb       = 1ull << ((-F - 1) & 63),            \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),            \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,      \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize.  */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

/* 128-bit unpack: the high word holds sign/exponent/frac_hi. */
static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.
*/
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    /* sign | exponent | fraction, using deposit64 to mask each field. */
    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

/* 128-bit pack: sign/exponent/frac_hi in the high word, frac_lo low. */
static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/*
 * Dispatch macros: select the parts64/parts128/parts256 implementation of
 * an operation based on the static type of the FloatPartsN pointer P,
 * via QEMU_GENERIC.  Constant propagation inlines the chosen function.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define PARTS_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \
                 (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);
static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128_256(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);
static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128_256(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_mul(A, B, S) \
    PARTS_GENERIC_64_128(mul, A)(A, B, S)

static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b,
                                    FloatParts64 *c, int flags,
                                    float_status *s);
static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b,
                                      FloatParts128 *c, int flags,
                                      float_status *s);

#define parts_muladd(A, B, C, Z, S) \
    PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S)

static FloatParts64 *parts64_div(FloatParts64 *a, FloatParts64 *b,
                                 float_status *s);
static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,
                                   float_status *s);

#define parts_div(A, B, S) \
    PARTS_GENERIC_64_128(div, A)(A, B, S)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* Dispatch to frac64/frac128/frac256 by the FloatPartsN pointer type. */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* R = A + B; returns true on carry-out of the most significant word. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

/* R = A + c (64-bit immediate); returns true on carry-out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

/* Set the fraction to all-ones. */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

/* Three-way unsigned comparison of fractions: -1, 0, or 1. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

/*
 * A = A / B, with the quotient's lsb used as a sticky (inexact) bit.
 * Returns true if the exponent must be decremented by one.
 */
static bool frac64_div(FloatParts64 *a, FloatParts64 *b)
{
    uint64_t n1, n0, r, q;
    bool ret;

    /*
     * We want a 2*N / N-bit division to produce exactly an N-bit
     * result, so that we do not lose any precision and so that we
     * do not have to renormalize afterward.  If A.frac < B.frac,
     * then division would produce an (N-1)-bit result; shift A left
     * by one to produce the an N-bit result, and return true to
     * decrement the exponent to match.
     *
     * The udiv_qrnnd algorithm that we're using requires normalization,
     * i.e. the msb of the denominator must be set, which is already true.
     */
    ret = a->frac < b->frac;
    if (ret) {
        n0 = a->frac;
        n1 = 0;
    } else {
        n0 = a->frac >> 1;
        n1 = a->frac << 63;
    }
    q = udiv_qrnnd(&r, n0, n1, b->frac);

    /* Set lsb if there is a remainder, to set inexact. */
    a->frac = q | (r != 0);

    return ret;
}

/* 128-bit counterpart of frac64_div; same contract. */
static bool frac128_div(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t q0, q1, a0, a1, b0, b1;
    uint64_t r0, r1, r2, r3, t0, t1, t2, t3;
    bool ret = false;

    a0 = a->frac_hi, a1 = a->frac_lo;
    b0 = b->frac_hi, b1 = b->frac_lo;

    ret = lt128(a0, a1, b0, b1);
    if (!ret) {
        a1 = shr_double(a0, a1, 1);
        a0 = a0 >> 1;
    }

    /* Use 128/64 -> 64 division as estimate for 192/128 -> 128 division. */
    q0 = estimateDiv128To64(a0, a1, b0);

    /*
     * Estimate is high because B1 was not included (unless B1 == 0).
     * Reduce quotient and increase remainder until remainder is non-negative.
     * This loop will execute 0 to 2 times.
     */
    mul128By64To192(b0, b1, q0, &t0, &t1, &t2);
    sub192(a0, a1, 0, t0, t1, t2, &r0, &r1, &r2);
    while (r0 != 0) {
        q0--;
        add192(r0, r1, r2, 0, b0, b1, &r0, &r1, &r2);
    }

    /* Repeat using the remainder, producing a second word of quotient. */
    q1 = estimateDiv128To64(r1, r2, b0);
    mul128By64To192(b0, b1, q1, &t1, &t2, &t3);
    sub192(r1, r2, 0, t1, t2, t3, &r1, &r2, &r3);
    while (r1 != 0) {
        q1--;
        add192(r1, r2, r3, 0, b0, b1, &r1, &r2, &r3);
    }

    /* Any remainder indicates inexact; set sticky bit. */
    q1 |= (r2 | r3) != 0;

    a->frac_hi = q0;
    a->frac_lo = q1;
    return ret;
}

#define frac_div(A, B)  FRAC_GENERIC_64_128(div, A)(A, B)

/* True if the fraction is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

/* Widening multiply: R (double-width) = A * B. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

/* Two's-complement negate of the fraction. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
1025 { 1026 bool c = 0; 1027 a->frac_lo = usub64_borrow(0, a->frac_lo, &c); 1028 a->frac_lm = usub64_borrow(0, a->frac_lm, &c); 1029 a->frac_hm = usub64_borrow(0, a->frac_hm, &c); 1030 a->frac_hi = usub64_borrow(0, a->frac_hi, &c); 1031 } 1032 1033 #define frac_neg(A) FRAC_GENERIC_64_128_256(neg, A)(A) 1034 1035 static int frac64_normalize(FloatParts64 *a) 1036 { 1037 if (a->frac) { 1038 int shift = clz64(a->frac); 1039 a->frac <<= shift; 1040 return shift; 1041 } 1042 return 64; 1043 } 1044 1045 static int frac128_normalize(FloatParts128 *a) 1046 { 1047 if (a->frac_hi) { 1048 int shl = clz64(a->frac_hi); 1049 a->frac_hi = shl_double(a->frac_hi, a->frac_lo, shl); 1050 a->frac_lo <<= shl; 1051 return shl; 1052 } else if (a->frac_lo) { 1053 int shl = clz64(a->frac_lo); 1054 a->frac_hi = a->frac_lo << shl; 1055 a->frac_lo = 0; 1056 return shl + 64; 1057 } 1058 return 128; 1059 } 1060 1061 static int frac256_normalize(FloatParts256 *a) 1062 { 1063 uint64_t a0 = a->frac_hi, a1 = a->frac_hm; 1064 uint64_t a2 = a->frac_lm, a3 = a->frac_lo; 1065 int ret, shl; 1066 1067 if (likely(a0)) { 1068 shl = clz64(a0); 1069 if (shl == 0) { 1070 return 0; 1071 } 1072 ret = shl; 1073 } else { 1074 if (a1) { 1075 ret = 64; 1076 a0 = a1, a1 = a2, a2 = a3, a3 = 0; 1077 } else if (a2) { 1078 ret = 128; 1079 a0 = a2, a1 = a3, a2 = 0, a3 = 0; 1080 } else if (a3) { 1081 ret = 192; 1082 a0 = a3, a1 = 0, a2 = 0, a3 = 0; 1083 } else { 1084 ret = 256; 1085 a0 = 0, a1 = 0, a2 = 0, a3 = 0; 1086 goto done; 1087 } 1088 shl = clz64(a0); 1089 if (shl == 0) { 1090 goto done; 1091 } 1092 ret += shl; 1093 } 1094 1095 a0 = shl_double(a0, a1, shl); 1096 a1 = shl_double(a1, a2, shl); 1097 a2 = shl_double(a2, a3, shl); 1098 a3 <<= shl; 1099 1100 done: 1101 a->frac_hi = a0; 1102 a->frac_hm = a1; 1103 a->frac_lm = a2; 1104 a->frac_lo = a3; 1105 return ret; 1106 } 1107 1108 #define frac_normalize(A) FRAC_GENERIC_64_128_256(normalize, A)(A) 1109 1110 static void frac64_shl(FloatParts64 *a, int c) 1111 { 1112 
a->frac <<= c; 1113 } 1114 1115 static void frac128_shl(FloatParts128 *a, int c) 1116 { 1117 uint64_t a0 = a->frac_hi, a1 = a->frac_lo; 1118 1119 if (c & 64) { 1120 a0 = a1, a1 = 0; 1121 } 1122 1123 c &= 63; 1124 if (c) { 1125 a0 = shl_double(a0, a1, c); 1126 a1 = a1 << c; 1127 } 1128 1129 a->frac_hi = a0; 1130 a->frac_lo = a1; 1131 } 1132 1133 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C) 1134 1135 static void frac64_shr(FloatParts64 *a, int c) 1136 { 1137 a->frac >>= c; 1138 } 1139 1140 static void frac128_shr(FloatParts128 *a, int c) 1141 { 1142 uint64_t a0 = a->frac_hi, a1 = a->frac_lo; 1143 1144 if (c & 64) { 1145 a1 = a0, a0 = 0; 1146 } 1147 1148 c &= 63; 1149 if (c) { 1150 a1 = shr_double(a0, a1, c); 1151 a0 = a0 >> c; 1152 } 1153 1154 a->frac_hi = a0; 1155 a->frac_lo = a1; 1156 } 1157 1158 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C) 1159 1160 static void frac64_shrjam(FloatParts64 *a, int c) 1161 { 1162 uint64_t a0 = a->frac; 1163 1164 if (likely(c != 0)) { 1165 if (likely(c < 64)) { 1166 a0 = (a0 >> c) | (shr_double(a0, 0, c) != 0); 1167 } else { 1168 a0 = a0 != 0; 1169 } 1170 a->frac = a0; 1171 } 1172 } 1173 1174 static void frac128_shrjam(FloatParts128 *a, int c) 1175 { 1176 uint64_t a0 = a->frac_hi, a1 = a->frac_lo; 1177 uint64_t sticky = 0; 1178 1179 if (unlikely(c == 0)) { 1180 return; 1181 } else if (likely(c < 64)) { 1182 /* nothing */ 1183 } else if (likely(c < 128)) { 1184 sticky = a1; 1185 a1 = a0; 1186 a0 = 0; 1187 c &= 63; 1188 if (c == 0) { 1189 goto done; 1190 } 1191 } else { 1192 sticky = a0 | a1; 1193 a0 = a1 = 0; 1194 goto done; 1195 } 1196 1197 sticky |= shr_double(a1, 0, c); 1198 a1 = shr_double(a0, a1, c); 1199 a0 = a0 >> c; 1200 1201 done: 1202 a->frac_lo = a1 | (sticky != 0); 1203 a->frac_hi = a0; 1204 } 1205 1206 static void frac256_shrjam(FloatParts256 *a, int c) 1207 { 1208 uint64_t a0 = a->frac_hi, a1 = a->frac_hm; 1209 uint64_t a2 = a->frac_lm, a3 = a->frac_lo; 1210 uint64_t sticky = 0; 1211 1212 if 
(unlikely(c == 0)) { 1213 return; 1214 } else if (likely(c < 64)) { 1215 /* nothing */ 1216 } else if (likely(c < 256)) { 1217 if (unlikely(c & 128)) { 1218 sticky |= a2 | a3; 1219 a3 = a1, a2 = a0, a1 = 0, a0 = 0; 1220 } 1221 if (unlikely(c & 64)) { 1222 sticky |= a3; 1223 a3 = a2, a2 = a1, a1 = a0, a0 = 0; 1224 } 1225 c &= 63; 1226 if (c == 0) { 1227 goto done; 1228 } 1229 } else { 1230 sticky = a0 | a1 | a2 | a3; 1231 a0 = a1 = a2 = a3 = 0; 1232 goto done; 1233 } 1234 1235 sticky |= shr_double(a3, 0, c); 1236 a3 = shr_double(a2, a3, c); 1237 a2 = shr_double(a1, a2, c); 1238 a1 = shr_double(a0, a1, c); 1239 a0 = a0 >> c; 1240 1241 done: 1242 a->frac_lo = a3 | (sticky != 0); 1243 a->frac_lm = a2; 1244 a->frac_hm = a1; 1245 a->frac_hi = a0; 1246 } 1247 1248 #define frac_shrjam(A, C) FRAC_GENERIC_64_128_256(shrjam, A)(A, C) 1249 1250 static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b) 1251 { 1252 return usub64_overflow(a->frac, b->frac, &r->frac); 1253 } 1254 1255 static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b) 1256 { 1257 bool c = 0; 1258 r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c); 1259 r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c); 1260 return c; 1261 } 1262 1263 static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b) 1264 { 1265 bool c = 0; 1266 r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c); 1267 r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c); 1268 r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c); 1269 r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c); 1270 return c; 1271 } 1272 1273 #define frac_sub(R, A, B) FRAC_GENERIC_64_128_256(sub, R)(R, A, B) 1274 1275 static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a) 1276 { 1277 r->frac = a->frac_hi | (a->frac_lo != 0); 1278 } 1279 1280 static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a) 1281 { 1282 r->frac_hi = a->frac_hi; 1283 r->frac_lo = a->frac_hm | ((a->frac_lm | 
a->frac_lo) != 0); 1284 } 1285 1286 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A) 1287 1288 static void frac64_widen(FloatParts128 *r, FloatParts64 *a) 1289 { 1290 r->frac_hi = a->frac; 1291 r->frac_lo = 0; 1292 } 1293 1294 static void frac128_widen(FloatParts256 *r, FloatParts128 *a) 1295 { 1296 r->frac_hi = a->frac_hi; 1297 r->frac_hm = a->frac_lo; 1298 r->frac_lm = 0; 1299 r->frac_lo = 0; 1300 } 1301 1302 #define frac_widen(A, B) FRAC_GENERIC_64_128(widen, B)(A, B) 1303 1304 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME) 1305 #define FloatPartsN glue(FloatParts,N) 1306 #define FloatPartsW glue(FloatParts,W) 1307 1308 #define N 64 1309 #define W 128 1310 1311 #include "softfloat-parts-addsub.c.inc" 1312 #include "softfloat-parts.c.inc" 1313 1314 #undef N 1315 #undef W 1316 #define N 128 1317 #define W 256 1318 1319 #include "softfloat-parts-addsub.c.inc" 1320 #include "softfloat-parts.c.inc" 1321 1322 #undef N 1323 #undef W 1324 #define N 256 1325 1326 #include "softfloat-parts-addsub.c.inc" 1327 1328 #undef N 1329 #undef W 1330 #undef partsN 1331 #undef FloatPartsN 1332 #undef FloatPartsW 1333 1334 /* 1335 * Pack/unpack routines with a specific FloatFmt. 
1336 */ 1337 1338 static void float16a_unpack_canonical(FloatParts64 *p, float16 f, 1339 float_status *s, const FloatFmt *params) 1340 { 1341 float16_unpack_raw(p, f); 1342 parts_canonicalize(p, s, params); 1343 } 1344 1345 static void float16_unpack_canonical(FloatParts64 *p, float16 f, 1346 float_status *s) 1347 { 1348 float16a_unpack_canonical(p, f, s, &float16_params); 1349 } 1350 1351 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f, 1352 float_status *s) 1353 { 1354 bfloat16_unpack_raw(p, f); 1355 parts_canonicalize(p, s, &bfloat16_params); 1356 } 1357 1358 static float16 float16a_round_pack_canonical(FloatParts64 *p, 1359 float_status *s, 1360 const FloatFmt *params) 1361 { 1362 parts_uncanon(p, s, params); 1363 return float16_pack_raw(p); 1364 } 1365 1366 static float16 float16_round_pack_canonical(FloatParts64 *p, 1367 float_status *s) 1368 { 1369 return float16a_round_pack_canonical(p, s, &float16_params); 1370 } 1371 1372 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p, 1373 float_status *s) 1374 { 1375 parts_uncanon(p, s, &bfloat16_params); 1376 return bfloat16_pack_raw(p); 1377 } 1378 1379 static void float32_unpack_canonical(FloatParts64 *p, float32 f, 1380 float_status *s) 1381 { 1382 float32_unpack_raw(p, f); 1383 parts_canonicalize(p, s, &float32_params); 1384 } 1385 1386 static float32 float32_round_pack_canonical(FloatParts64 *p, 1387 float_status *s) 1388 { 1389 parts_uncanon(p, s, &float32_params); 1390 return float32_pack_raw(p); 1391 } 1392 1393 static void float64_unpack_canonical(FloatParts64 *p, float64 f, 1394 float_status *s) 1395 { 1396 float64_unpack_raw(p, f); 1397 parts_canonicalize(p, s, &float64_params); 1398 } 1399 1400 static float64 float64_round_pack_canonical(FloatParts64 *p, 1401 float_status *s) 1402 { 1403 parts_uncanon(p, s, &float64_params); 1404 return float64_pack_raw(p); 1405 } 1406 1407 static void float128_unpack_canonical(FloatParts128 *p, float128 f, 1408 float_status *s) 1409 { 1410 
float128_unpack_raw(p, f); 1411 parts_canonicalize(p, s, &float128_params); 1412 } 1413 1414 static float128 float128_round_pack_canonical(FloatParts128 *p, 1415 float_status *s) 1416 { 1417 parts_uncanon(p, s, &float128_params); 1418 return float128_pack_raw(p); 1419 } 1420 1421 /* 1422 * Addition and subtraction 1423 */ 1424 1425 static float16 QEMU_FLATTEN 1426 float16_addsub(float16 a, float16 b, float_status *status, bool subtract) 1427 { 1428 FloatParts64 pa, pb, *pr; 1429 1430 float16_unpack_canonical(&pa, a, status); 1431 float16_unpack_canonical(&pb, b, status); 1432 pr = parts_addsub(&pa, &pb, status, subtract); 1433 1434 return float16_round_pack_canonical(pr, status); 1435 } 1436 1437 float16 float16_add(float16 a, float16 b, float_status *status) 1438 { 1439 return float16_addsub(a, b, status, false); 1440 } 1441 1442 float16 float16_sub(float16 a, float16 b, float_status *status) 1443 { 1444 return float16_addsub(a, b, status, true); 1445 } 1446 1447 static float32 QEMU_SOFTFLOAT_ATTR 1448 soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract) 1449 { 1450 FloatParts64 pa, pb, *pr; 1451 1452 float32_unpack_canonical(&pa, a, status); 1453 float32_unpack_canonical(&pb, b, status); 1454 pr = parts_addsub(&pa, &pb, status, subtract); 1455 1456 return float32_round_pack_canonical(pr, status); 1457 } 1458 1459 static float32 soft_f32_add(float32 a, float32 b, float_status *status) 1460 { 1461 return soft_f32_addsub(a, b, status, false); 1462 } 1463 1464 static float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1465 { 1466 return soft_f32_addsub(a, b, status, true); 1467 } 1468 1469 static float64 QEMU_SOFTFLOAT_ATTR 1470 soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract) 1471 { 1472 FloatParts64 pa, pb, *pr; 1473 1474 float64_unpack_canonical(&pa, a, status); 1475 float64_unpack_canonical(&pb, b, status); 1476 pr = parts_addsub(&pa, &pb, status, subtract); 1477 1478 return 
float64_round_pack_canonical(pr, status); 1479 } 1480 1481 static float64 soft_f64_add(float64 a, float64 b, float_status *status) 1482 { 1483 return soft_f64_addsub(a, b, status, false); 1484 } 1485 1486 static float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1487 { 1488 return soft_f64_addsub(a, b, status, true); 1489 } 1490 1491 static float hard_f32_add(float a, float b) 1492 { 1493 return a + b; 1494 } 1495 1496 static float hard_f32_sub(float a, float b) 1497 { 1498 return a - b; 1499 } 1500 1501 static double hard_f64_add(double a, double b) 1502 { 1503 return a + b; 1504 } 1505 1506 static double hard_f64_sub(double a, double b) 1507 { 1508 return a - b; 1509 } 1510 1511 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1512 { 1513 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1514 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1515 } 1516 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1517 } 1518 1519 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1520 { 1521 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1522 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1523 } else { 1524 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1525 } 1526 } 1527 1528 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1529 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1530 { 1531 return float32_gen2(a, b, s, hard, soft, 1532 f32_is_zon2, f32_addsubmul_post); 1533 } 1534 1535 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1536 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1537 { 1538 return float64_gen2(a, b, s, hard, soft, 1539 f64_is_zon2, f64_addsubmul_post); 1540 } 1541 1542 float32 QEMU_FLATTEN 1543 float32_add(float32 a, float32 b, float_status *s) 1544 { 1545 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1546 } 1547 1548 float32 QEMU_FLATTEN 1549 float32_sub(float32 a, float32 b, float_status *s) 1550 { 1551 return float32_addsub(a, b, s, 
hard_f32_sub, soft_f32_sub); 1552 } 1553 1554 float64 QEMU_FLATTEN 1555 float64_add(float64 a, float64 b, float_status *s) 1556 { 1557 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1558 } 1559 1560 float64 QEMU_FLATTEN 1561 float64_sub(float64 a, float64 b, float_status *s) 1562 { 1563 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1564 } 1565 1566 static bfloat16 QEMU_FLATTEN 1567 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract) 1568 { 1569 FloatParts64 pa, pb, *pr; 1570 1571 bfloat16_unpack_canonical(&pa, a, status); 1572 bfloat16_unpack_canonical(&pb, b, status); 1573 pr = parts_addsub(&pa, &pb, status, subtract); 1574 1575 return bfloat16_round_pack_canonical(pr, status); 1576 } 1577 1578 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status) 1579 { 1580 return bfloat16_addsub(a, b, status, false); 1581 } 1582 1583 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status) 1584 { 1585 return bfloat16_addsub(a, b, status, true); 1586 } 1587 1588 static float128 QEMU_FLATTEN 1589 float128_addsub(float128 a, float128 b, float_status *status, bool subtract) 1590 { 1591 FloatParts128 pa, pb, *pr; 1592 1593 float128_unpack_canonical(&pa, a, status); 1594 float128_unpack_canonical(&pb, b, status); 1595 pr = parts_addsub(&pa, &pb, status, subtract); 1596 1597 return float128_round_pack_canonical(pr, status); 1598 } 1599 1600 float128 float128_add(float128 a, float128 b, float_status *status) 1601 { 1602 return float128_addsub(a, b, status, false); 1603 } 1604 1605 float128 float128_sub(float128 a, float128 b, float_status *status) 1606 { 1607 return float128_addsub(a, b, status, true); 1608 } 1609 1610 /* 1611 * Multiplication 1612 */ 1613 1614 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1615 { 1616 FloatParts64 pa, pb, *pr; 1617 1618 float16_unpack_canonical(&pa, a, status); 1619 float16_unpack_canonical(&pb, b, status); 1620 pr = parts_mul(&pa, &pb, status); 
1621 1622 return float16_round_pack_canonical(pr, status); 1623 } 1624 1625 static float32 QEMU_SOFTFLOAT_ATTR 1626 soft_f32_mul(float32 a, float32 b, float_status *status) 1627 { 1628 FloatParts64 pa, pb, *pr; 1629 1630 float32_unpack_canonical(&pa, a, status); 1631 float32_unpack_canonical(&pb, b, status); 1632 pr = parts_mul(&pa, &pb, status); 1633 1634 return float32_round_pack_canonical(pr, status); 1635 } 1636 1637 static float64 QEMU_SOFTFLOAT_ATTR 1638 soft_f64_mul(float64 a, float64 b, float_status *status) 1639 { 1640 FloatParts64 pa, pb, *pr; 1641 1642 float64_unpack_canonical(&pa, a, status); 1643 float64_unpack_canonical(&pb, b, status); 1644 pr = parts_mul(&pa, &pb, status); 1645 1646 return float64_round_pack_canonical(pr, status); 1647 } 1648 1649 static float hard_f32_mul(float a, float b) 1650 { 1651 return a * b; 1652 } 1653 1654 static double hard_f64_mul(double a, double b) 1655 { 1656 return a * b; 1657 } 1658 1659 float32 QEMU_FLATTEN 1660 float32_mul(float32 a, float32 b, float_status *s) 1661 { 1662 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1663 f32_is_zon2, f32_addsubmul_post); 1664 } 1665 1666 float64 QEMU_FLATTEN 1667 float64_mul(float64 a, float64 b, float_status *s) 1668 { 1669 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1670 f64_is_zon2, f64_addsubmul_post); 1671 } 1672 1673 bfloat16 QEMU_FLATTEN 1674 bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status) 1675 { 1676 FloatParts64 pa, pb, *pr; 1677 1678 bfloat16_unpack_canonical(&pa, a, status); 1679 bfloat16_unpack_canonical(&pb, b, status); 1680 pr = parts_mul(&pa, &pb, status); 1681 1682 return bfloat16_round_pack_canonical(pr, status); 1683 } 1684 1685 float128 QEMU_FLATTEN 1686 float128_mul(float128 a, float128 b, float_status *status) 1687 { 1688 FloatParts128 pa, pb, *pr; 1689 1690 float128_unpack_canonical(&pa, a, status); 1691 float128_unpack_canonical(&pb, b, status); 1692 pr = parts_mul(&pa, &pb, status); 1693 1694 return 
float128_round_pack_canonical(pr, status); 1695 } 1696 1697 /* 1698 * Fused multiply-add 1699 */ 1700 1701 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1702 int flags, float_status *status) 1703 { 1704 FloatParts64 pa, pb, pc, *pr; 1705 1706 float16_unpack_canonical(&pa, a, status); 1707 float16_unpack_canonical(&pb, b, status); 1708 float16_unpack_canonical(&pc, c, status); 1709 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1710 1711 return float16_round_pack_canonical(pr, status); 1712 } 1713 1714 static float32 QEMU_SOFTFLOAT_ATTR 1715 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1716 float_status *status) 1717 { 1718 FloatParts64 pa, pb, pc, *pr; 1719 1720 float32_unpack_canonical(&pa, a, status); 1721 float32_unpack_canonical(&pb, b, status); 1722 float32_unpack_canonical(&pc, c, status); 1723 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1724 1725 return float32_round_pack_canonical(pr, status); 1726 } 1727 1728 static float64 QEMU_SOFTFLOAT_ATTR 1729 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1730 float_status *status) 1731 { 1732 FloatParts64 pa, pb, pc, *pr; 1733 1734 float64_unpack_canonical(&pa, a, status); 1735 float64_unpack_canonical(&pb, b, status); 1736 float64_unpack_canonical(&pc, c, status); 1737 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1738 1739 return float64_round_pack_canonical(pr, status); 1740 } 1741 1742 static bool force_soft_fma; 1743 1744 float32 QEMU_FLATTEN 1745 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1746 { 1747 union_float32 ua, ub, uc, ur; 1748 1749 ua.s = xa; 1750 ub.s = xb; 1751 uc.s = xc; 1752 1753 if (unlikely(!can_use_fpu(s))) { 1754 goto soft; 1755 } 1756 if (unlikely(flags & float_muladd_halve_result)) { 1757 goto soft; 1758 } 1759 1760 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1761 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1762 goto soft; 1763 } 1764 1765 if (unlikely(force_soft_fma)) { 1766 goto soft; 1767 } 
1768 1769 /* 1770 * When (a || b) == 0, there's no need to check for under/over flow, 1771 * since we know the addend is (normal || 0) and the product is 0. 1772 */ 1773 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1774 union_float32 up; 1775 bool prod_sign; 1776 1777 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1778 prod_sign ^= !!(flags & float_muladd_negate_product); 1779 up.s = float32_set_sign(float32_zero, prod_sign); 1780 1781 if (flags & float_muladd_negate_c) { 1782 uc.h = -uc.h; 1783 } 1784 ur.h = up.h + uc.h; 1785 } else { 1786 union_float32 ua_orig = ua; 1787 union_float32 uc_orig = uc; 1788 1789 if (flags & float_muladd_negate_product) { 1790 ua.h = -ua.h; 1791 } 1792 if (flags & float_muladd_negate_c) { 1793 uc.h = -uc.h; 1794 } 1795 1796 ur.h = fmaf(ua.h, ub.h, uc.h); 1797 1798 if (unlikely(f32_is_inf(ur))) { 1799 float_raise(float_flag_overflow, s); 1800 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1801 ua = ua_orig; 1802 uc = uc_orig; 1803 goto soft; 1804 } 1805 } 1806 if (flags & float_muladd_negate_result) { 1807 return float32_chs(ur.s); 1808 } 1809 return ur.s; 1810 1811 soft: 1812 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1813 } 1814 1815 float64 QEMU_FLATTEN 1816 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1817 { 1818 union_float64 ua, ub, uc, ur; 1819 1820 ua.s = xa; 1821 ub.s = xb; 1822 uc.s = xc; 1823 1824 if (unlikely(!can_use_fpu(s))) { 1825 goto soft; 1826 } 1827 if (unlikely(flags & float_muladd_halve_result)) { 1828 goto soft; 1829 } 1830 1831 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1832 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1833 goto soft; 1834 } 1835 1836 if (unlikely(force_soft_fma)) { 1837 goto soft; 1838 } 1839 1840 /* 1841 * When (a || b) == 0, there's no need to check for under/over flow, 1842 * since we know the addend is (normal || 0) and the product is 0. 
1843 */ 1844 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1845 union_float64 up; 1846 bool prod_sign; 1847 1848 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1849 prod_sign ^= !!(flags & float_muladd_negate_product); 1850 up.s = float64_set_sign(float64_zero, prod_sign); 1851 1852 if (flags & float_muladd_negate_c) { 1853 uc.h = -uc.h; 1854 } 1855 ur.h = up.h + uc.h; 1856 } else { 1857 union_float64 ua_orig = ua; 1858 union_float64 uc_orig = uc; 1859 1860 if (flags & float_muladd_negate_product) { 1861 ua.h = -ua.h; 1862 } 1863 if (flags & float_muladd_negate_c) { 1864 uc.h = -uc.h; 1865 } 1866 1867 ur.h = fma(ua.h, ub.h, uc.h); 1868 1869 if (unlikely(f64_is_inf(ur))) { 1870 float_raise(float_flag_overflow, s); 1871 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1872 ua = ua_orig; 1873 uc = uc_orig; 1874 goto soft; 1875 } 1876 } 1877 if (flags & float_muladd_negate_result) { 1878 return float64_chs(ur.s); 1879 } 1880 return ur.s; 1881 1882 soft: 1883 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1884 } 1885 1886 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c, 1887 int flags, float_status *status) 1888 { 1889 FloatParts64 pa, pb, pc, *pr; 1890 1891 bfloat16_unpack_canonical(&pa, a, status); 1892 bfloat16_unpack_canonical(&pb, b, status); 1893 bfloat16_unpack_canonical(&pc, c, status); 1894 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1895 1896 return bfloat16_round_pack_canonical(pr, status); 1897 } 1898 1899 float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c, 1900 int flags, float_status *status) 1901 { 1902 FloatParts128 pa, pb, pc, *pr; 1903 1904 float128_unpack_canonical(&pa, a, status); 1905 float128_unpack_canonical(&pb, b, status); 1906 float128_unpack_canonical(&pc, c, status); 1907 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1908 1909 return float128_round_pack_canonical(pr, status); 1910 } 1911 1912 /* 1913 * Division 1914 */ 1915 1916 float16 float16_div(float16 a, float16 
b, float_status *status) 1917 { 1918 FloatParts64 pa, pb, *pr; 1919 1920 float16_unpack_canonical(&pa, a, status); 1921 float16_unpack_canonical(&pb, b, status); 1922 pr = parts_div(&pa, &pb, status); 1923 1924 return float16_round_pack_canonical(pr, status); 1925 } 1926 1927 static float32 QEMU_SOFTFLOAT_ATTR 1928 soft_f32_div(float32 a, float32 b, float_status *status) 1929 { 1930 FloatParts64 pa, pb, *pr; 1931 1932 float32_unpack_canonical(&pa, a, status); 1933 float32_unpack_canonical(&pb, b, status); 1934 pr = parts_div(&pa, &pb, status); 1935 1936 return float32_round_pack_canonical(pr, status); 1937 } 1938 1939 static float64 QEMU_SOFTFLOAT_ATTR 1940 soft_f64_div(float64 a, float64 b, float_status *status) 1941 { 1942 FloatParts64 pa, pb, *pr; 1943 1944 float64_unpack_canonical(&pa, a, status); 1945 float64_unpack_canonical(&pb, b, status); 1946 pr = parts_div(&pa, &pb, status); 1947 1948 return float64_round_pack_canonical(pr, status); 1949 } 1950 1951 static float hard_f32_div(float a, float b) 1952 { 1953 return a / b; 1954 } 1955 1956 static double hard_f64_div(double a, double b) 1957 { 1958 return a / b; 1959 } 1960 1961 static bool f32_div_pre(union_float32 a, union_float32 b) 1962 { 1963 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1964 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1965 fpclassify(b.h) == FP_NORMAL; 1966 } 1967 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1968 } 1969 1970 static bool f64_div_pre(union_float64 a, union_float64 b) 1971 { 1972 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1973 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1974 fpclassify(b.h) == FP_NORMAL; 1975 } 1976 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1977 } 1978 1979 static bool f32_div_post(union_float32 a, union_float32 b) 1980 { 1981 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1982 return fpclassify(a.h) != FP_ZERO; 1983 } 1984 return !float32_is_zero(a.s); 1985 } 1986 1987 static bool 
f64_div_post(union_float64 a, union_float64 b) 1988 { 1989 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1990 return fpclassify(a.h) != FP_ZERO; 1991 } 1992 return !float64_is_zero(a.s); 1993 } 1994 1995 float32 QEMU_FLATTEN 1996 float32_div(float32 a, float32 b, float_status *s) 1997 { 1998 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1999 f32_div_pre, f32_div_post); 2000 } 2001 2002 float64 QEMU_FLATTEN 2003 float64_div(float64 a, float64 b, float_status *s) 2004 { 2005 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 2006 f64_div_pre, f64_div_post); 2007 } 2008 2009 bfloat16 QEMU_FLATTEN 2010 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status) 2011 { 2012 FloatParts64 pa, pb, *pr; 2013 2014 bfloat16_unpack_canonical(&pa, a, status); 2015 bfloat16_unpack_canonical(&pb, b, status); 2016 pr = parts_div(&pa, &pb, status); 2017 2018 return bfloat16_round_pack_canonical(pr, status); 2019 } 2020 2021 float128 QEMU_FLATTEN 2022 float128_div(float128 a, float128 b, float_status *status) 2023 { 2024 FloatParts128 pa, pb, *pr; 2025 2026 float128_unpack_canonical(&pa, a, status); 2027 float128_unpack_canonical(&pb, b, status); 2028 pr = parts_div(&pa, &pb, status); 2029 2030 return float128_round_pack_canonical(pr, status); 2031 } 2032 2033 /* 2034 * Float to Float conversions 2035 * 2036 * Returns the result of converting one float format to another. The 2037 * conversion is performed according to the IEC/IEEE Standard for 2038 * Binary Floating-Point Arithmetic. 2039 * 2040 * The float_to_float helper only needs to take care of raising 2041 * invalid exceptions and handling the conversion on NaNs. 2042 */ 2043 2044 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf, 2045 float_status *s) 2046 { 2047 if (dstf->arm_althp) { 2048 switch (a.cls) { 2049 case float_class_qnan: 2050 case float_class_snan: 2051 /* There is no NaN in the destination format. Raise Invalid 2052 * and return a zero with the sign of the input NaN. 
2053 */ 2054 float_raise(float_flag_invalid, s); 2055 a.cls = float_class_zero; 2056 a.frac = 0; 2057 a.exp = 0; 2058 break; 2059 2060 case float_class_inf: 2061 /* There is no Inf in the destination format. Raise Invalid 2062 * and return the maximum normal with the correct sign. 2063 */ 2064 float_raise(float_flag_invalid, s); 2065 a.cls = float_class_normal; 2066 a.exp = dstf->exp_max; 2067 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 2068 break; 2069 2070 default: 2071 break; 2072 } 2073 } else if (is_nan(a.cls)) { 2074 parts_return_nan(&a, s); 2075 } 2076 return a; 2077 } 2078 2079 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 2080 { 2081 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2082 FloatParts64 pa, pr; 2083 2084 float16a_unpack_canonical(&pa, a, s, fmt16); 2085 pr = float_to_float(pa, &float32_params, s); 2086 return float32_round_pack_canonical(&pr, s); 2087 } 2088 2089 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 2090 { 2091 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2092 FloatParts64 pa, pr; 2093 2094 float16a_unpack_canonical(&pa, a, s, fmt16); 2095 pr = float_to_float(pa, &float64_params, s); 2096 return float64_round_pack_canonical(&pr, s); 2097 } 2098 2099 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 2100 { 2101 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 2102 FloatParts64 pa, pr; 2103 2104 float32_unpack_canonical(&pa, a, s); 2105 pr = float_to_float(pa, fmt16, s); 2106 return float16a_round_pack_canonical(&pr, s, fmt16); 2107 } 2108 2109 static float64 QEMU_SOFTFLOAT_ATTR 2110 soft_float32_to_float64(float32 a, float_status *s) 2111 { 2112 FloatParts64 pa, pr; 2113 2114 float32_unpack_canonical(&pa, a, s); 2115 pr = float_to_float(pa, &float64_params, s); 2116 return float64_round_pack_canonical(&pr, s); 2117 } 2118 2119 float64 float32_to_float64(float32 a, float_status *s) 2120 { 2121 if (likely(float32_is_normal(a))) { 2122 /* Widening conversion can never produce inexact results. */ 2123 union_float32 uf; 2124 union_float64 ud; 2125 uf.s = a; 2126 ud.h = uf.h; 2127 return ud.s; 2128 } else if (float32_is_zero(a)) { 2129 return float64_set_sign(float64_zero, float32_is_neg(a)); 2130 } else { 2131 return soft_float32_to_float64(a, s); 2132 } 2133 } 2134 2135 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 2136 { 2137 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 2138 FloatParts64 pa, pr; 2139 2140 float64_unpack_canonical(&pa, a, s); 2141 pr = float_to_float(pa, fmt16, s); 2142 return float16a_round_pack_canonical(&pr, s, fmt16); 2143 } 2144 2145 float32 float64_to_float32(float64 a, float_status *s) 2146 { 2147 FloatParts64 pa, pr; 2148 2149 float64_unpack_canonical(&pa, a, s); 2150 pr = float_to_float(pa, &float32_params, s); 2151 return float32_round_pack_canonical(&pr, s); 2152 } 2153 2154 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2155 { 2156 FloatParts64 pa, pr; 2157 2158 bfloat16_unpack_canonical(&pa, a, s); 2159 pr = float_to_float(pa, &float32_params, s); 2160 return float32_round_pack_canonical(&pr, s); 2161 } 2162 2163 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2164 { 2165 FloatParts64 pa, pr; 2166 2167 bfloat16_unpack_canonical(&pa, a, s); 2168 pr = float_to_float(pa, &float64_params, s); 2169 return float64_round_pack_canonical(&pr, s); 2170 } 2171 2172 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2173 { 2174 FloatParts64 pa, pr; 2175 2176 float32_unpack_canonical(&pa, a, s); 2177 pr = float_to_float(pa, &bfloat16_params, s); 2178 return bfloat16_round_pack_canonical(&pr, s); 2179 } 2180 2181 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2182 { 2183 FloatParts64 pa, pr; 2184 2185 float64_unpack_canonical(&pa, a, s); 2186 pr = float_to_float(pa, &bfloat16_params, s); 2187 return bfloat16_round_pack_canonical(&pr, s); 2188 } 2189 2190 /* 2191 * Rounds the floating-point value `a' to an integer, and returns the 2192 * result as a floating-point value. The operation is performed 2193 * according to the IEC/IEEE Standard for Binary Floating-Point 2194 * Arithmetic. 
 */

/*
 * Round the decomposed value 'a' to an integral value, in place,
 * honouring rounding mode 'rmode'.  'scale' is a power-of-two
 * pre-scaling applied to the exponent (clamped so it cannot
 * overflow an int).  NaNs are canonicalised; zero and inf are
 * already integral.
 */
static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: the result is either 0 or +/-1 */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* > 0.5 rounds to 1; exactly 0.5 ties to even, i.e. 0 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* >= 0.5 rounds away from zero */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /*
             * Mixed integer/fraction case.  frac_lsb is the weight of
             * the integer LSB within frac; rnd_mask covers the pure
             * fraction bits; rnd_even_mask additionally includes the
             * integer LSB (used to detect the "exact tie with even
             * LSB" case for round-to-nearest-even).
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* add half-LSB, except on an exact tie with even LSB */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                /* adding rnd_mask then truncating yields the ceiling */
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* bump only if the integer LSB is currently even */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* carry out of the fraction: renormalise */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/* Round to integral value, using the status rounding mode. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */

/*
 * Common helper: round 'in' to an integer and saturate into
 * [min, max].  On NaN, inf or overflow the exception flags are reset
 * to 'orig_flags' plus 'invalid', which deliberately discards any
 * 'inexact' that round_to_int() may have raised on the way.
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* magnitude too large for uint64: force the overflow path */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t)min is the magnitude of 'min' without UB */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/*
 * Per-width signed conversions with explicit rounding mode and
 * power-of-two scale: unpack, then round/saturate via the helper.
 */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Signed conversions using the rounding mode from float_status. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Signed conversions that always truncate (round toward zero). */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

/* bfloat16 signed conversions with explicit rounding mode and scale. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* bfloat16 signed conversions using the status rounding mode. */
int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* bfloat16 signed conversions that truncate (round toward zero). */
int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

/*
 * Common helper: round 'in' to an integer and saturate into [0, max].
 * As with the signed variant, on NaN, inf, negative input or overflow
 * the flags are reset to 'orig_flags' plus 'invalid', discarding any
 * 'inexact' raised by the rounding step.
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* negative and non-zero after rounding: invalid, clamp to 0 */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}

/*
 * Per-width unsigned conversions with explicit rounding mode and
 * power-of-two scale.
 */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2762 2763 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2764 float_status *s) 2765 { 2766 FloatParts64 p; 2767 2768 float64_unpack_canonical(&p, a, s); 2769 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2770 } 2771 2772 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2773 float_status *s) 2774 { 2775 FloatParts64 p; 2776 2777 float64_unpack_canonical(&p, a, s); 2778 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2779 } 2780 2781 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2782 float_status *s) 2783 { 2784 FloatParts64 p; 2785 2786 float64_unpack_canonical(&p, a, s); 2787 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2788 } 2789 2790 uint8_t float16_to_uint8(float16 a, float_status *s) 2791 { 2792 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2793 } 2794 2795 uint16_t float16_to_uint16(float16 a, float_status *s) 2796 { 2797 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2798 } 2799 2800 uint32_t float16_to_uint32(float16 a, float_status *s) 2801 { 2802 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2803 } 2804 2805 uint64_t float16_to_uint64(float16 a, float_status *s) 2806 { 2807 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2808 } 2809 2810 uint16_t float32_to_uint16(float32 a, float_status *s) 2811 { 2812 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2813 } 2814 2815 uint32_t float32_to_uint32(float32 a, float_status *s) 2816 { 2817 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2818 } 2819 2820 uint64_t float32_to_uint64(float32 a, float_status *s) 2821 { 2822 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2823 } 2824 2825 uint16_t float64_to_uint16(float64 a, float_status *s) 2826 { 2827 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2828 } 2829 2830 uint32_t 
float64_to_uint32(float64 a, float_status *s) 2831 { 2832 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2833 } 2834 2835 uint64_t float64_to_uint64(float64 a, float_status *s) 2836 { 2837 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2838 } 2839 2840 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2841 { 2842 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2843 } 2844 2845 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2846 { 2847 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2848 } 2849 2850 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2851 { 2852 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2853 } 2854 2855 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2856 { 2857 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2858 } 2859 2860 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2861 { 2862 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2863 } 2864 2865 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2866 { 2867 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2868 } 2869 2870 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2871 { 2872 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2873 } 2874 2875 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2876 { 2877 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2878 } 2879 2880 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2881 { 2882 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2883 } 2884 2885 /* 2886 * Returns the result of converting the bfloat16 value `a' to 2887 * the unsigned integer format. 
2888 */ 2889 2890 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2891 int scale, float_status *s) 2892 { 2893 FloatParts64 p; 2894 2895 bfloat16_unpack_canonical(&p, a, s); 2896 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2897 } 2898 2899 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2900 int scale, float_status *s) 2901 { 2902 FloatParts64 p; 2903 2904 bfloat16_unpack_canonical(&p, a, s); 2905 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2906 } 2907 2908 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2909 int scale, float_status *s) 2910 { 2911 FloatParts64 p; 2912 2913 bfloat16_unpack_canonical(&p, a, s); 2914 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2915 } 2916 2917 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2918 { 2919 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2920 } 2921 2922 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2923 { 2924 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2925 } 2926 2927 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2928 { 2929 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2930 } 2931 2932 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2933 { 2934 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2935 } 2936 2937 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2938 { 2939 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2940 } 2941 2942 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 2943 { 2944 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2945 } 2946 2947 /* 2948 * Integer to float conversions 2949 * 2950 * Returns the result of converting the two's complement integer `a' 2951 * to the floating-point format. 
 The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/*
 * Decompose the signed integer 'a' into FloatParts64 form,
 * applying the (clamped) power-of-two 'scale' to the exponent.
 * The result is exact; rounding happens later when packing.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* negate in unsigned arithmetic; handles INT64_MIN safely */
            f = -f;
            r.sign = true;
        }
        shift = clz64(f);
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}

/* Signed int -> float16 conversions (narrower ints widen via int64). */
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

/* Signed int -> float32 conversions. */
float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

/* Signed int -> float64 conversions. */
float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3120 */ 3121 3122 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status) 3123 { 3124 FloatParts64 r = { .sign = false }; 3125 int shift; 3126 3127 if (a == 0) { 3128 r.cls = float_class_zero; 3129 } else { 3130 scale = MIN(MAX(scale, -0x10000), 0x10000); 3131 shift = clz64(a); 3132 r.cls = float_class_normal; 3133 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3134 r.frac = a << shift; 3135 } 3136 3137 return r; 3138 } 3139 3140 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3141 { 3142 FloatParts64 pa = uint_to_float(a, scale, status); 3143 return float16_round_pack_canonical(&pa, status); 3144 } 3145 3146 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3147 { 3148 return uint64_to_float16_scalbn(a, scale, status); 3149 } 3150 3151 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3152 { 3153 return uint64_to_float16_scalbn(a, scale, status); 3154 } 3155 3156 float16 uint64_to_float16(uint64_t a, float_status *status) 3157 { 3158 return uint64_to_float16_scalbn(a, 0, status); 3159 } 3160 3161 float16 uint32_to_float16(uint32_t a, float_status *status) 3162 { 3163 return uint64_to_float16_scalbn(a, 0, status); 3164 } 3165 3166 float16 uint16_to_float16(uint16_t a, float_status *status) 3167 { 3168 return uint64_to_float16_scalbn(a, 0, status); 3169 } 3170 3171 float16 uint8_to_float16(uint8_t a, float_status *status) 3172 { 3173 return uint64_to_float16_scalbn(a, 0, status); 3174 } 3175 3176 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3177 { 3178 FloatParts64 pa = uint_to_float(a, scale, status); 3179 return float32_round_pack_canonical(&pa, status); 3180 } 3181 3182 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3183 { 3184 return uint64_to_float32_scalbn(a, scale, status); 3185 } 3186 3187 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3188 { 3189 return 
uint64_to_float32_scalbn(a, scale, status); 3190 } 3191 3192 float32 uint64_to_float32(uint64_t a, float_status *status) 3193 { 3194 return uint64_to_float32_scalbn(a, 0, status); 3195 } 3196 3197 float32 uint32_to_float32(uint32_t a, float_status *status) 3198 { 3199 return uint64_to_float32_scalbn(a, 0, status); 3200 } 3201 3202 float32 uint16_to_float32(uint16_t a, float_status *status) 3203 { 3204 return uint64_to_float32_scalbn(a, 0, status); 3205 } 3206 3207 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3208 { 3209 FloatParts64 pa = uint_to_float(a, scale, status); 3210 return float64_round_pack_canonical(&pa, status); 3211 } 3212 3213 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3214 { 3215 return uint64_to_float64_scalbn(a, scale, status); 3216 } 3217 3218 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3219 { 3220 return uint64_to_float64_scalbn(a, scale, status); 3221 } 3222 3223 float64 uint64_to_float64(uint64_t a, float_status *status) 3224 { 3225 return uint64_to_float64_scalbn(a, 0, status); 3226 } 3227 3228 float64 uint32_to_float64(uint32_t a, float_status *status) 3229 { 3230 return uint64_to_float64_scalbn(a, 0, status); 3231 } 3232 3233 float64 uint16_to_float64(uint16_t a, float_status *status) 3234 { 3235 return uint64_to_float64_scalbn(a, 0, status); 3236 } 3237 3238 /* 3239 * Returns the result of converting the unsigned integer `a' to the 3240 * bfloat16 format. 
3241 */ 3242 3243 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3244 { 3245 FloatParts64 pa = uint_to_float(a, scale, status); 3246 return bfloat16_round_pack_canonical(&pa, status); 3247 } 3248 3249 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3250 { 3251 return uint64_to_bfloat16_scalbn(a, scale, status); 3252 } 3253 3254 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3255 { 3256 return uint64_to_bfloat16_scalbn(a, scale, status); 3257 } 3258 3259 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3260 { 3261 return uint64_to_bfloat16_scalbn(a, 0, status); 3262 } 3263 3264 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3265 { 3266 return uint64_to_bfloat16_scalbn(a, 0, status); 3267 } 3268 3269 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3270 { 3271 return uint64_to_bfloat16_scalbn(a, 0, status); 3272 } 3273 3274 /* Float Min/Max */ 3275 /* min() and max() functions. These can't be implemented as 3276 * 'compare and pick one input' because that would mishandle 3277 * NaNs and +0 vs -0. 3278 * 3279 * minnum() and maxnum() functions. These are similar to the min() 3280 * and max() functions but if one of the arguments is a QNaN and 3281 * the other is numerical then the numerical argument is returned. 3282 * SNaNs will get quietened before being returned. 3283 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3284 * and maxNum() operations. min() and max() are the typical min/max 3285 * semantics provided by many CPUs which predate that specification. 3286 * 3287 * minnummag() and maxnummag() functions correspond to minNumMag() 3288 * and minNumMag() from the IEEE-754 2008. 
3289 */ 3290 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin, 3291 bool ieee, bool ismag, float_status *s) 3292 { 3293 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 3294 if (ieee) { 3295 /* Takes two floating-point values `a' and `b', one of 3296 * which is a NaN, and returns the appropriate NaN 3297 * result. If either `a' or `b' is a signaling NaN, 3298 * the invalid exception is raised. 3299 */ 3300 if (is_snan(a.cls) || is_snan(b.cls)) { 3301 return *parts_pick_nan(&a, &b, s); 3302 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 3303 return b; 3304 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 3305 return a; 3306 } 3307 } 3308 return *parts_pick_nan(&a, &b, s); 3309 } else { 3310 int a_exp, b_exp; 3311 3312 switch (a.cls) { 3313 case float_class_normal: 3314 a_exp = a.exp; 3315 break; 3316 case float_class_inf: 3317 a_exp = INT_MAX; 3318 break; 3319 case float_class_zero: 3320 a_exp = INT_MIN; 3321 break; 3322 default: 3323 g_assert_not_reached(); 3324 break; 3325 } 3326 switch (b.cls) { 3327 case float_class_normal: 3328 b_exp = b.exp; 3329 break; 3330 case float_class_inf: 3331 b_exp = INT_MAX; 3332 break; 3333 case float_class_zero: 3334 b_exp = INT_MIN; 3335 break; 3336 default: 3337 g_assert_not_reached(); 3338 break; 3339 } 3340 3341 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 3342 bool a_less = a_exp < b_exp; 3343 if (a_exp == b_exp) { 3344 a_less = a.frac < b.frac; 3345 } 3346 return a_less ^ ismin ? b : a; 3347 } 3348 3349 if (a.sign == b.sign) { 3350 bool a_less = a_exp < b_exp; 3351 if (a_exp == b_exp) { 3352 a_less = a.frac < b.frac; 3353 } 3354 return a.sign ^ a_less ^ ismin ? b : a; 3355 } else { 3356 return a.sign ^ ismin ? 
b : a; 3357 } 3358 } 3359 } 3360 3361 #define MINMAX(sz, name, ismin, isiee, ismag) \ 3362 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 3363 float_status *s) \ 3364 { \ 3365 FloatParts64 pa, pb, pr; \ 3366 float ## sz ## _unpack_canonical(&pa, a, s); \ 3367 float ## sz ## _unpack_canonical(&pb, b, s); \ 3368 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3369 return float ## sz ## _round_pack_canonical(&pr, s); \ 3370 } 3371 3372 MINMAX(16, min, true, false, false) 3373 MINMAX(16, minnum, true, true, false) 3374 MINMAX(16, minnummag, true, true, true) 3375 MINMAX(16, max, false, false, false) 3376 MINMAX(16, maxnum, false, true, false) 3377 MINMAX(16, maxnummag, false, true, true) 3378 3379 MINMAX(32, min, true, false, false) 3380 MINMAX(32, minnum, true, true, false) 3381 MINMAX(32, minnummag, true, true, true) 3382 MINMAX(32, max, false, false, false) 3383 MINMAX(32, maxnum, false, true, false) 3384 MINMAX(32, maxnummag, false, true, true) 3385 3386 MINMAX(64, min, true, false, false) 3387 MINMAX(64, minnum, true, true, false) 3388 MINMAX(64, minnummag, true, true, true) 3389 MINMAX(64, max, false, false, false) 3390 MINMAX(64, maxnum, false, true, false) 3391 MINMAX(64, maxnummag, false, true, true) 3392 3393 #undef MINMAX 3394 3395 #define BF16_MINMAX(name, ismin, isiee, ismag) \ 3396 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \ 3397 { \ 3398 FloatParts64 pa, pb, pr; \ 3399 bfloat16_unpack_canonical(&pa, a, s); \ 3400 bfloat16_unpack_canonical(&pb, b, s); \ 3401 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3402 return bfloat16_round_pack_canonical(&pr, s); \ 3403 } 3404 3405 BF16_MINMAX(min, true, false, false) 3406 BF16_MINMAX(minnum, true, true, false) 3407 BF16_MINMAX(minnummag, true, true, true) 3408 BF16_MINMAX(max, false, false, false) 3409 BF16_MINMAX(maxnum, false, true, false) 3410 BF16_MINMAX(maxnummag, false, true, true) 3411 3412 #undef BF16_MINMAX 3413 3414 /* Floating point compare 
*/ 3415 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet, 3416 float_status *s) 3417 { 3418 if (is_nan(a.cls) || is_nan(b.cls)) { 3419 if (!is_quiet || 3420 a.cls == float_class_snan || 3421 b.cls == float_class_snan) { 3422 float_raise(float_flag_invalid, s); 3423 } 3424 return float_relation_unordered; 3425 } 3426 3427 if (a.cls == float_class_zero) { 3428 if (b.cls == float_class_zero) { 3429 return float_relation_equal; 3430 } 3431 return b.sign ? float_relation_greater : float_relation_less; 3432 } else if (b.cls == float_class_zero) { 3433 return a.sign ? float_relation_less : float_relation_greater; 3434 } 3435 3436 /* The only really important thing about infinity is its sign. If 3437 * both are infinities the sign marks the smallest of the two. 3438 */ 3439 if (a.cls == float_class_inf) { 3440 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 3441 return float_relation_equal; 3442 } 3443 return a.sign ? float_relation_less : float_relation_greater; 3444 } else if (b.cls == float_class_inf) { 3445 return b.sign ? float_relation_greater : float_relation_less; 3446 } 3447 3448 if (a.sign != b.sign) { 3449 return a.sign ? float_relation_less : float_relation_greater; 3450 } 3451 3452 if (a.exp == b.exp) { 3453 if (a.frac == b.frac) { 3454 return float_relation_equal; 3455 } 3456 if (a.sign) { 3457 return a.frac > b.frac ? 3458 float_relation_less : float_relation_greater; 3459 } else { 3460 return a.frac > b.frac ? 3461 float_relation_greater : float_relation_less; 3462 } 3463 } else { 3464 if (a.sign) { 3465 return a.exp > b.exp ? float_relation_less : float_relation_greater; 3466 } else { 3467 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 3468 } 3469 } 3470 } 3471 3472 #define COMPARE(name, attr, sz) \ 3473 static int attr \ 3474 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 3475 { \ 3476 FloatParts64 pa, pb; \ 3477 float ## sz ## _unpack_canonical(&pa, a, s); \ 3478 float ## sz ## _unpack_canonical(&pb, b, s); \ 3479 return compare_floats(pa, pb, is_quiet, s); \ 3480 } 3481 3482 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 3483 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 3484 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 3485 3486 #undef COMPARE 3487 3488 FloatRelation float16_compare(float16 a, float16 b, float_status *s) 3489 { 3490 return soft_f16_compare(a, b, false, s); 3491 } 3492 3493 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s) 3494 { 3495 return soft_f16_compare(a, b, true, s); 3496 } 3497 3498 static FloatRelation QEMU_FLATTEN 3499 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 3500 { 3501 union_float32 ua, ub; 3502 3503 ua.s = xa; 3504 ub.s = xb; 3505 3506 if (QEMU_NO_HARDFLOAT) { 3507 goto soft; 3508 } 3509 3510 float32_input_flush2(&ua.s, &ub.s, s); 3511 if (isgreaterequal(ua.h, ub.h)) { 3512 if (isgreater(ua.h, ub.h)) { 3513 return float_relation_greater; 3514 } 3515 return float_relation_equal; 3516 } 3517 if (likely(isless(ua.h, ub.h))) { 3518 return float_relation_less; 3519 } 3520 /* The only condition remaining is unordered. 3521 * Fall through to set flags. 
3522 */ 3523 soft: 3524 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 3525 } 3526 3527 FloatRelation float32_compare(float32 a, float32 b, float_status *s) 3528 { 3529 return f32_compare(a, b, false, s); 3530 } 3531 3532 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s) 3533 { 3534 return f32_compare(a, b, true, s); 3535 } 3536 3537 static FloatRelation QEMU_FLATTEN 3538 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 3539 { 3540 union_float64 ua, ub; 3541 3542 ua.s = xa; 3543 ub.s = xb; 3544 3545 if (QEMU_NO_HARDFLOAT) { 3546 goto soft; 3547 } 3548 3549 float64_input_flush2(&ua.s, &ub.s, s); 3550 if (isgreaterequal(ua.h, ub.h)) { 3551 if (isgreater(ua.h, ub.h)) { 3552 return float_relation_greater; 3553 } 3554 return float_relation_equal; 3555 } 3556 if (likely(isless(ua.h, ub.h))) { 3557 return float_relation_less; 3558 } 3559 /* The only condition remaining is unordered. 3560 * Fall through to set flags. 3561 */ 3562 soft: 3563 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3564 } 3565 3566 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3567 { 3568 return f64_compare(a, b, false, s); 3569 } 3570 3571 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3572 { 3573 return f64_compare(a, b, true, s); 3574 } 3575 3576 static FloatRelation QEMU_FLATTEN 3577 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s) 3578 { 3579 FloatParts64 pa, pb; 3580 3581 bfloat16_unpack_canonical(&pa, a, s); 3582 bfloat16_unpack_canonical(&pb, b, s); 3583 return compare_floats(pa, pb, is_quiet, s); 3584 } 3585 3586 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s) 3587 { 3588 return soft_bf16_compare(a, b, false, s); 3589 } 3590 3591 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s) 3592 { 3593 return soft_bf16_compare(a, b, true, s); 3594 } 3595 3596 /* Multiply A by 2 raised to the power N. 
*/ 3597 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3598 { 3599 if (unlikely(is_nan(a.cls))) { 3600 parts_return_nan(&a, s); 3601 } 3602 if (a.cls == float_class_normal) { 3603 /* The largest float type (even though not supported by FloatParts64) 3604 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3605 * still allows rounding to infinity, without allowing overflow 3606 * within the int32_t that backs FloatParts64.exp. 3607 */ 3608 n = MIN(MAX(n, -0x10000), 0x10000); 3609 a.exp += n; 3610 } 3611 return a; 3612 } 3613 3614 float16 float16_scalbn(float16 a, int n, float_status *status) 3615 { 3616 FloatParts64 pa, pr; 3617 3618 float16_unpack_canonical(&pa, a, status); 3619 pr = scalbn_decomposed(pa, n, status); 3620 return float16_round_pack_canonical(&pr, status); 3621 } 3622 3623 float32 float32_scalbn(float32 a, int n, float_status *status) 3624 { 3625 FloatParts64 pa, pr; 3626 3627 float32_unpack_canonical(&pa, a, status); 3628 pr = scalbn_decomposed(pa, n, status); 3629 return float32_round_pack_canonical(&pr, status); 3630 } 3631 3632 float64 float64_scalbn(float64 a, int n, float_status *status) 3633 { 3634 FloatParts64 pa, pr; 3635 3636 float64_unpack_canonical(&pa, a, status); 3637 pr = scalbn_decomposed(pa, n, status); 3638 return float64_round_pack_canonical(&pr, status); 3639 } 3640 3641 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3642 { 3643 FloatParts64 pa, pr; 3644 3645 bfloat16_unpack_canonical(&pa, a, status); 3646 pr = scalbn_decomposed(pa, n, status); 3647 return bfloat16_round_pack_canonical(&pr, status); 3648 } 3649 3650 /* 3651 * Square Root 3652 * 3653 * The old softfloat code did an approximation step before zeroing in 3654 * on the final result. However for simpleness we just compute the 3655 * square root by iterating down from the implicit bit to enough extra 3656 * bits to ensure we get a correctly rounded result. 
3657 * 3658 * This does mean however the calculation is slower than before, 3659 * especially for 64 bit floats. 3660 */ 3661 3662 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p) 3663 { 3664 uint64_t a_frac, r_frac, s_frac; 3665 int bit, last_bit; 3666 3667 if (is_nan(a.cls)) { 3668 parts_return_nan(&a, s); 3669 return a; 3670 } 3671 if (a.cls == float_class_zero) { 3672 return a; /* sqrt(+-0) = +-0 */ 3673 } 3674 if (a.sign) { 3675 float_raise(float_flag_invalid, s); 3676 parts_default_nan(&a, s); 3677 return a; 3678 } 3679 if (a.cls == float_class_inf) { 3680 return a; /* sqrt(+inf) = +inf */ 3681 } 3682 3683 assert(a.cls == float_class_normal); 3684 3685 /* We need two overflow bits at the top. Adding room for that is a 3686 * right shift. If the exponent is odd, we can discard the low bit 3687 * by multiplying the fraction by 2; that's a left shift. Combine 3688 * those and we shift right by 1 if the exponent is odd, otherwise 2. 3689 */ 3690 a_frac = a.frac >> (2 - (a.exp & 1)); 3691 a.exp >>= 1; 3692 3693 /* Bit-by-bit computation of sqrt. */ 3694 r_frac = 0; 3695 s_frac = 0; 3696 3697 /* Iterate from implicit bit down to the 3 extra bits to compute a 3698 * properly rounded result. Remember we've inserted two more bits 3699 * at the top, so these positions are two less. 3700 */ 3701 bit = DECOMPOSED_BINARY_POINT - 2; 3702 last_bit = MAX(p->frac_shift - 4, 0); 3703 do { 3704 uint64_t q = 1ULL << bit; 3705 uint64_t t_frac = s_frac + q; 3706 if (t_frac <= a_frac) { 3707 s_frac = t_frac + q; 3708 a_frac -= t_frac; 3709 r_frac += q; 3710 } 3711 a_frac <<= 1; 3712 } while (--bit >= last_bit); 3713 3714 /* Undo the right shift done above. If there is any remaining 3715 * fraction, the result is inexact. Set the sticky bit. 
3716 */ 3717 a.frac = (r_frac << 2) + (a_frac != 0); 3718 3719 return a; 3720 } 3721 3722 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3723 { 3724 FloatParts64 pa, pr; 3725 3726 float16_unpack_canonical(&pa, a, status); 3727 pr = sqrt_float(pa, status, &float16_params); 3728 return float16_round_pack_canonical(&pr, status); 3729 } 3730 3731 static float32 QEMU_SOFTFLOAT_ATTR 3732 soft_f32_sqrt(float32 a, float_status *status) 3733 { 3734 FloatParts64 pa, pr; 3735 3736 float32_unpack_canonical(&pa, a, status); 3737 pr = sqrt_float(pa, status, &float32_params); 3738 return float32_round_pack_canonical(&pr, status); 3739 } 3740 3741 static float64 QEMU_SOFTFLOAT_ATTR 3742 soft_f64_sqrt(float64 a, float_status *status) 3743 { 3744 FloatParts64 pa, pr; 3745 3746 float64_unpack_canonical(&pa, a, status); 3747 pr = sqrt_float(pa, status, &float64_params); 3748 return float64_round_pack_canonical(&pr, status); 3749 } 3750 3751 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3752 { 3753 union_float32 ua, ur; 3754 3755 ua.s = xa; 3756 if (unlikely(!can_use_fpu(s))) { 3757 goto soft; 3758 } 3759 3760 float32_input_flush1(&ua.s, s); 3761 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3762 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3763 fpclassify(ua.h) == FP_ZERO) || 3764 signbit(ua.h))) { 3765 goto soft; 3766 } 3767 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3768 float32_is_neg(ua.s))) { 3769 goto soft; 3770 } 3771 ur.h = sqrtf(ua.h); 3772 return ur.s; 3773 3774 soft: 3775 return soft_f32_sqrt(ua.s, s); 3776 } 3777 3778 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3779 { 3780 union_float64 ua, ur; 3781 3782 ua.s = xa; 3783 if (unlikely(!can_use_fpu(s))) { 3784 goto soft; 3785 } 3786 3787 float64_input_flush1(&ua.s, s); 3788 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3789 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3790 fpclassify(ua.h) == FP_ZERO) || 3791 signbit(ua.h))) { 3792 goto soft; 3793 } 3794 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3795 float64_is_neg(ua.s))) { 3796 goto soft; 3797 } 3798 ur.h = sqrt(ua.h); 3799 return ur.s; 3800 3801 soft: 3802 return soft_f64_sqrt(ua.s, s); 3803 } 3804 3805 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3806 { 3807 FloatParts64 pa, pr; 3808 3809 bfloat16_unpack_canonical(&pa, a, status); 3810 pr = sqrt_float(pa, status, &bfloat16_params); 3811 return bfloat16_round_pack_canonical(&pr, status); 3812 } 3813 3814 /*---------------------------------------------------------------------------- 3815 | The pattern for a default generated NaN. 3816 *----------------------------------------------------------------------------*/ 3817 3818 float16 float16_default_nan(float_status *status) 3819 { 3820 FloatParts64 p; 3821 3822 parts_default_nan(&p, status); 3823 p.frac >>= float16_params.frac_shift; 3824 return float16_pack_raw(&p); 3825 } 3826 3827 float32 float32_default_nan(float_status *status) 3828 { 3829 FloatParts64 p; 3830 3831 parts_default_nan(&p, status); 3832 p.frac >>= float32_params.frac_shift; 3833 return float32_pack_raw(&p); 3834 } 3835 3836 float64 float64_default_nan(float_status *status) 3837 { 3838 FloatParts64 p; 3839 3840 parts_default_nan(&p, status); 3841 p.frac >>= float64_params.frac_shift; 3842 return float64_pack_raw(&p); 3843 } 3844 3845 float128 float128_default_nan(float_status *status) 3846 { 3847 FloatParts128 p; 3848 3849 parts_default_nan(&p, status); 3850 frac_shr(&p, float128_params.frac_shift); 3851 return float128_pack_raw(&p); 3852 } 3853 3854 bfloat16 bfloat16_default_nan(float_status *status) 3855 { 3856 FloatParts64 p; 3857 3858 parts_default_nan(&p, status); 3859 p.frac >>= bfloat16_params.frac_shift; 3860 return bfloat16_pack_raw(&p); 3861 } 3862 3863 /*---------------------------------------------------------------------------- 3864 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3865 *----------------------------------------------------------------------------*/ 3866 3867 float16 float16_silence_nan(float16 a, float_status *status) 3868 { 3869 FloatParts64 p; 3870 3871 float16_unpack_raw(&p, a); 3872 p.frac <<= float16_params.frac_shift; 3873 parts_silence_nan(&p, status); 3874 p.frac >>= float16_params.frac_shift; 3875 return float16_pack_raw(&p); 3876 } 3877 3878 float32 float32_silence_nan(float32 a, float_status *status) 3879 { 3880 FloatParts64 p; 3881 3882 float32_unpack_raw(&p, a); 3883 p.frac <<= float32_params.frac_shift; 3884 parts_silence_nan(&p, status); 3885 p.frac >>= float32_params.frac_shift; 3886 return float32_pack_raw(&p); 3887 } 3888 3889 float64 float64_silence_nan(float64 a, float_status *status) 3890 { 3891 FloatParts64 p; 3892 3893 float64_unpack_raw(&p, a); 3894 p.frac <<= float64_params.frac_shift; 3895 parts_silence_nan(&p, status); 3896 p.frac >>= float64_params.frac_shift; 3897 return float64_pack_raw(&p); 3898 } 3899 3900 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3901 { 3902 FloatParts64 p; 3903 3904 bfloat16_unpack_raw(&p, a); 3905 p.frac <<= bfloat16_params.frac_shift; 3906 parts_silence_nan(&p, status); 3907 p.frac >>= bfloat16_params.frac_shift; 3908 return bfloat16_pack_raw(&p); 3909 } 3910 3911 float128 float128_silence_nan(float128 a, float_status *status) 3912 { 3913 FloatParts128 p; 3914 3915 float128_unpack_raw(&p, a); 3916 frac_shl(&p, float128_params.frac_shift); 3917 parts_silence_nan(&p, status); 3918 frac_shr(&p, float128_params.frac_shift); 3919 return float128_pack_raw(&p); 3920 } 3921 3922 /*---------------------------------------------------------------------------- 3923 | If `a' is denormal and we are in flush-to-zero mode then set the 3924 | input-denormal exception and return zero. Otherwise just return the value. 
3925 *----------------------------------------------------------------------------*/ 3926 3927 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3928 { 3929 if (p.exp == 0 && p.frac != 0) { 3930 float_raise(float_flag_input_denormal, status); 3931 return true; 3932 } 3933 3934 return false; 3935 } 3936 3937 float16 float16_squash_input_denormal(float16 a, float_status *status) 3938 { 3939 if (status->flush_inputs_to_zero) { 3940 FloatParts64 p; 3941 3942 float16_unpack_raw(&p, a); 3943 if (parts_squash_denormal(p, status)) { 3944 return float16_set_sign(float16_zero, p.sign); 3945 } 3946 } 3947 return a; 3948 } 3949 3950 float32 float32_squash_input_denormal(float32 a, float_status *status) 3951 { 3952 if (status->flush_inputs_to_zero) { 3953 FloatParts64 p; 3954 3955 float32_unpack_raw(&p, a); 3956 if (parts_squash_denormal(p, status)) { 3957 return float32_set_sign(float32_zero, p.sign); 3958 } 3959 } 3960 return a; 3961 } 3962 3963 float64 float64_squash_input_denormal(float64 a, float_status *status) 3964 { 3965 if (status->flush_inputs_to_zero) { 3966 FloatParts64 p; 3967 3968 float64_unpack_raw(&p, a); 3969 if (parts_squash_denormal(p, status)) { 3970 return float64_set_sign(float64_zero, p.sign); 3971 } 3972 } 3973 return a; 3974 } 3975 3976 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3977 { 3978 if (status->flush_inputs_to_zero) { 3979 FloatParts64 p; 3980 3981 bfloat16_unpack_raw(&p, a); 3982 if (parts_squash_denormal(p, status)) { 3983 return bfloat16_set_sign(bfloat16_zero, p.sign); 3984 } 3985 } 3986 return a; 3987 } 3988 3989 /*---------------------------------------------------------------------------- 3990 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3991 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3992 | input. If `zSign' is 1, the input is negated before being converted to an 3993 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input 3994 | is simply rounded to an integer, with the inexact exception raised if the 3995 | input cannot be represented exactly as an integer. However, if the fixed- 3996 | point input is too large, the invalid exception is raised and the largest 3997 | positive or negative integer is returned. 3998 *----------------------------------------------------------------------------*/ 3999 4000 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 4001 float_status *status) 4002 { 4003 int8_t roundingMode; 4004 bool roundNearestEven; 4005 int8_t roundIncrement, roundBits; 4006 int32_t z; 4007 4008 roundingMode = status->float_rounding_mode; 4009 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4010 switch (roundingMode) { 4011 case float_round_nearest_even: 4012 case float_round_ties_away: 4013 roundIncrement = 0x40; 4014 break; 4015 case float_round_to_zero: 4016 roundIncrement = 0; 4017 break; 4018 case float_round_up: 4019 roundIncrement = zSign ? 0 : 0x7f; 4020 break; 4021 case float_round_down: 4022 roundIncrement = zSign ? 0x7f : 0; 4023 break; 4024 case float_round_to_odd: 4025 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 4026 break; 4027 default: 4028 abort(); 4029 } 4030 roundBits = absZ & 0x7F; 4031 absZ = ( absZ + roundIncrement )>>7; 4032 if (!(roundBits ^ 0x40) && roundNearestEven) { 4033 absZ &= ~1; 4034 } 4035 z = absZ; 4036 if ( zSign ) z = - z; 4037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 4038 float_raise(float_flag_invalid, status); 4039 return zSign ? 
INT32_MIN : INT32_MAX; 4040 } 4041 if (roundBits) { 4042 float_raise(float_flag_inexact, status); 4043 } 4044 return z; 4045 4046 } 4047 4048 /*---------------------------------------------------------------------------- 4049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4050 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4051 | and returns the properly rounded 64-bit integer corresponding to the input. 4052 | If `zSign' is 1, the input is negated before being converted to an integer. 4053 | Ordinarily, the fixed-point input is simply rounded to an integer, with 4054 | the inexact exception raised if the input cannot be represented exactly as 4055 | an integer. However, if the fixed-point input is too large, the invalid 4056 | exception is raised and the largest positive or negative integer is 4057 | returned. 4058 *----------------------------------------------------------------------------*/ 4059 4060 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 4061 float_status *status) 4062 { 4063 int8_t roundingMode; 4064 bool roundNearestEven, increment; 4065 int64_t z; 4066 4067 roundingMode = status->float_rounding_mode; 4068 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4069 switch (roundingMode) { 4070 case float_round_nearest_even: 4071 case float_round_ties_away: 4072 increment = ((int64_t) absZ1 < 0); 4073 break; 4074 case float_round_to_zero: 4075 increment = 0; 4076 break; 4077 case float_round_up: 4078 increment = !zSign && absZ1; 4079 break; 4080 case float_round_down: 4081 increment = zSign && absZ1; 4082 break; 4083 case float_round_to_odd: 4084 increment = !(absZ0 & 1) && absZ1; 4085 break; 4086 default: 4087 abort(); 4088 } 4089 if ( increment ) { 4090 ++absZ0; 4091 if ( absZ0 == 0 ) goto overflow; 4092 if (!(absZ1 << 1) && roundNearestEven) { 4093 absZ0 &= ~1; 4094 } 4095 } 4096 z = absZ0; 4097 if ( zSign ) z = - z; 4098 if ( z && ( ( z < 0 ) ^ zSign 
) ) { 4099 overflow: 4100 float_raise(float_flag_invalid, status); 4101 return zSign ? INT64_MIN : INT64_MAX; 4102 } 4103 if (absZ1) { 4104 float_raise(float_flag_inexact, status); 4105 } 4106 return z; 4107 4108 } 4109 4110 /*---------------------------------------------------------------------------- 4111 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4112 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4113 | and returns the properly rounded 64-bit unsigned integer corresponding to the 4114 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 4115 | with the inexact exception raised if the input cannot be represented exactly 4116 | as an integer. However, if the fixed-point input is too large, the invalid 4117 | exception is raised and the largest unsigned integer is returned. 4118 *----------------------------------------------------------------------------*/ 4119 4120 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 4121 uint64_t absZ1, float_status *status) 4122 { 4123 int8_t roundingMode; 4124 bool roundNearestEven, increment; 4125 4126 roundingMode = status->float_rounding_mode; 4127 roundNearestEven = (roundingMode == float_round_nearest_even); 4128 switch (roundingMode) { 4129 case float_round_nearest_even: 4130 case float_round_ties_away: 4131 increment = ((int64_t)absZ1 < 0); 4132 break; 4133 case float_round_to_zero: 4134 increment = 0; 4135 break; 4136 case float_round_up: 4137 increment = !zSign && absZ1; 4138 break; 4139 case float_round_down: 4140 increment = zSign && absZ1; 4141 break; 4142 case float_round_to_odd: 4143 increment = !(absZ0 & 1) && absZ1; 4144 break; 4145 default: 4146 abort(); 4147 } 4148 if (increment) { 4149 ++absZ0; 4150 if (absZ0 == 0) { 4151 float_raise(float_flag_invalid, status); 4152 return UINT64_MAX; 4153 } 4154 if (!(absZ1 << 1) && roundNearestEven) { 4155 absZ0 &= ~1; 4156 } 4157 } 4158 4159 if (zSign && absZ0) { 4160 
float_raise(float_flag_invalid, status); 4161 return 0; 4162 } 4163 4164 if (absZ1) { 4165 float_raise(float_flag_inexact, status); 4166 } 4167 return absZ0; 4168 } 4169 4170 /*---------------------------------------------------------------------------- 4171 | Normalizes the subnormal single-precision floating-point value represented 4172 | by the denormalized significand `aSig'. The normalized exponent and 4173 | significand are stored at the locations pointed to by `zExpPtr' and 4174 | `zSigPtr', respectively. 4175 *----------------------------------------------------------------------------*/ 4176 4177 static void 4178 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4179 { 4180 int8_t shiftCount; 4181 4182 shiftCount = clz32(aSig) - 8; 4183 *zSigPtr = aSig<<shiftCount; 4184 *zExpPtr = 1 - shiftCount; 4185 4186 } 4187 4188 /*---------------------------------------------------------------------------- 4189 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4190 | and significand `zSig', and returns the proper single-precision floating- 4191 | point value corresponding to the abstract input. Ordinarily, the abstract 4192 | value is simply rounded and packed into the single-precision format, with 4193 | the inexact exception raised if the abstract input cannot be represented 4194 | exactly. However, if the abstract value is too large, the overflow and 4195 | inexact exceptions are raised and an infinity or maximal finite value is 4196 | returned. If the abstract value is too small, the input value is rounded to 4197 | a subnormal number, and the underflow and inexact exceptions are raised if 4198 | the abstract input cannot be represented exactly as a subnormal single- 4199 | precision floating-point number. 4200 | The input significand `zSig' has its binary point between bits 30 4201 | and 29, which is 7 bits to the left of the usual location. This shifted 4202 | significand must be normalized or smaller. 
If `zSig' is not normalized, 4203 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4204 | and it must not require rounding. In the usual case that `zSig' is 4205 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4206 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4207 | Binary Floating-Point Arithmetic. 4208 *----------------------------------------------------------------------------*/ 4209 4210 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4211 float_status *status) 4212 { 4213 int8_t roundingMode; 4214 bool roundNearestEven; 4215 int8_t roundIncrement, roundBits; 4216 bool isTiny; 4217 4218 roundingMode = status->float_rounding_mode; 4219 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4220 switch (roundingMode) { 4221 case float_round_nearest_even: 4222 case float_round_ties_away: 4223 roundIncrement = 0x40; 4224 break; 4225 case float_round_to_zero: 4226 roundIncrement = 0; 4227 break; 4228 case float_round_up: 4229 roundIncrement = zSign ? 0 : 0x7f; 4230 break; 4231 case float_round_down: 4232 roundIncrement = zSign ? 0x7f : 0; 4233 break; 4234 case float_round_to_odd: 4235 roundIncrement = zSig & 0x80 ? 
0 : 0x7f; 4236 break; 4237 default: 4238 abort(); 4239 break; 4240 } 4241 roundBits = zSig & 0x7F; 4242 if ( 0xFD <= (uint16_t) zExp ) { 4243 if ( ( 0xFD < zExp ) 4244 || ( ( zExp == 0xFD ) 4245 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 4246 ) { 4247 bool overflow_to_inf = roundingMode != float_round_to_odd && 4248 roundIncrement != 0; 4249 float_raise(float_flag_overflow | float_flag_inexact, status); 4250 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 4251 } 4252 if ( zExp < 0 ) { 4253 if (status->flush_to_zero) { 4254 float_raise(float_flag_output_denormal, status); 4255 return packFloat32(zSign, 0, 0); 4256 } 4257 isTiny = status->tininess_before_rounding 4258 || (zExp < -1) 4259 || (zSig + roundIncrement < 0x80000000); 4260 shift32RightJamming( zSig, - zExp, &zSig ); 4261 zExp = 0; 4262 roundBits = zSig & 0x7F; 4263 if (isTiny && roundBits) { 4264 float_raise(float_flag_underflow, status); 4265 } 4266 if (roundingMode == float_round_to_odd) { 4267 /* 4268 * For round-to-odd case, the roundIncrement depends on 4269 * zSig which just changed. 4270 */ 4271 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 4272 } 4273 } 4274 } 4275 if (roundBits) { 4276 float_raise(float_flag_inexact, status); 4277 } 4278 zSig = ( zSig + roundIncrement )>>7; 4279 if (!(roundBits ^ 0x40) && roundNearestEven) { 4280 zSig &= ~1; 4281 } 4282 if ( zSig == 0 ) zExp = 0; 4283 return packFloat32( zSign, zExp, zSig ); 4284 4285 } 4286 4287 /*---------------------------------------------------------------------------- 4288 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4289 | and significand `zSig', and returns the proper single-precision floating- 4290 | point value corresponding to the abstract input. This routine is just like 4291 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 4292 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4293 | floating-point exponent. 
4294 *----------------------------------------------------------------------------*/ 4295 4296 static float32 4297 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4298 float_status *status) 4299 { 4300 int8_t shiftCount; 4301 4302 shiftCount = clz32(zSig) - 1; 4303 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4304 status); 4305 4306 } 4307 4308 /*---------------------------------------------------------------------------- 4309 | Normalizes the subnormal double-precision floating-point value represented 4310 | by the denormalized significand `aSig'. The normalized exponent and 4311 | significand are stored at the locations pointed to by `zExpPtr' and 4312 | `zSigPtr', respectively. 4313 *----------------------------------------------------------------------------*/ 4314 4315 static void 4316 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4317 { 4318 int8_t shiftCount; 4319 4320 shiftCount = clz64(aSig) - 11; 4321 *zSigPtr = aSig<<shiftCount; 4322 *zExpPtr = 1 - shiftCount; 4323 4324 } 4325 4326 /*---------------------------------------------------------------------------- 4327 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4328 | double-precision floating-point value, returning the result. After being 4329 | shifted into the proper positions, the three fields are simply added 4330 | together to form the result. This means that any integer portion of `zSig' 4331 | will be added into the exponent. Since a properly normalized significand 4332 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4333 | than the desired result exponent whenever `zSig' is a complete, normalized 4334 | significand. 
4335 *----------------------------------------------------------------------------*/ 4336 4337 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4338 { 4339 4340 return make_float64( 4341 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4342 4343 } 4344 4345 /*---------------------------------------------------------------------------- 4346 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4347 | and significand `zSig', and returns the proper double-precision floating- 4348 | point value corresponding to the abstract input. Ordinarily, the abstract 4349 | value is simply rounded and packed into the double-precision format, with 4350 | the inexact exception raised if the abstract input cannot be represented 4351 | exactly. However, if the abstract value is too large, the overflow and 4352 | inexact exceptions are raised and an infinity or maximal finite value is 4353 | returned. If the abstract value is too small, the input value is rounded to 4354 | a subnormal number, and the underflow and inexact exceptions are raised if 4355 | the abstract input cannot be represented exactly as a subnormal double- 4356 | precision floating-point number. 4357 | The input significand `zSig' has its binary point between bits 62 4358 | and 61, which is 10 bits to the left of the usual location. This shifted 4359 | significand must be normalized or smaller. If `zSig' is not normalized, 4360 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4361 | and it must not require rounding. In the usual case that `zSig' is 4362 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4363 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4364 | Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The round/sticky field is the low 10 bits of zSig; choose the
       increment that implements the requested rounding on that field. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;     /* half of the 10-bit round field */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the bit that becomes the result LSB is clear. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned compare catches both zExp >= 0x7FD and negative zExp. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: round-to-odd and the modes that truncate toward the
               result's sign return the largest finite value, not infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -(!overflow_to_inf) is all-ones (maximal significand) when
               the result must stay finite, 0 when packing an infinity. */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess is detected before rounding when configured so, or
               when the rounded result would still be below the smallest
               normal (top bit of zSig + increment clear). */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: clear the LSB when the discarded bits were exactly half. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Shift the leading one into bit 62 (one below the top bit), matching
       the binary-point convention roundAndPackFloat64 expects. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4469 *----------------------------------------------------------------------------*/ 4470 4471 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4472 uint64_t *zSigPtr) 4473 { 4474 int8_t shiftCount; 4475 4476 shiftCount = clz64(aSig); 4477 *zSigPtr = aSig<<shiftCount; 4478 *zExpPtr = 1 - shiftCount; 4479 } 4480 4481 /*---------------------------------------------------------------------------- 4482 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4483 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4484 | and returns the proper extended double-precision floating-point value 4485 | corresponding to the abstract input. Ordinarily, the abstract value is 4486 | rounded and packed into the extended double-precision format, with the 4487 | inexact exception raised if the abstract input cannot be represented 4488 | exactly. However, if the abstract value is too large, the overflow and 4489 | inexact exceptions are raised and an infinity or maximal finite value is 4490 | returned. If the abstract value is too small, the input value is rounded to 4491 | a subnormal number, and the underflow and inexact exceptions are raised if 4492 | the abstract input cannot be represented exactly as a subnormal extended 4493 | double-precision floating-point number. 4494 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4495 | number of bits as single or double precision, respectively. Otherwise, the 4496 | result is rounded to the full precision of the extended double-precision 4497 | format. 4498 | The input significand must be normalized or smaller. If the input 4499 | significand is not normalized, `zExp' must be 0; in that case, the result 4500 | returned is a subnormal number, and it must not require rounding. The 4501 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4502 | Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision rounding: the mask selects the bits of zSig0 that
       fall below the target precision (double or single). */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold the low word into the sticky bit of the high word. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned compare catches both zExp >= 0x7FFE and zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess before rounding, or the rounded value does not
               carry out of the significand (zSig0 + increment no smaller
               means no wraparound into a normal number). */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* If rounding carried into the integer bit, the result became
               normal again. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 64-bit precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes (toward the result's sign) saturate to the
               largest finite value; otherwise return infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the rounding decision: zSig1 changed in the
               denormalizing shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even when the discarded bits were exactly half. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word first. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4737 *----------------------------------------------------------------------------*/ 4738 4739 static inline uint64_t extractFloat128Frac0( float128 a ) 4740 { 4741 4742 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4743 4744 } 4745 4746 /*---------------------------------------------------------------------------- 4747 | Returns the exponent bits of the quadruple-precision floating-point value 4748 | `a'. 4749 *----------------------------------------------------------------------------*/ 4750 4751 static inline int32_t extractFloat128Exp( float128 a ) 4752 { 4753 4754 return ( a.high>>48 ) & 0x7FFF; 4755 4756 } 4757 4758 /*---------------------------------------------------------------------------- 4759 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4760 *----------------------------------------------------------------------------*/ 4761 4762 static inline bool extractFloat128Sign(float128 a) 4763 { 4764 return a.high >> 63; 4765 } 4766 4767 /*---------------------------------------------------------------------------- 4768 | Normalizes the subnormal quadruple-precision floating-point value 4769 | represented by the denormalized significand formed by the concatenation of 4770 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4771 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4772 | significand are stored at the location pointed to by `zSig0Ptr', and the 4773 | least significant 64 bits of the normalized significand are stored at the 4774 | location pointed to by `zSig1Ptr'. 
4775 *----------------------------------------------------------------------------*/ 4776 4777 static void 4778 normalizeFloat128Subnormal( 4779 uint64_t aSig0, 4780 uint64_t aSig1, 4781 int32_t *zExpPtr, 4782 uint64_t *zSig0Ptr, 4783 uint64_t *zSig1Ptr 4784 ) 4785 { 4786 int8_t shiftCount; 4787 4788 if ( aSig0 == 0 ) { 4789 shiftCount = clz64(aSig1) - 15; 4790 if ( shiftCount < 0 ) { 4791 *zSig0Ptr = aSig1>>( - shiftCount ); 4792 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4793 } 4794 else { 4795 *zSig0Ptr = aSig1<<shiftCount; 4796 *zSig1Ptr = 0; 4797 } 4798 *zExpPtr = - shiftCount - 63; 4799 } 4800 else { 4801 shiftCount = clz64(aSig0) - 15; 4802 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4803 *zExpPtr = 1 - shiftCount; 4804 } 4805 4806 } 4807 4808 /*---------------------------------------------------------------------------- 4809 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4810 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4811 | floating-point value, returning the result. After being shifted into the 4812 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4813 | added together to form the most significant 32 bits of the result. This 4814 | means that any integer portion of `zSig0' will be added into the exponent. 4815 | Since a properly normalized significand will have an integer portion equal 4816 | to 1, the `zExp' input should be 1 less than the desired result exponent 4817 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4818 | significand. 
4819 *----------------------------------------------------------------------------*/ 4820 4821 static inline float128 4822 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4823 { 4824 float128 z; 4825 4826 z.low = zSig1; 4827 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4828 return z; 4829 } 4830 4831 /*---------------------------------------------------------------------------- 4832 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4833 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4834 | and `zSig2', and returns the proper quadruple-precision floating-point value 4835 | corresponding to the abstract input. Ordinarily, the abstract value is 4836 | simply rounded and packed into the quadruple-precision format, with the 4837 | inexact exception raised if the abstract input cannot be represented 4838 | exactly. However, if the abstract value is too large, the overflow and 4839 | inexact exceptions are raised and an infinity or maximal finite value is 4840 | returned. If the abstract value is too small, the input value is rounded to 4841 | a subnormal number, and the underflow and inexact exceptions are raised if 4842 | the abstract input cannot be represented exactly as a subnormal quadruple- 4843 | precision floating-point number. 4844 | The input significand must be normalized or smaller. If the input 4845 | significand is not normalized, `zExp' must be 0; in that case, the result 4846 | returned is a subnormal number, and it must not require rounding. In the 4847 | usual case that the input significand is normalized, `zExp' must be 1 less 4848 | than the ``true'' floating-point exponent. The handling of underflow and 4849 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the round/sticky bits below the 113-bit significand;
       decide whether they round the significand up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the result LSB would otherwise be even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned compare catches both zExp >= 0x7FFD and negative zExp. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes (and round-to-odd) saturate to the largest
               finite value; otherwise return infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess before rounding, or the value does not round up to
               the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the rounding decision: zSig1/zSig2 changed in the
               denormalizing shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even: clear the LSB when the discarded bits were exactly
           half (zSig2's top bit set, all lower bits clear). */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 4968 | to the abstract input. This routine is just like `roundAndPackFloat128' 4969 | except that the input significand has fewer bits and does not have to be 4970 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4971 | point exponent. 4972 *----------------------------------------------------------------------------*/ 4973 4974 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4975 uint64_t zSig0, uint64_t zSig1, 4976 float_status *status) 4977 { 4978 int8_t shiftCount; 4979 uint64_t zSig2; 4980 4981 if ( zSig0 == 0 ) { 4982 zSig0 = zSig1; 4983 zSig1 = 0; 4984 zExp -= 64; 4985 } 4986 shiftCount = clz64(zSig0) - 15; 4987 if ( 0 <= shiftCount ) { 4988 zSig2 = 0; 4989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4990 } 4991 else { 4992 shift128ExtraRightJamming( 4993 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4994 } 4995 zExp -= shiftCount; 4996 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4997 4998 } 4999 5000 5001 /*---------------------------------------------------------------------------- 5002 | Returns the result of converting the 32-bit two's complement integer `a' 5003 | to the extended double-precision floating-point format. The conversion 5004 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5005 | Arithmetic. 5006 *----------------------------------------------------------------------------*/ 5007 5008 floatx80 int32_to_floatx80(int32_t a, float_status *status) 5009 { 5010 bool zSign; 5011 uint32_t absA; 5012 int8_t shiftCount; 5013 uint64_t zSig; 5014 5015 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5016 zSign = ( a < 0 ); 5017 absA = zSign ? 
- a : a; 5018 shiftCount = clz32(absA) + 32; 5019 zSig = absA; 5020 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 5021 5022 } 5023 5024 /*---------------------------------------------------------------------------- 5025 | Returns the result of converting the 32-bit two's complement integer `a' to 5026 | the quadruple-precision floating-point format. The conversion is performed 5027 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5028 *----------------------------------------------------------------------------*/ 5029 5030 float128 int32_to_float128(int32_t a, float_status *status) 5031 { 5032 bool zSign; 5033 uint32_t absA; 5034 int8_t shiftCount; 5035 uint64_t zSig0; 5036 5037 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5038 zSign = ( a < 0 ); 5039 absA = zSign ? - a : a; 5040 shiftCount = clz32(absA) + 17; 5041 zSig0 = absA; 5042 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5043 5044 } 5045 5046 /*---------------------------------------------------------------------------- 5047 | Returns the result of converting the 64-bit two's complement integer `a' 5048 | to the extended double-precision floating-point format. The conversion 5049 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5050 | Arithmetic. 5051 *----------------------------------------------------------------------------*/ 5052 5053 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5054 { 5055 bool zSign; 5056 uint64_t absA; 5057 int8_t shiftCount; 5058 5059 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5060 zSign = ( a < 0 ); 5061 absA = zSign ? 
- a : a; 5062 shiftCount = clz64(absA); 5063 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5064 5065 } 5066 5067 /*---------------------------------------------------------------------------- 5068 | Returns the result of converting the 64-bit two's complement integer `a' to 5069 | the quadruple-precision floating-point format. The conversion is performed 5070 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5071 *----------------------------------------------------------------------------*/ 5072 5073 float128 int64_to_float128(int64_t a, float_status *status) 5074 { 5075 bool zSign; 5076 uint64_t absA; 5077 int8_t shiftCount; 5078 int32_t zExp; 5079 uint64_t zSig0, zSig1; 5080 5081 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5082 zSign = ( a < 0 ); 5083 absA = zSign ? - a : a; 5084 shiftCount = clz64(absA) + 49; 5085 zExp = 0x406E - shiftCount; 5086 if ( 64 <= shiftCount ) { 5087 zSig1 = 0; 5088 zSig0 = absA; 5089 shiftCount -= 64; 5090 } 5091 else { 5092 zSig1 = absA; 5093 zSig0 = 0; 5094 } 5095 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5096 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5097 5098 } 5099 5100 /*---------------------------------------------------------------------------- 5101 | Returns the result of converting the 64-bit unsigned integer `a' 5102 | to the quadruple-precision floating-point format. The conversion is performed 5103 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* 0x406E is the biased exponent matching a significand whose binary
       point sits above the 64-bit word; normalization fixes it up. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        /* NaN or infinity. */
        if (aSig) {
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        /* Zero or subnormal. */
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the implicit integer bit explicit (bit 23), then rebias the
       exponent and left-justify the significand in 64 bits. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5155 *----------------------------------------------------------------------------*/ 5156 5157 float128 float32_to_float128(float32 a, float_status *status) 5158 { 5159 bool aSign; 5160 int aExp; 5161 uint32_t aSig; 5162 5163 a = float32_squash_input_denormal(a, status); 5164 aSig = extractFloat32Frac( a ); 5165 aExp = extractFloat32Exp( a ); 5166 aSign = extractFloat32Sign( a ); 5167 if ( aExp == 0xFF ) { 5168 if (aSig) { 5169 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 5170 } 5171 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5172 } 5173 if ( aExp == 0 ) { 5174 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5175 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5176 --aExp; 5177 } 5178 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 5179 5180 } 5181 5182 /*---------------------------------------------------------------------------- 5183 | Returns the remainder of the single-precision floating-point value `a' 5184 | with respect to the corresponding value `b'. The operation is performed 5185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* a is NaN, or a is infinity (rem of infinity is invalid). */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Remainder by infinity leaves a unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder by zero is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 32/64-bit division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce iteratively, 62 quotient bits
           per step, using an estimated wide division. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* The estimate may be up to 2 too large; never exceed it. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past zero so aSig holds the first negative partial remainder
       and alternateASig the last non-negative one. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Pick the partial remainder closer to zero; break ties toward the
       even quotient (IEEE remainder rounds the quotient to nearest-even). */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients 1/n! for n = 1..15, as float64 bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-inf == 0, 2^+inf == +inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) == 1. */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* 2^a = e^(a*ln2); evaluate the e^x Taylor series in double precision. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* xn accumulates x^(i+1); r accumulates 1 + sum xn/(i+1)!. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) == -inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* log2(+inf) == +inf. */
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fraction
     * bits are generated one at a time by repeated squaring. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert the payload, then quiet it. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* floatx80 keeps an explicit integer bit, hence the OR and shift. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5460 *----------------------------------------------------------------------------*/ 5461 5462 float128 float64_to_float128(float64 a, float_status *status) 5463 { 5464 bool aSign; 5465 int aExp; 5466 uint64_t aSig, zSig0, zSig1; 5467 5468 a = float64_squash_input_denormal(a, status); 5469 aSig = extractFloat64Frac( a ); 5470 aExp = extractFloat64Exp( a ); 5471 aSign = extractFloat64Sign( a ); 5472 if ( aExp == 0x7FF ) { 5473 if (aSig) { 5474 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5475 } 5476 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5477 } 5478 if ( aExp == 0 ) { 5479 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5480 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5481 --aExp; 5482 } 5483 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5484 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5485 5486 } 5487 5488 5489 /*---------------------------------------------------------------------------- 5490 | Returns the remainder of the double-precision floating-point value `a' 5491 | with respect to the corresponding value `b'. The operation is performed 5492 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5493 *----------------------------------------------------------------------------*/ 5494 5495 float64 float64_rem(float64 a, float64 b, float_status *status) 5496 { 5497 bool aSign, zSign; 5498 int aExp, bExp, expDiff; 5499 uint64_t aSig, bSig; 5500 uint64_t q, alternateASig; 5501 int64_t sigMean; 5502 5503 a = float64_squash_input_denormal(a, status); 5504 b = float64_squash_input_denormal(b, status); 5505 aSig = extractFloat64Frac( a ); 5506 aExp = extractFloat64Exp( a ); 5507 aSign = extractFloat64Sign( a ); 5508 bSig = extractFloat64Frac( b ); 5509 bExp = extractFloat64Exp( b ); 5510 if ( aExp == 0x7FF ) { 5511 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5512 return propagateFloat64NaN(a, b, status); 5513 } 5514 float_raise(float_flag_invalid, status); 5515 return float64_default_nan(status); 5516 } 5517 if ( bExp == 0x7FF ) { 5518 if (bSig) { 5519 return propagateFloat64NaN(a, b, status); 5520 } 5521 return a; 5522 } 5523 if ( bExp == 0 ) { 5524 if ( bSig == 0 ) { 5525 float_raise(float_flag_invalid, status); 5526 return float64_default_nan(status); 5527 } 5528 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5529 } 5530 if ( aExp == 0 ) { 5531 if ( aSig == 0 ) return a; 5532 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5533 } 5534 expDiff = aExp - bExp; 5535 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5536 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5537 if ( expDiff < 0 ) { 5538 if ( expDiff < -1 ) return a; 5539 aSig >>= 1; 5540 } 5541 q = ( bSig <= aSig ); 5542 if ( q ) aSig -= bSig; 5543 expDiff -= 64; 5544 while ( 0 < expDiff ) { 5545 q = estimateDiv128To64( aSig, 0, bSig ); 5546 q = ( 2 < q ) ? q - 2 : 0; 5547 aSig = - ( ( bSig>>2 ) * q ); 5548 expDiff -= 62; 5549 } 5550 expDiff += 64; 5551 if ( 0 < expDiff ) { 5552 q = estimateDiv128To64( aSig, 0, bSig ); 5553 q = ( 2 < q ) ? 
q - 2 : 0; 5554 q >>= 64 - expDiff; 5555 bSig >>= 2; 5556 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5557 } 5558 else { 5559 aSig >>= 2; 5560 bSig >>= 2; 5561 } 5562 do { 5563 alternateASig = aSig; 5564 ++q; 5565 aSig -= bSig; 5566 } while ( 0 <= (int64_t) aSig ); 5567 sigMean = aSig + alternateASig; 5568 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5569 aSig = alternateASig; 5570 } 5571 zSign = ( (int64_t) aSig < 0 ); 5572 if ( zSign ) aSig = - aSig; 5573 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5574 5575 } 5576 5577 /*---------------------------------------------------------------------------- 5578 | Returns the binary log of the double-precision floating-point value `a'. 5579 | The operation is performed according to the IEC/IEEE Standard for Binary 5580 | Floating-Point Arithmetic. 5581 *----------------------------------------------------------------------------*/ 5582 float64 float64_log2(float64 a, float_status *status) 5583 { 5584 bool aSign, zSign; 5585 int aExp; 5586 uint64_t aSig, aSig0, aSig1, zSig, i; 5587 a = float64_squash_input_denormal(a, status); 5588 5589 aSig = extractFloat64Frac( a ); 5590 aExp = extractFloat64Exp( a ); 5591 aSign = extractFloat64Sign( a ); 5592 5593 if ( aExp == 0 ) { 5594 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5595 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5596 } 5597 if ( aSign ) { 5598 float_raise(float_flag_invalid, status); 5599 return float64_default_nan(status); 5600 } 5601 if ( aExp == 0x7FF ) { 5602 if (aSig) { 5603 return propagateFloat64NaN(a, float64_zero, status); 5604 } 5605 return a; 5606 } 5607 5608 aExp -= 0x3FF; 5609 aSig |= UINT64_C(0x0010000000000000); 5610 zSign = aExp < 0; 5611 zSig = (uint64_t)aExp << 52; 5612 for (i = 1LL << 51; i > 0; i >>= 1) { 5613 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5614 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5615 if ( aSig & UINT64_C(0x0020000000000000) ) { 5616 aSig >>= 1; 5617 zSig |= i; 5618 } 
5619 } 5620 5621 if ( zSign ) 5622 zSig = -zSig; 5623 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5624 } 5625 5626 /*---------------------------------------------------------------------------- 5627 | Returns the result of converting the extended double-precision floating- 5628 | point value `a' to the 32-bit two's complement integer format. The 5629 | conversion is performed according to the IEC/IEEE Standard for Binary 5630 | Floating-Point Arithmetic---which means in particular that the conversion 5631 | is rounded according to the current rounding mode. If `a' is a NaN, the 5632 | largest positive integer is returned. Otherwise, if the conversion 5633 | overflows, the largest integer with the same sign as `a' is returned. 5634 *----------------------------------------------------------------------------*/ 5635 5636 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5637 { 5638 bool aSign; 5639 int32_t aExp, shiftCount; 5640 uint64_t aSig; 5641 5642 if (floatx80_invalid_encoding(a)) { 5643 float_raise(float_flag_invalid, status); 5644 return 1 << 31; 5645 } 5646 aSig = extractFloatx80Frac( a ); 5647 aExp = extractFloatx80Exp( a ); 5648 aSign = extractFloatx80Sign( a ); 5649 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5650 shiftCount = 0x4037 - aExp; 5651 if ( shiftCount <= 0 ) shiftCount = 1; 5652 shift64RightJamming( aSig, shiftCount, &aSig ); 5653 return roundAndPackInt32(aSign, aSig, status); 5654 5655 } 5656 5657 /*---------------------------------------------------------------------------- 5658 | Returns the result of converting the extended double-precision floating- 5659 | point value `a' to the 32-bit two's complement integer format. The 5660 | conversion is performed according to the IEC/IEEE Standard for Binary 5661 | Floating-Point Arithmetic, except that the conversion is always rounded 5662 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5663 | Otherwise, if the conversion overflows, the largest integer with the same 5664 | sign as `a' is returned. 5665 *----------------------------------------------------------------------------*/ 5666 5667 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5668 { 5669 bool aSign; 5670 int32_t aExp, shiftCount; 5671 uint64_t aSig, savedASig; 5672 int32_t z; 5673 5674 if (floatx80_invalid_encoding(a)) { 5675 float_raise(float_flag_invalid, status); 5676 return 1 << 31; 5677 } 5678 aSig = extractFloatx80Frac( a ); 5679 aExp = extractFloatx80Exp( a ); 5680 aSign = extractFloatx80Sign( a ); 5681 if ( 0x401E < aExp ) { 5682 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5683 goto invalid; 5684 } 5685 else if ( aExp < 0x3FFF ) { 5686 if (aExp || aSig) { 5687 float_raise(float_flag_inexact, status); 5688 } 5689 return 0; 5690 } 5691 shiftCount = 0x403E - aExp; 5692 savedASig = aSig; 5693 aSig >>= shiftCount; 5694 z = aSig; 5695 if ( aSign ) z = - z; 5696 if ( ( z < 0 ) ^ aSign ) { 5697 invalid: 5698 float_raise(float_flag_invalid, status); 5699 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5700 } 5701 if ( ( aSig<<shiftCount ) != savedASig ) { 5702 float_raise(float_flag_inexact, status); 5703 } 5704 return z; 5705 5706 } 5707 5708 /*---------------------------------------------------------------------------- 5709 | Returns the result of converting the extended double-precision floating- 5710 | point value `a' to the 64-bit two's complement integer format. The 5711 | conversion is performed according to the IEC/IEEE Standard for Binary 5712 | Floating-Point Arithmetic---which means in particular that the conversion 5713 | is rounded according to the current rounding mode. If `a' is a NaN, 5714 | the largest positive integer is returned. Otherwise, if the conversion 5715 | overflows, the largest integer with the same sign as `a' is returned. 
5716 *----------------------------------------------------------------------------*/ 5717 5718 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5719 { 5720 bool aSign; 5721 int32_t aExp, shiftCount; 5722 uint64_t aSig, aSigExtra; 5723 5724 if (floatx80_invalid_encoding(a)) { 5725 float_raise(float_flag_invalid, status); 5726 return 1ULL << 63; 5727 } 5728 aSig = extractFloatx80Frac( a ); 5729 aExp = extractFloatx80Exp( a ); 5730 aSign = extractFloatx80Sign( a ); 5731 shiftCount = 0x403E - aExp; 5732 if ( shiftCount <= 0 ) { 5733 if ( shiftCount ) { 5734 float_raise(float_flag_invalid, status); 5735 if (!aSign || floatx80_is_any_nan(a)) { 5736 return INT64_MAX; 5737 } 5738 return INT64_MIN; 5739 } 5740 aSigExtra = 0; 5741 } 5742 else { 5743 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5744 } 5745 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5746 5747 } 5748 5749 /*---------------------------------------------------------------------------- 5750 | Returns the result of converting the extended double-precision floating- 5751 | point value `a' to the 64-bit two's complement integer format. The 5752 | conversion is performed according to the IEC/IEEE Standard for Binary 5753 | Floating-Point Arithmetic, except that the conversion is always rounded 5754 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5755 | Otherwise, if the conversion overflows, the largest integer with the same 5756 | sign as `a' is returned. 
5757 *----------------------------------------------------------------------------*/ 5758 5759 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5760 { 5761 bool aSign; 5762 int32_t aExp, shiftCount; 5763 uint64_t aSig; 5764 int64_t z; 5765 5766 if (floatx80_invalid_encoding(a)) { 5767 float_raise(float_flag_invalid, status); 5768 return 1ULL << 63; 5769 } 5770 aSig = extractFloatx80Frac( a ); 5771 aExp = extractFloatx80Exp( a ); 5772 aSign = extractFloatx80Sign( a ); 5773 shiftCount = aExp - 0x403E; 5774 if ( 0 <= shiftCount ) { 5775 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5776 if ( ( a.high != 0xC03E ) || aSig ) { 5777 float_raise(float_flag_invalid, status); 5778 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5779 return INT64_MAX; 5780 } 5781 } 5782 return INT64_MIN; 5783 } 5784 else if ( aExp < 0x3FFF ) { 5785 if (aExp | aSig) { 5786 float_raise(float_flag_inexact, status); 5787 } 5788 return 0; 5789 } 5790 z = aSig>>( - shiftCount ); 5791 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5792 float_raise(float_flag_inexact, status); 5793 } 5794 if ( aSign ) z = - z; 5795 return z; 5796 5797 } 5798 5799 /*---------------------------------------------------------------------------- 5800 | Returns the result of converting the extended double-precision floating- 5801 | point value `a' to the single-precision floating-point format. The 5802 | conversion is performed according to the IEC/IEEE Standard for Binary 5803 | Floating-Point Arithmetic. 
5804 *----------------------------------------------------------------------------*/ 5805 5806 float32 floatx80_to_float32(floatx80 a, float_status *status) 5807 { 5808 bool aSign; 5809 int32_t aExp; 5810 uint64_t aSig; 5811 5812 if (floatx80_invalid_encoding(a)) { 5813 float_raise(float_flag_invalid, status); 5814 return float32_default_nan(status); 5815 } 5816 aSig = extractFloatx80Frac( a ); 5817 aExp = extractFloatx80Exp( a ); 5818 aSign = extractFloatx80Sign( a ); 5819 if ( aExp == 0x7FFF ) { 5820 if ( (uint64_t) ( aSig<<1 ) ) { 5821 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5822 status); 5823 return float32_silence_nan(res, status); 5824 } 5825 return packFloat32( aSign, 0xFF, 0 ); 5826 } 5827 shift64RightJamming( aSig, 33, &aSig ); 5828 if ( aExp || aSig ) aExp -= 0x3F81; 5829 return roundAndPackFloat32(aSign, aExp, aSig, status); 5830 5831 } 5832 5833 /*---------------------------------------------------------------------------- 5834 | Returns the result of converting the extended double-precision floating- 5835 | point value `a' to the double-precision floating-point format. The 5836 | conversion is performed according to the IEC/IEEE Standard for Binary 5837 | Floating-Point Arithmetic. 
5838 *----------------------------------------------------------------------------*/ 5839 5840 float64 floatx80_to_float64(floatx80 a, float_status *status) 5841 { 5842 bool aSign; 5843 int32_t aExp; 5844 uint64_t aSig, zSig; 5845 5846 if (floatx80_invalid_encoding(a)) { 5847 float_raise(float_flag_invalid, status); 5848 return float64_default_nan(status); 5849 } 5850 aSig = extractFloatx80Frac( a ); 5851 aExp = extractFloatx80Exp( a ); 5852 aSign = extractFloatx80Sign( a ); 5853 if ( aExp == 0x7FFF ) { 5854 if ( (uint64_t) ( aSig<<1 ) ) { 5855 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5856 status); 5857 return float64_silence_nan(res, status); 5858 } 5859 return packFloat64( aSign, 0x7FF, 0 ); 5860 } 5861 shift64RightJamming( aSig, 1, &zSig ); 5862 if ( aExp || aSig ) aExp -= 0x3C01; 5863 return roundAndPackFloat64(aSign, aExp, zSig, status); 5864 5865 } 5866 5867 /*---------------------------------------------------------------------------- 5868 | Returns the result of converting the extended double-precision floating- 5869 | point value `a' to the quadruple-precision floating-point format. The 5870 | conversion is performed according to the IEC/IEEE Standard for Binary 5871 | Floating-Point Arithmetic. 
5872 *----------------------------------------------------------------------------*/ 5873 5874 float128 floatx80_to_float128(floatx80 a, float_status *status) 5875 { 5876 bool aSign; 5877 int aExp; 5878 uint64_t aSig, zSig0, zSig1; 5879 5880 if (floatx80_invalid_encoding(a)) { 5881 float_raise(float_flag_invalid, status); 5882 return float128_default_nan(status); 5883 } 5884 aSig = extractFloatx80Frac( a ); 5885 aExp = extractFloatx80Exp( a ); 5886 aSign = extractFloatx80Sign( a ); 5887 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5888 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5889 status); 5890 return float128_silence_nan(res, status); 5891 } 5892 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5893 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5894 5895 } 5896 5897 /*---------------------------------------------------------------------------- 5898 | Rounds the extended double-precision floating-point value `a' 5899 | to the precision provided by floatx80_rounding_precision and returns the 5900 | result as an extended double-precision floating-point value. 5901 | The operation is performed according to the IEC/IEEE Standard for Binary 5902 | Floating-Point Arithmetic. 5903 *----------------------------------------------------------------------------*/ 5904 5905 floatx80 floatx80_round(floatx80 a, float_status *status) 5906 { 5907 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5908 extractFloatx80Sign(a), 5909 extractFloatx80Exp(a), 5910 extractFloatx80Frac(a), 0, status); 5911 } 5912 5913 /*---------------------------------------------------------------------------- 5914 | Rounds the extended double-precision floating-point value `a' to an integer, 5915 | and returns the result as an extended quadruple-precision floating-point 5916 | value. The operation is performed according to the IEC/IEEE Standard for 5917 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Magnitude >= 2^63 (or NaN/inf): already integral. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: result is 0 or +/-1 depending on rounding mode. */
        if (    ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Exactly 0.5 rounds to the even neighbour 0; above 0.5 to 1. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: round away the fraction bits below the units place. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* Ties (round bits exactly half) go to the even neighbour. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* A carry out of the significand bumps the exponent and restores the
     * explicit integer bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* |a| has the larger exponent: align b's significand to a. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* |b| has the larger exponent: align a's significand to b. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum always carries (or is subnormal). */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit still set means no carry out of the 64-bit significand. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out: shift right one place and restore the integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* inf - inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only in round-down mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    /* |a| > |b|: compute a - b with the caller-supplied sign. */
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*---------------------------------------------------------------------------- 6166 | Returns the result of adding the extended double-precision floating-point 6167 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6168 | Standard for Binary Floating-Point Arithmetic. 6169 *----------------------------------------------------------------------------*/ 6170 6171 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6172 { 6173 bool aSign, bSign; 6174 6175 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6176 float_raise(float_flag_invalid, status); 6177 return floatx80_default_nan(status); 6178 } 6179 aSign = extractFloatx80Sign( a ); 6180 bSign = extractFloatx80Sign( b ); 6181 if ( aSign == bSign ) { 6182 return addFloatx80Sigs(a, b, aSign, status); 6183 } 6184 else { 6185 return subFloatx80Sigs(a, b, aSign, status); 6186 } 6187 6188 } 6189 6190 /*---------------------------------------------------------------------------- 6191 | Returns the result of subtracting the extended double-precision floating- 6192 | point values `a' and `b'. The operation is performed according to the 6193 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Like signs subtract magnitudes; unlike signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* inf * 0 is invalid. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * inf is invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Full 64x64 -> 128-bit significand product. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Normalize: if the top bit of the product is clear, shift left one. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity yields a signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; any other finite / 0 signals
               divide-by-zero and returns infinity. */
            if ( ( aExp | aSig ) == 0 ) {
            invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Keep the dividend significand below the divisor's so the estimated
       quotient digit fits in 64 bits; compensate in the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    /* Correct a possible over-estimate of the high quotient digit. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    /* Refine the low quotient word only when it is close enough to a
       rounding boundary to matter. */
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* aExpOrig remembers the raw exponent so pseudo-denormals (exponent 0
       with the integer bit set) can be recognized after normalization. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* remainder of infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
        invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b| (by at least a factor of 2 when expDiff < -1):
           result is `a' itself for mod, or possibly a - b for rem. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Long division, 62 quotient bits at a time, accumulating the low
       bits of the integer quotient in *quotient. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* The estimate may be low; bump q until the remainder is < b. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: pick the smaller of r and r - b (round the
           quotient to nearest-even rather than truncating). */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
{
    uint64_t quotient;
    return floatx80_modrem(a, b, false, &quotient, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b', with the quotient truncated
| toward zero.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
{
    uint64_t quotient;
    return floatx80_modrem(a, b, true, &quotient, status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+inf) = +inf; sqrt(-inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number (other than -0) is invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
    invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Newton-style refinement of an initial 32-bit square-root estimate. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Refine the low result word only near a rounding boundary. */
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.
The conversion 6592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6593 | Arithmetic---which means in particular that the conversion is rounded 6594 | according to the current rounding mode. If `a' is a NaN, the largest 6595 | positive integer is returned. Otherwise, if the conversion overflows, the 6596 | largest integer with the same sign as `a' is returned. 6597 *----------------------------------------------------------------------------*/ 6598 6599 int32_t float128_to_int32(float128 a, float_status *status) 6600 { 6601 bool aSign; 6602 int32_t aExp, shiftCount; 6603 uint64_t aSig0, aSig1; 6604 6605 aSig1 = extractFloat128Frac1( a ); 6606 aSig0 = extractFloat128Frac0( a ); 6607 aExp = extractFloat128Exp( a ); 6608 aSign = extractFloat128Sign( a ); 6609 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6610 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6611 aSig0 |= ( aSig1 != 0 ); 6612 shiftCount = 0x4028 - aExp; 6613 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6614 return roundAndPackInt32(aSign, aSig0, status); 6615 6616 } 6617 6618 /*---------------------------------------------------------------------------- 6619 | Returns the result of converting the quadruple-precision floating-point 6620 | value `a' to the 32-bit two's complement integer format. The conversion 6621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6622 | Arithmetic, except that the conversion is always rounded toward zero. If 6623 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6624 | conversion overflows, the largest integer with the same sign as `a' is 6625 | returned. 
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude too large for int32; NaNs convert as positive. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to zero; inexact unless a is exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* A sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
    invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted off mean the truncation was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
6676 *----------------------------------------------------------------------------*/ 6677 6678 int64_t float128_to_int64(float128 a, float_status *status) 6679 { 6680 bool aSign; 6681 int32_t aExp, shiftCount; 6682 uint64_t aSig0, aSig1; 6683 6684 aSig1 = extractFloat128Frac1( a ); 6685 aSig0 = extractFloat128Frac0( a ); 6686 aExp = extractFloat128Exp( a ); 6687 aSign = extractFloat128Sign( a ); 6688 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6689 shiftCount = 0x402F - aExp; 6690 if ( shiftCount <= 0 ) { 6691 if ( 0x403E < aExp ) { 6692 float_raise(float_flag_invalid, status); 6693 if ( ! aSign 6694 || ( ( aExp == 0x7FFF ) 6695 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6696 ) 6697 ) { 6698 return INT64_MAX; 6699 } 6700 return INT64_MIN; 6701 } 6702 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6703 } 6704 else { 6705 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6706 } 6707 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6708 6709 } 6710 6711 /*---------------------------------------------------------------------------- 6712 | Returns the result of converting the quadruple-precision floating-point 6713 | value `a' to the 64-bit two's complement integer format. The conversion 6714 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6715 | Arithmetic, except that the conversion is always rounded toward zero. 6716 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6717 | the conversion overflows, the largest integer with the same sign as `a' is 6718 | returned. 
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Exactly -2^63 (a.high encodes sign/exponent 0xC03E with an
               all-zero high fraction) is representable; only its low
               fraction bits, if any, make the result inexact. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                /* Positive overflow and NaNs saturate to INT64_MAX. */
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1 truncates to zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is 6777 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6778 | Arithmetic---which means in particular that the conversion is rounded 6779 | according to the current rounding mode. If `a' is a NaN, the largest 6780 | positive integer is returned. If the conversion overflows, the 6781 | largest unsigned integer is returned. If 'a' is negative, the value is 6782 | rounded and zero is returned; negative values that do not round to zero 6783 | will raise the inexact exception. 6784 *----------------------------------------------------------------------------*/ 6785 6786 uint64_t float128_to_uint64(float128 a, float_status *status) 6787 { 6788 bool aSign; 6789 int aExp; 6790 int shiftCount; 6791 uint64_t aSig0, aSig1; 6792 6793 aSig0 = extractFloat128Frac0(a); 6794 aSig1 = extractFloat128Frac1(a); 6795 aExp = extractFloat128Exp(a); 6796 aSign = extractFloat128Sign(a); 6797 if (aSign && (aExp > 0x3FFE)) { 6798 float_raise(float_flag_invalid, status); 6799 if (float128_is_any_nan(a)) { 6800 return UINT64_MAX; 6801 } else { 6802 return 0; 6803 } 6804 } 6805 if (aExp) { 6806 aSig0 |= UINT64_C(0x0001000000000000); 6807 } 6808 shiftCount = 0x402F - aExp; 6809 if (shiftCount <= 0) { 6810 if (0x403E < aExp) { 6811 float_raise(float_flag_invalid, status); 6812 return UINT64_MAX; 6813 } 6814 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6815 } else { 6816 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6817 } 6818 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6819 } 6820 6821 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6822 { 6823 uint64_t v; 6824 signed char current_rounding_mode = status->float_rounding_mode; 6825 6826 set_float_rounding_mode(float_round_to_zero, status); 6827 v = float128_to_uint64(a, status); 6828 set_float_rounding_mode(current_rounding_mode, status); 6829 6830 return v; 6831 } 6832 6833 
/*---------------------------------------------------------------------------- 6834 | Returns the result of converting the quadruple-precision floating-point 6835 | value `a' to the 32-bit unsigned integer format. The conversion 6836 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6837 | Arithmetic except that the conversion is always rounded toward zero. 6838 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6839 | if the conversion overflows, the largest unsigned integer is returned. 6840 | If 'a' is negative, the value is rounded and zero is returned; negative 6841 | values that do not round to zero will raise the inexact exception. 6842 *----------------------------------------------------------------------------*/ 6843 6844 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6845 { 6846 uint64_t v; 6847 uint32_t res; 6848 int old_exc_flags = get_float_exception_flags(status); 6849 6850 v = float128_to_uint64_round_to_zero(a, status); 6851 if (v > 0xffffffff) { 6852 res = 0xffffffff; 6853 } else { 6854 return v; 6855 } 6856 set_float_exception_flags(old_exc_flags, status); 6857 float_raise(float_flag_invalid, status); 6858 return res; 6859 } 6860 6861 /*---------------------------------------------------------------------------- 6862 | Returns the result of converting the quadruple-precision floating-point value 6863 | `a' to the 32-bit unsigned integer format. The conversion is 6864 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6865 | Arithmetic---which means in particular that the conversion is rounded 6866 | according to the current rounding mode. If `a' is a NaN, the largest 6867 | positive integer is returned. If the conversion overflows, the 6868 | largest unsigned integer is returned. If 'a' is negative, the value is 6869 | rounded and zero is returned; negative values that do not round to zero 6870 | will raise the inexact exception. 
*----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    /* Remember the flags so an out-of-range result can replace whatever
       the 64-bit conversion raised with a single invalid exception. */
    int old_exc_flags = get_float_exception_flags(status);

    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the single-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low fraction word into a sticky bit, then narrow the
       significand to single-precision width with jamming. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Set the explicit leading bit and rebias the exponent. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the significand for double precision and fold the remaining
       low bits into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        /* Set the explicit leading bit and rebias the exponent. */
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* The common-NaN path may produce a signaling NaN; quiet it. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* The fraction straddles or lies above the binary point. */
        if ( 0x406F <= aExp ) {
            /* Already integral (or NaN/infinity). */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the integer lsb is bit 0 of z.high. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result is 0 or +/-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                    aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                    aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The binary point lies within the high fraction word. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* remainder of infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
        invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Long division, 61 quotient bits per iteration. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, keeping the previous
       (non-negative) remainder as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = r + (r - b); its sign selects the candidate nearer zero,
       with ties broken toward an even quotient.  The cast reuses sigMean0's
       storage as the unsigned output of add128. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( !
aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number: -0 passes through, anything else
         * is an invalid operation producing the default NaN. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* sqrt(+0) == +0; subnormals are normalized before the estimate. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is roughly half the input's (unbiased) exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial 32-bit root estimate, then one 64-bit refinement step
     * (helpers defined elsewhere in this file).  The odd/even exponent
     * adjustment is folded into the shift count below. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high 64 bits of the root by checking the remainder
     * aSig - zSig0^2; decrement until the remainder is non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Estimate the low 64 bits of the root from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        /* Near a rounding boundary: compute the exact 192-bit remainder
         * and correct zSig1 downward until it is non-negative, then fold
         * any leftover remainder into the sticky bit. */
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    /* The result of a sqrt is always non-negative: sign bit 0. */
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b'.  If `is_quiet'
| is true, only signaling NaNs raise the invalid exception; otherwise any
| NaN operand does.  Returns a FloatRelation (less/equal/greater/unordered).
*----------------------------------------------------------------------------*/

static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Invalid (non-canonical) encodings compare unordered and raise
     * the invalid exception unconditionally. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Any NaN operand makes the comparison unordered; a quiet compare
     * raises invalid only for signaling NaNs. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Opposite signs: equal only if both are zeros (+0 == -0),
         * otherwise the non-negative operand is the greater one. */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        /* Same sign: compare magnitudes; a negative sign flips the
         * direction of the result. */
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling compare: any NaN operand raises the invalid exception. */
FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

/* Quiet compare: only signaling NaNs raise the invalid exception. */
FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

/*----------------------------------------------------------------------------
| Compares the quadruple-precision values `a' and `b'.  If `is_quiet' is
| true, only signaling NaNs raise the invalid exception; otherwise any NaN
| operand does.  Returns a FloatRelation (less/equal/greater/unordered).
*----------------------------------------------------------------------------*/

static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Any NaN operand makes the comparison unordered; a quiet compare
     * raises invalid only for signaling NaNs. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) |
extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: equal only if both are zeros (+0 == -0),
         * otherwise the non-negative operand is the greater one. */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        /* Same sign: compare magnitudes; a negative sign flips the
         * direction of the result. */
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling compare: any NaN operand raises the invalid exception. */
FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

/* Quiet compare: only signaling NaNs raise the invalid exception. */
FloatRelation float128_compare_quiet(float128 a, float128 b,
                                     float_status *status)
{
    return float128_compare_internal(a, b, 1, status);
}

/*----------------------------------------------------------------------------
| Returns `a' * 2^n for the extended double-precision value `a', with the
| usual NaN propagation.  `n' is clamped so the exponent arithmetic cannot
| overflow; the pack routine handles overflow/underflow of the result.
*----------------------------------------------------------------------------*/

floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* NaN propagates; infinity scales to itself. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        /* Subnormal: bump the exponent to its effective value;
         * normalizeRoundAndPackFloatx80 renormalizes the result. */
        aExp++;
    }

    /* Clamp |n| to 0x10000: already far beyond the floatx80 exponent
     * range, so clamping cannot change the (saturated) result but keeps
     * aExp + n well away from int32_t overflow. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

/*----------------------------------------------------------------------------
| Returns `a' * 2^n for the quadruple-precision value `a', with the usual
| NaN propagation.  `n' is clamped so the exponent arithmetic cannot
| overflow; the pack routine handles overflow/underflow of the result.
*----------------------------------------------------------------------------*/

float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; infinity scales to itself. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        /* Make the integer bit explicit for the normalize-and-pack step. */
        aSig0 |= UINT64_C(0x0001000000000000);
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;
    } else {
        aExp++;
    }

    /* Clamp |n| to 0x10000 (far beyond the float128 exponent range) to
     * keep aExp + n well away from int32_t overflow. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* NOTE(review): the extra -1 presumably compensates for the explicit
     * integer bit set above, matching normalizeRoundAndPackFloat128's
     * expected significand position — confirm against that helper. */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}

/*
 * Runs before main() (GCC/Clang constructor attribute): probes whether the
 * host libm fma() is usable for the hardfloat fast path.
 */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    /* Wrong answer on this known-tricky input: fall back to softfloat FMA. */
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}