1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 float_raise(float_flag_input_denormal, s); \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 static inline float32 343 float32_gen2(float32 xa, float32 xb, 
float_status *s, 344 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 345 f32_check_fn pre, f32_check_fn post) 346 { 347 union_float32 ua, ub, ur; 348 349 ua.s = xa; 350 ub.s = xb; 351 352 if (unlikely(!can_use_fpu(s))) { 353 goto soft; 354 } 355 356 float32_input_flush2(&ua.s, &ub.s, s); 357 if (unlikely(!pre(ua, ub))) { 358 goto soft; 359 } 360 361 ur.h = hard(ua.h, ub.h); 362 if (unlikely(f32_is_inf(ur))) { 363 float_raise(float_flag_overflow, s); 364 } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) { 365 goto soft; 366 } 367 return ur.s; 368 369 soft: 370 return soft(ua.s, ub.s, s); 371 } 372 373 static inline float64 374 float64_gen2(float64 xa, float64 xb, float_status *s, 375 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 376 f64_check_fn pre, f64_check_fn post) 377 { 378 union_float64 ua, ub, ur; 379 380 ua.s = xa; 381 ub.s = xb; 382 383 if (unlikely(!can_use_fpu(s))) { 384 goto soft; 385 } 386 387 float64_input_flush2(&ua.s, &ub.s, s); 388 if (unlikely(!pre(ua, ub))) { 389 goto soft; 390 } 391 392 ur.h = hard(ua.h, ub.h); 393 if (unlikely(f64_is_inf(ur))) { 394 float_raise(float_flag_overflow, s); 395 } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) { 396 goto soft; 397 } 398 return ur.s; 399 400 soft: 401 return soft(ua.s, ub.s, s); 402 } 403 404 /*---------------------------------------------------------------------------- 405 | Returns the fraction bits of the single-precision floating-point value `a'. 406 *----------------------------------------------------------------------------*/ 407 408 static inline uint32_t extractFloat32Frac(float32 a) 409 { 410 return float32_val(a) & 0x007FFFFF; 411 } 412 413 /*---------------------------------------------------------------------------- 414 | Returns the exponent bits of the single-precision floating-point value `a'. 
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
461 */ 462 463 typedef enum __attribute__ ((__packed__)) { 464 float_class_unclassified, 465 float_class_zero, 466 float_class_normal, 467 float_class_inf, 468 float_class_qnan, /* all NaNs from here */ 469 float_class_snan, 470 } FloatClass; 471 472 #define float_cmask(bit) (1u << (bit)) 473 474 enum { 475 float_cmask_zero = float_cmask(float_class_zero), 476 float_cmask_normal = float_cmask(float_class_normal), 477 float_cmask_inf = float_cmask(float_class_inf), 478 float_cmask_qnan = float_cmask(float_class_qnan), 479 float_cmask_snan = float_cmask(float_class_snan), 480 481 float_cmask_infzero = float_cmask_zero | float_cmask_inf, 482 float_cmask_anynan = float_cmask_qnan | float_cmask_snan, 483 }; 484 485 486 /* Simple helpers for checking if, or what kind of, NaN we have */ 487 static inline __attribute__((unused)) bool is_nan(FloatClass c) 488 { 489 return unlikely(c >= float_class_qnan); 490 } 491 492 static inline __attribute__((unused)) bool is_snan(FloatClass c) 493 { 494 return c == float_class_snan; 495 } 496 497 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 498 { 499 return c == float_class_qnan; 500 } 501 502 /* 503 * Structure holding all of the decomposed parts of a float. 504 * The exponent is unbiased and the fraction is normalized. 505 * 506 * The fraction words are stored in big-endian word ordering, 507 * so that truncation from a larger format to a smaller format 508 * can be done simply by ignoring subsequent elements. 509 */ 510 511 typedef struct { 512 FloatClass cls; 513 bool sign; 514 int32_t exp; 515 union { 516 /* Routines that know the structure may reference the singular name. */ 517 uint64_t frac; 518 /* 519 * Routines expanded with multiple structures reference "hi" and "lo" 520 * depending on the operation. In FloatParts64, "hi" and "lo" are 521 * both the same word and aliased here. 
522 */ 523 uint64_t frac_hi; 524 uint64_t frac_lo; 525 }; 526 } FloatParts64; 527 528 typedef struct { 529 FloatClass cls; 530 bool sign; 531 int32_t exp; 532 uint64_t frac_hi; 533 uint64_t frac_lo; 534 } FloatParts128; 535 536 typedef struct { 537 FloatClass cls; 538 bool sign; 539 int32_t exp; 540 uint64_t frac_hi; 541 uint64_t frac_hm; /* high-middle */ 542 uint64_t frac_lm; /* low-middle */ 543 uint64_t frac_lo; 544 } FloatParts256; 545 546 /* These apply to the most significant word of each FloatPartsN. */ 547 #define DECOMPOSED_BINARY_POINT 63 548 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 549 550 /* Structure holding all of the relevant parameters for a format. 551 * exp_size: the size of the exponent field 552 * exp_bias: the offset applied to the exponent field 553 * exp_max: the maximum normalised exponent 554 * frac_size: the size of the fraction field 555 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 556 * The following are computed based the size of fraction 557 * frac_lsb: least significant bit of fraction 558 * frac_lsbm1: the bit below the least significant bit (for rounding) 559 * round_mask/roundeven_mask: masks used for rounding 560 * The following optional modifiers are available: 561 * arm_althp: handle ARM Alternative Half Precision 562 */ 563 typedef struct { 564 int exp_size; 565 int exp_bias; 566 int exp_max; 567 int frac_size; 568 int frac_shift; 569 uint64_t frac_lsb; 570 uint64_t frac_lsbm1; 571 uint64_t round_mask; 572 uint64_t roundeven_mask; 573 bool arm_althp; 574 } FloatFmt; 575 576 /* Expand fields based on the size of exponent and fraction */ 577 #define FLOAT_PARAMS(E, F) \ 578 .exp_size = E, \ 579 .exp_bias = ((1 << E) - 1) >> 1, \ 580 .exp_max = (1 << E) - 1, \ 581 .frac_size = F, \ 582 .frac_shift = (-F - 1) & 63, \ 583 .frac_lsb = 1ull << ((-F - 1) & 63), \ 584 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \ 585 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \ 586 .roundeven_mask = 
(2ull << ((-F - 1) & 63)) - 1 587 588 static const FloatFmt float16_params = { 589 FLOAT_PARAMS(5, 10) 590 }; 591 592 static const FloatFmt float16_params_ahp = { 593 FLOAT_PARAMS(5, 10), 594 .arm_althp = true 595 }; 596 597 static const FloatFmt bfloat16_params = { 598 FLOAT_PARAMS(8, 7) 599 }; 600 601 static const FloatFmt float32_params = { 602 FLOAT_PARAMS(8, 23) 603 }; 604 605 static const FloatFmt float64_params = { 606 FLOAT_PARAMS(11, 52) 607 }; 608 609 static const FloatFmt float128_params = { 610 FLOAT_PARAMS(15, 112) 611 }; 612 613 /* Unpack a float to parts, but do not canonicalize. */ 614 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw) 615 { 616 const int f_size = fmt->frac_size; 617 const int e_size = fmt->exp_size; 618 619 *r = (FloatParts64) { 620 .cls = float_class_unclassified, 621 .sign = extract64(raw, f_size + e_size, 1), 622 .exp = extract64(raw, f_size, e_size), 623 .frac = extract64(raw, 0, f_size) 624 }; 625 } 626 627 static inline void float16_unpack_raw(FloatParts64 *p, float16 f) 628 { 629 unpack_raw64(p, &float16_params, f); 630 } 631 632 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f) 633 { 634 unpack_raw64(p, &bfloat16_params, f); 635 } 636 637 static inline void float32_unpack_raw(FloatParts64 *p, float32 f) 638 { 639 unpack_raw64(p, &float32_params, f); 640 } 641 642 static inline void float64_unpack_raw(FloatParts64 *p, float64 f) 643 { 644 unpack_raw64(p, &float64_params, f); 645 } 646 647 static void float128_unpack_raw(FloatParts128 *p, float128 f) 648 { 649 const int f_size = float128_params.frac_size - 64; 650 const int e_size = float128_params.exp_size; 651 652 *p = (FloatParts128) { 653 .cls = float_class_unclassified, 654 .sign = extract64(f.high, f_size + e_size, 1), 655 .exp = extract64(f.high, f_size, e_size), 656 .frac_hi = extract64(f.high, 0, f_size), 657 .frac_lo = f.low, 658 }; 659 } 660 661 /* Pack a float from parts, but do not canonicalize. 
*/ 662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt) 663 { 664 const int f_size = fmt->frac_size; 665 const int e_size = fmt->exp_size; 666 uint64_t ret; 667 668 ret = (uint64_t)p->sign << (f_size + e_size); 669 ret = deposit64(ret, f_size, e_size, p->exp); 670 ret = deposit64(ret, 0, f_size, p->frac); 671 return ret; 672 } 673 674 static inline float16 float16_pack_raw(const FloatParts64 *p) 675 { 676 return make_float16(pack_raw64(p, &float16_params)); 677 } 678 679 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p) 680 { 681 return pack_raw64(p, &bfloat16_params); 682 } 683 684 static inline float32 float32_pack_raw(const FloatParts64 *p) 685 { 686 return make_float32(pack_raw64(p, &float32_params)); 687 } 688 689 static inline float64 float64_pack_raw(const FloatParts64 *p) 690 { 691 return make_float64(pack_raw64(p, &float64_params)); 692 } 693 694 static float128 float128_pack_raw(const FloatParts128 *p) 695 { 696 const int f_size = float128_params.frac_size - 64; 697 const int e_size = float128_params.exp_size; 698 uint64_t hi; 699 700 hi = (uint64_t)p->sign << (f_size + e_size); 701 hi = deposit64(hi, f_size, e_size, p->exp); 702 hi = deposit64(hi, 0, f_size, p->frac_hi); 703 return make_float128(hi, p->frac_lo); 704 } 705 706 /*---------------------------------------------------------------------------- 707 | Functions and definitions to determine: (1) whether tininess for underflow 708 | is detected before or after rounding by default, (2) what (if anything) 709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 711 | are propagated from function inputs to output. These details are target- 712 | specific. 
713 *----------------------------------------------------------------------------*/ 714 #include "softfloat-specialize.c.inc" 715 716 #define PARTS_GENERIC_64_128(NAME, P) \ 717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME) 718 719 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S) 720 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S) 721 722 static void parts64_return_nan(FloatParts64 *a, float_status *s); 723 static void parts128_return_nan(FloatParts128 *a, float_status *s); 724 725 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S) 726 727 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b, 728 float_status *s); 729 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b, 730 float_status *s); 731 732 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S) 733 734 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b, 735 FloatParts64 *c, float_status *s, 736 int ab_mask, int abc_mask); 737 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a, 738 FloatParts128 *b, 739 FloatParts128 *c, 740 float_status *s, 741 int ab_mask, int abc_mask); 742 743 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \ 744 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM) 745 746 static void parts64_canonicalize(FloatParts64 *p, float_status *status, 747 const FloatFmt *fmt); 748 static void parts128_canonicalize(FloatParts128 *p, float_status *status, 749 const FloatFmt *fmt); 750 751 #define parts_canonicalize(A, S, F) \ 752 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F) 753 754 static void parts64_uncanon(FloatParts64 *p, float_status *status, 755 const FloatFmt *fmt); 756 static void parts128_uncanon(FloatParts128 *p, float_status *status, 757 const FloatFmt *fmt); 758 759 #define parts_uncanon(A, S, F) \ 760 PARTS_GENERIC_64_128(uncanon, A)(A, S, F) 761 762 static void 
parts64_add_normal(FloatParts64 *a, FloatParts64 *b); 763 static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b); 764 765 #define parts_add_normal(A, B) \ 766 PARTS_GENERIC_64_128(add_normal, A)(A, B) 767 768 static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b); 769 static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b); 770 771 #define parts_sub_normal(A, B) \ 772 PARTS_GENERIC_64_128(sub_normal, A)(A, B) 773 774 static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b, 775 float_status *s, bool subtract); 776 static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b, 777 float_status *s, bool subtract); 778 779 #define parts_addsub(A, B, S, Z) \ 780 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z) 781 782 static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b, 783 float_status *s); 784 static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b, 785 float_status *s); 786 787 #define parts_mul(A, B, S) \ 788 PARTS_GENERIC_64_128(mul, A)(A, B, S) 789 790 /* 791 * Helper functions for softfloat-parts.c.inc, per-size operations. 
792 */ 793 794 #define FRAC_GENERIC_64_128(NAME, P) \ 795 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME) 796 797 static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b) 798 { 799 return uadd64_overflow(a->frac, b->frac, &r->frac); 800 } 801 802 static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b) 803 { 804 bool c = 0; 805 r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c); 806 r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c); 807 return c; 808 } 809 810 #define frac_add(R, A, B) FRAC_GENERIC_64_128(add, R)(R, A, B) 811 812 static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c) 813 { 814 return uadd64_overflow(a->frac, c, &r->frac); 815 } 816 817 static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c) 818 { 819 c = uadd64_overflow(a->frac_lo, c, &r->frac_lo); 820 return uadd64_overflow(a->frac_hi, c, &r->frac_hi); 821 } 822 823 #define frac_addi(R, A, C) FRAC_GENERIC_64_128(addi, R)(R, A, C) 824 825 static void frac64_allones(FloatParts64 *a) 826 { 827 a->frac = -1; 828 } 829 830 static void frac128_allones(FloatParts128 *a) 831 { 832 a->frac_hi = a->frac_lo = -1; 833 } 834 835 #define frac_allones(A) FRAC_GENERIC_64_128(allones, A)(A) 836 837 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b) 838 { 839 return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1; 840 } 841 842 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b) 843 { 844 uint64_t ta = a->frac_hi, tb = b->frac_hi; 845 if (ta == tb) { 846 ta = a->frac_lo, tb = b->frac_lo; 847 if (ta == tb) { 848 return 0; 849 } 850 } 851 return ta < tb ? 
-1 : 1; 852 } 853 854 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B) 855 856 static void frac64_clear(FloatParts64 *a) 857 { 858 a->frac = 0; 859 } 860 861 static void frac128_clear(FloatParts128 *a) 862 { 863 a->frac_hi = a->frac_lo = 0; 864 } 865 866 #define frac_clear(A) FRAC_GENERIC_64_128(clear, A)(A) 867 868 static bool frac64_eqz(FloatParts64 *a) 869 { 870 return a->frac == 0; 871 } 872 873 static bool frac128_eqz(FloatParts128 *a) 874 { 875 return (a->frac_hi | a->frac_lo) == 0; 876 } 877 878 #define frac_eqz(A) FRAC_GENERIC_64_128(eqz, A)(A) 879 880 static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b) 881 { 882 mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac); 883 } 884 885 static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b) 886 { 887 mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo, 888 &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo); 889 } 890 891 #define frac_mulw(R, A, B) FRAC_GENERIC_64_128(mulw, A)(R, A, B) 892 893 static void frac64_neg(FloatParts64 *a) 894 { 895 a->frac = -a->frac; 896 } 897 898 static void frac128_neg(FloatParts128 *a) 899 { 900 bool c = 0; 901 a->frac_lo = usub64_borrow(0, a->frac_lo, &c); 902 a->frac_hi = usub64_borrow(0, a->frac_hi, &c); 903 } 904 905 #define frac_neg(A) FRAC_GENERIC_64_128(neg, A)(A) 906 907 static int frac64_normalize(FloatParts64 *a) 908 { 909 if (a->frac) { 910 int shift = clz64(a->frac); 911 a->frac <<= shift; 912 return shift; 913 } 914 return 64; 915 } 916 917 static int frac128_normalize(FloatParts128 *a) 918 { 919 if (a->frac_hi) { 920 int shl = clz64(a->frac_hi); 921 if (shl) { 922 int shr = 64 - shl; 923 a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr); 924 a->frac_lo = (a->frac_lo << shl); 925 } 926 return shl; 927 } else if (a->frac_lo) { 928 int shl = clz64(a->frac_lo); 929 a->frac_hi = (a->frac_lo << shl); 930 a->frac_lo = 0; 931 return shl + 64; 932 } 933 return 128; 934 } 935 936 #define frac_normalize(A) 
FRAC_GENERIC_64_128(normalize, A)(A) 937 938 static void frac64_shl(FloatParts64 *a, int c) 939 { 940 a->frac <<= c; 941 } 942 943 static void frac128_shl(FloatParts128 *a, int c) 944 { 945 shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo); 946 } 947 948 #define frac_shl(A, C) FRAC_GENERIC_64_128(shl, A)(A, C) 949 950 static void frac64_shr(FloatParts64 *a, int c) 951 { 952 a->frac >>= c; 953 } 954 955 static void frac128_shr(FloatParts128 *a, int c) 956 { 957 shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo); 958 } 959 960 #define frac_shr(A, C) FRAC_GENERIC_64_128(shr, A)(A, C) 961 962 static void frac64_shrjam(FloatParts64 *a, int c) 963 { 964 shift64RightJamming(a->frac, c, &a->frac); 965 } 966 967 static void frac128_shrjam(FloatParts128 *a, int c) 968 { 969 shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo); 970 } 971 972 #define frac_shrjam(A, C) FRAC_GENERIC_64_128(shrjam, A)(A, C) 973 974 static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b) 975 { 976 return usub64_overflow(a->frac, b->frac, &r->frac); 977 } 978 979 static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b) 980 { 981 bool c = 0; 982 r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c); 983 r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c); 984 return c; 985 } 986 987 #define frac_sub(R, A, B) FRAC_GENERIC_64_128(sub, R)(R, A, B) 988 989 static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a) 990 { 991 r->frac = a->frac_hi | (a->frac_lo != 0); 992 } 993 994 static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a) 995 { 996 r->frac_hi = a->frac_hi; 997 r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0); 998 } 999 1000 #define frac_truncjam(R, A) FRAC_GENERIC_64_128(truncjam, R)(R, A) 1001 1002 #define partsN(NAME) glue(glue(glue(parts,N),_),NAME) 1003 #define FloatPartsN glue(FloatParts,N) 1004 #define FloatPartsW glue(FloatParts,W) 1005 1006 #define N 64 1007 #define 
W 128 1008 1009 #include "softfloat-parts-addsub.c.inc" 1010 #include "softfloat-parts.c.inc" 1011 1012 #undef N 1013 #undef W 1014 #define N 128 1015 #define W 256 1016 1017 #include "softfloat-parts-addsub.c.inc" 1018 #include "softfloat-parts.c.inc" 1019 1020 #undef N 1021 #undef W 1022 #undef partsN 1023 #undef FloatPartsN 1024 #undef FloatPartsW 1025 1026 /* 1027 * Pack/unpack routines with a specific FloatFmt. 1028 */ 1029 1030 static void float16a_unpack_canonical(FloatParts64 *p, float16 f, 1031 float_status *s, const FloatFmt *params) 1032 { 1033 float16_unpack_raw(p, f); 1034 parts_canonicalize(p, s, params); 1035 } 1036 1037 static void float16_unpack_canonical(FloatParts64 *p, float16 f, 1038 float_status *s) 1039 { 1040 float16a_unpack_canonical(p, f, s, &float16_params); 1041 } 1042 1043 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f, 1044 float_status *s) 1045 { 1046 bfloat16_unpack_raw(p, f); 1047 parts_canonicalize(p, s, &bfloat16_params); 1048 } 1049 1050 static float16 float16a_round_pack_canonical(FloatParts64 *p, 1051 float_status *s, 1052 const FloatFmt *params) 1053 { 1054 parts_uncanon(p, s, params); 1055 return float16_pack_raw(p); 1056 } 1057 1058 static float16 float16_round_pack_canonical(FloatParts64 *p, 1059 float_status *s) 1060 { 1061 return float16a_round_pack_canonical(p, s, &float16_params); 1062 } 1063 1064 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p, 1065 float_status *s) 1066 { 1067 parts_uncanon(p, s, &bfloat16_params); 1068 return bfloat16_pack_raw(p); 1069 } 1070 1071 static void float32_unpack_canonical(FloatParts64 *p, float32 f, 1072 float_status *s) 1073 { 1074 float32_unpack_raw(p, f); 1075 parts_canonicalize(p, s, &float32_params); 1076 } 1077 1078 static float32 float32_round_pack_canonical(FloatParts64 *p, 1079 float_status *s) 1080 { 1081 parts_uncanon(p, s, &float32_params); 1082 return float32_pack_raw(p); 1083 } 1084 1085 static void float64_unpack_canonical(FloatParts64 *p, 
float64 f, 1086 float_status *s) 1087 { 1088 float64_unpack_raw(p, f); 1089 parts_canonicalize(p, s, &float64_params); 1090 } 1091 1092 static float64 float64_round_pack_canonical(FloatParts64 *p, 1093 float_status *s) 1094 { 1095 parts_uncanon(p, s, &float64_params); 1096 return float64_pack_raw(p); 1097 } 1098 1099 static void float128_unpack_canonical(FloatParts128 *p, float128 f, 1100 float_status *s) 1101 { 1102 float128_unpack_raw(p, f); 1103 parts_canonicalize(p, s, &float128_params); 1104 } 1105 1106 static float128 float128_round_pack_canonical(FloatParts128 *p, 1107 float_status *s) 1108 { 1109 parts_uncanon(p, s, &float128_params); 1110 return float128_pack_raw(p); 1111 } 1112 1113 /* 1114 * Addition and subtraction 1115 */ 1116 1117 static float16 QEMU_FLATTEN 1118 float16_addsub(float16 a, float16 b, float_status *status, bool subtract) 1119 { 1120 FloatParts64 pa, pb, *pr; 1121 1122 float16_unpack_canonical(&pa, a, status); 1123 float16_unpack_canonical(&pb, b, status); 1124 pr = parts_addsub(&pa, &pb, status, subtract); 1125 1126 return float16_round_pack_canonical(pr, status); 1127 } 1128 1129 float16 float16_add(float16 a, float16 b, float_status *status) 1130 { 1131 return float16_addsub(a, b, status, false); 1132 } 1133 1134 float16 float16_sub(float16 a, float16 b, float_status *status) 1135 { 1136 return float16_addsub(a, b, status, true); 1137 } 1138 1139 static float32 QEMU_SOFTFLOAT_ATTR 1140 soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract) 1141 { 1142 FloatParts64 pa, pb, *pr; 1143 1144 float32_unpack_canonical(&pa, a, status); 1145 float32_unpack_canonical(&pb, b, status); 1146 pr = parts_addsub(&pa, &pb, status, subtract); 1147 1148 return float32_round_pack_canonical(pr, status); 1149 } 1150 1151 static float32 soft_f32_add(float32 a, float32 b, float_status *status) 1152 { 1153 return soft_f32_addsub(a, b, status, false); 1154 } 1155 1156 static float32 soft_f32_sub(float32 a, float32 b, float_status *status) 
1157 { 1158 return soft_f32_addsub(a, b, status, true); 1159 } 1160 1161 static float64 QEMU_SOFTFLOAT_ATTR 1162 soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract) 1163 { 1164 FloatParts64 pa, pb, *pr; 1165 1166 float64_unpack_canonical(&pa, a, status); 1167 float64_unpack_canonical(&pb, b, status); 1168 pr = parts_addsub(&pa, &pb, status, subtract); 1169 1170 return float64_round_pack_canonical(pr, status); 1171 } 1172 1173 static float64 soft_f64_add(float64 a, float64 b, float_status *status) 1174 { 1175 return soft_f64_addsub(a, b, status, false); 1176 } 1177 1178 static float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1179 { 1180 return soft_f64_addsub(a, b, status, true); 1181 } 1182 1183 static float hard_f32_add(float a, float b) 1184 { 1185 return a + b; 1186 } 1187 1188 static float hard_f32_sub(float a, float b) 1189 { 1190 return a - b; 1191 } 1192 1193 static double hard_f64_add(double a, double b) 1194 { 1195 return a + b; 1196 } 1197 1198 static double hard_f64_sub(double a, double b) 1199 { 1200 return a - b; 1201 } 1202 1203 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1204 { 1205 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1206 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1207 } 1208 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1209 } 1210 1211 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1212 { 1213 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1214 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1215 } else { 1216 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1217 } 1218 } 1219 1220 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1221 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1222 { 1223 return float32_gen2(a, b, s, hard, soft, 1224 f32_is_zon2, f32_addsubmul_post); 1225 } 1226 1227 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1228 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1229 
{ 1230 return float64_gen2(a, b, s, hard, soft, 1231 f64_is_zon2, f64_addsubmul_post); 1232 } 1233 1234 float32 QEMU_FLATTEN 1235 float32_add(float32 a, float32 b, float_status *s) 1236 { 1237 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1238 } 1239 1240 float32 QEMU_FLATTEN 1241 float32_sub(float32 a, float32 b, float_status *s) 1242 { 1243 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1244 } 1245 1246 float64 QEMU_FLATTEN 1247 float64_add(float64 a, float64 b, float_status *s) 1248 { 1249 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1250 } 1251 1252 float64 QEMU_FLATTEN 1253 float64_sub(float64 a, float64 b, float_status *s) 1254 { 1255 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1256 } 1257 1258 static bfloat16 QEMU_FLATTEN 1259 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract) 1260 { 1261 FloatParts64 pa, pb, *pr; 1262 1263 bfloat16_unpack_canonical(&pa, a, status); 1264 bfloat16_unpack_canonical(&pb, b, status); 1265 pr = parts_addsub(&pa, &pb, status, subtract); 1266 1267 return bfloat16_round_pack_canonical(pr, status); 1268 } 1269 1270 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status) 1271 { 1272 return bfloat16_addsub(a, b, status, false); 1273 } 1274 1275 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status) 1276 { 1277 return bfloat16_addsub(a, b, status, true); 1278 } 1279 1280 static float128 QEMU_FLATTEN 1281 float128_addsub(float128 a, float128 b, float_status *status, bool subtract) 1282 { 1283 FloatParts128 pa, pb, *pr; 1284 1285 float128_unpack_canonical(&pa, a, status); 1286 float128_unpack_canonical(&pb, b, status); 1287 pr = parts_addsub(&pa, &pb, status, subtract); 1288 1289 return float128_round_pack_canonical(pr, status); 1290 } 1291 1292 float128 float128_add(float128 a, float128 b, float_status *status) 1293 { 1294 return float128_addsub(a, b, status, false); 1295 } 1296 1297 float128 float128_sub(float128 a, float128 b, 
float_status *status) 1298 { 1299 return float128_addsub(a, b, status, true); 1300 } 1301 1302 /* 1303 * Multiplication 1304 */ 1305 1306 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1307 { 1308 FloatParts64 pa, pb, *pr; 1309 1310 float16_unpack_canonical(&pa, a, status); 1311 float16_unpack_canonical(&pb, b, status); 1312 pr = parts_mul(&pa, &pb, status); 1313 1314 return float16_round_pack_canonical(pr, status); 1315 } 1316 1317 static float32 QEMU_SOFTFLOAT_ATTR 1318 soft_f32_mul(float32 a, float32 b, float_status *status) 1319 { 1320 FloatParts64 pa, pb, *pr; 1321 1322 float32_unpack_canonical(&pa, a, status); 1323 float32_unpack_canonical(&pb, b, status); 1324 pr = parts_mul(&pa, &pb, status); 1325 1326 return float32_round_pack_canonical(pr, status); 1327 } 1328 1329 static float64 QEMU_SOFTFLOAT_ATTR 1330 soft_f64_mul(float64 a, float64 b, float_status *status) 1331 { 1332 FloatParts64 pa, pb, *pr; 1333 1334 float64_unpack_canonical(&pa, a, status); 1335 float64_unpack_canonical(&pb, b, status); 1336 pr = parts_mul(&pa, &pb, status); 1337 1338 return float64_round_pack_canonical(pr, status); 1339 } 1340 1341 static float hard_f32_mul(float a, float b) 1342 { 1343 return a * b; 1344 } 1345 1346 static double hard_f64_mul(double a, double b) 1347 { 1348 return a * b; 1349 } 1350 1351 float32 QEMU_FLATTEN 1352 float32_mul(float32 a, float32 b, float_status *s) 1353 { 1354 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1355 f32_is_zon2, f32_addsubmul_post); 1356 } 1357 1358 float64 QEMU_FLATTEN 1359 float64_mul(float64 a, float64 b, float_status *s) 1360 { 1361 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1362 f64_is_zon2, f64_addsubmul_post); 1363 } 1364 1365 bfloat16 QEMU_FLATTEN 1366 bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status) 1367 { 1368 FloatParts64 pa, pb, *pr; 1369 1370 bfloat16_unpack_canonical(&pa, a, status); 1371 bfloat16_unpack_canonical(&pb, b, status); 1372 pr = parts_mul(&pa, 
/* float128 a * b: unpack into 128-bit parts, parts_mul(), round and pack. */
float128 QEMU_FLATTEN
float128_mul(float128 a, float128 b, float_status *status)
{
    FloatParts128 pa, pb, *pr;

    float128_unpack_canonical(&pa, a, status);
    float128_unpack_canonical(&pb, b, status);
    pr = parts_mul(&pa, &pb, status);

    return float128_round_pack_canonical(pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * The operands are passed by value in unpacked form and the result is
 * returned in the same form; rounding and packing are left to the
 * caller.  float_muladd_halve_result decrements the result exponent
 * by one.
 */

static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class bitmasks of the multiplicands, and of all three operands. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* Inf * 0 with no NaN operand: invalid, result is the default NaN. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Class of the product a * b (NaNs were handled above). */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    /* Infinite addend: Inf + (-Inf) is invalid, otherwise c wins. */
    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    /* Infinite product with a finite addend. */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is c, apart from the sign of 0 + 0. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* Zeros of opposite sign: negative only when rounding down. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Full product of the two fractions, kept in hi:lo. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb.  */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Product exponent <= addend's: align hi down to c. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    /* Carry out: shift right and restore the top bit. */
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product exponent is larger: align c down, add 128-bit. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= addend: p - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - p and take the opposite sign. */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Leading zero count of the 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold any nonzero low word into the lsb of hi. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}
/* float16 fused multiply-add: unpack, muladd_floats(), round and pack. */
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat float32 fused multiply-add. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat float64 fused multiply-add. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}

/* When true, float32_muladd() and float64_muladd() always use softfloat. */
static bool force_soft_fma;

/*
 * float32 fused multiply-add, xa * xb + xc, with @flags selecting the
 * float_muladd_* negations.
 *
 * Uses the host fmaf() when every check below passes; otherwise it
 * jumps to the "soft" label and returns the soft_f32_muladd() result.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The host path below does not implement halve_result. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Build the signed-zero product by hand, then add c on the host. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        /* Saved so the soft fallback sees the operands before negation. */
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /*
             * Magnitude at or below the smallest normal float: redo the
             * operation in softfloat with the original operands.
             */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}
(unlikely(flags & float_muladd_halve_result)) { 1631 goto soft; 1632 } 1633 1634 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1635 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1636 goto soft; 1637 } 1638 1639 if (unlikely(force_soft_fma)) { 1640 goto soft; 1641 } 1642 1643 /* 1644 * When (a || b) == 0, there's no need to check for under/over flow, 1645 * since we know the addend is (normal || 0) and the product is 0. 1646 */ 1647 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1648 union_float32 up; 1649 bool prod_sign; 1650 1651 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1652 prod_sign ^= !!(flags & float_muladd_negate_product); 1653 up.s = float32_set_sign(float32_zero, prod_sign); 1654 1655 if (flags & float_muladd_negate_c) { 1656 uc.h = -uc.h; 1657 } 1658 ur.h = up.h + uc.h; 1659 } else { 1660 union_float32 ua_orig = ua; 1661 union_float32 uc_orig = uc; 1662 1663 if (flags & float_muladd_negate_product) { 1664 ua.h = -ua.h; 1665 } 1666 if (flags & float_muladd_negate_c) { 1667 uc.h = -uc.h; 1668 } 1669 1670 ur.h = fmaf(ua.h, ub.h, uc.h); 1671 1672 if (unlikely(f32_is_inf(ur))) { 1673 float_raise(float_flag_overflow, s); 1674 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1675 ua = ua_orig; 1676 uc = uc_orig; 1677 goto soft; 1678 } 1679 } 1680 if (flags & float_muladd_negate_result) { 1681 return float32_chs(ur.s); 1682 } 1683 return ur.s; 1684 1685 soft: 1686 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1687 } 1688 1689 float64 QEMU_FLATTEN 1690 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1691 { 1692 union_float64 ua, ub, uc, ur; 1693 1694 ua.s = xa; 1695 ub.s = xb; 1696 uc.s = xc; 1697 1698 if (unlikely(!can_use_fpu(s))) { 1699 goto soft; 1700 } 1701 if (unlikely(flags & float_muladd_halve_result)) { 1702 goto soft; 1703 } 1704 1705 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1706 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1707 goto soft; 1708 } 1709 1710 if (unlikely(force_soft_fma)) { 1711 goto 
soft; 1712 } 1713 1714 /* 1715 * When (a || b) == 0, there's no need to check for under/over flow, 1716 * since we know the addend is (normal || 0) and the product is 0. 1717 */ 1718 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1719 union_float64 up; 1720 bool prod_sign; 1721 1722 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1723 prod_sign ^= !!(flags & float_muladd_negate_product); 1724 up.s = float64_set_sign(float64_zero, prod_sign); 1725 1726 if (flags & float_muladd_negate_c) { 1727 uc.h = -uc.h; 1728 } 1729 ur.h = up.h + uc.h; 1730 } else { 1731 union_float64 ua_orig = ua; 1732 union_float64 uc_orig = uc; 1733 1734 if (flags & float_muladd_negate_product) { 1735 ua.h = -ua.h; 1736 } 1737 if (flags & float_muladd_negate_c) { 1738 uc.h = -uc.h; 1739 } 1740 1741 ur.h = fma(ua.h, ub.h, uc.h); 1742 1743 if (unlikely(f64_is_inf(ur))) { 1744 float_raise(float_flag_overflow, s); 1745 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1746 ua = ua_orig; 1747 uc = uc_orig; 1748 goto soft; 1749 } 1750 } 1751 if (flags & float_muladd_negate_result) { 1752 return float64_chs(ur.s); 1753 } 1754 return ur.s; 1755 1756 soft: 1757 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1758 } 1759 1760 /* 1761 * Returns the result of multiplying the bfloat16 values `a' 1762 * and `b' then adding 'c', with no intermediate rounding step after the 1763 * multiplication. 1764 */ 1765 1766 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c, 1767 int flags, float_status *status) 1768 { 1769 FloatParts64 pa, pb, pc, pr; 1770 1771 bfloat16_unpack_canonical(&pa, a, status); 1772 bfloat16_unpack_canonical(&pb, b, status); 1773 bfloat16_unpack_canonical(&pc, c, status); 1774 pr = muladd_floats(pa, pb, pc, flags, status); 1775 1776 return bfloat16_round_pack_canonical(&pr, status); 1777 } 1778 1779 /* 1780 * Returns the result of dividing the floating-point value `a' by the 1781 * corresponding value `b'. 
/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *
 * Operands and result are in unpacked form; rounding and packing are
 * left to the caller.  Raises float_flag_invalid for 0/0 and Inf/Inf,
 * and float_flag_divbyzero for a non-zero finite value divided by zero.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        /* n1:n0 is the widened dividend; q is the quotient, r the remainder. */
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* Every class combination has been handled above. */
    g_assert_not_reached();
}
*/ 1813 a.frac = q | (r != 0); 1814 a.sign = sign; 1815 a.exp = exp; 1816 return a; 1817 } 1818 /* handle all the NaN cases */ 1819 if (is_nan(a.cls) || is_nan(b.cls)) { 1820 return *parts_pick_nan(&a, &b, s); 1821 } 1822 /* 0/0 or Inf/Inf */ 1823 if (a.cls == b.cls 1824 && 1825 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1826 float_raise(float_flag_invalid, s); 1827 parts_default_nan(&a, s); 1828 return a; 1829 } 1830 /* Inf / x or 0 / x */ 1831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1832 a.sign = sign; 1833 return a; 1834 } 1835 /* Div 0 => Inf */ 1836 if (b.cls == float_class_zero) { 1837 float_raise(float_flag_divbyzero, s); 1838 a.cls = float_class_inf; 1839 a.sign = sign; 1840 return a; 1841 } 1842 /* Div by Inf */ 1843 if (b.cls == float_class_inf) { 1844 a.cls = float_class_zero; 1845 a.sign = sign; 1846 return a; 1847 } 1848 g_assert_not_reached(); 1849 } 1850 1851 float16 float16_div(float16 a, float16 b, float_status *status) 1852 { 1853 FloatParts64 pa, pb, pr; 1854 1855 float16_unpack_canonical(&pa, a, status); 1856 float16_unpack_canonical(&pb, b, status); 1857 pr = div_floats(pa, pb, status); 1858 1859 return float16_round_pack_canonical(&pr, status); 1860 } 1861 1862 static float32 QEMU_SOFTFLOAT_ATTR 1863 soft_f32_div(float32 a, float32 b, float_status *status) 1864 { 1865 FloatParts64 pa, pb, pr; 1866 1867 float32_unpack_canonical(&pa, a, status); 1868 float32_unpack_canonical(&pb, b, status); 1869 pr = div_floats(pa, pb, status); 1870 1871 return float32_round_pack_canonical(&pr, status); 1872 } 1873 1874 static float64 QEMU_SOFTFLOAT_ATTR 1875 soft_f64_div(float64 a, float64 b, float_status *status) 1876 { 1877 FloatParts64 pa, pb, pr; 1878 1879 float64_unpack_canonical(&pa, a, status); 1880 float64_unpack_canonical(&pb, b, status); 1881 pr = div_floats(pa, pb, status); 1882 1883 return float64_round_pack_canonical(&pr, status); 1884 } 1885 1886 static float hard_f32_div(float a, float b) 1887 { 1888 return a 
/ b; 1889 } 1890 1891 static double hard_f64_div(double a, double b) 1892 { 1893 return a / b; 1894 } 1895 1896 static bool f32_div_pre(union_float32 a, union_float32 b) 1897 { 1898 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1899 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1900 fpclassify(b.h) == FP_NORMAL; 1901 } 1902 return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); 1903 } 1904 1905 static bool f64_div_pre(union_float64 a, union_float64 b) 1906 { 1907 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1908 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 1909 fpclassify(b.h) == FP_NORMAL; 1910 } 1911 return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); 1912 } 1913 1914 static bool f32_div_post(union_float32 a, union_float32 b) 1915 { 1916 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1917 return fpclassify(a.h) != FP_ZERO; 1918 } 1919 return !float32_is_zero(a.s); 1920 } 1921 1922 static bool f64_div_post(union_float64 a, union_float64 b) 1923 { 1924 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1925 return fpclassify(a.h) != FP_ZERO; 1926 } 1927 return !float64_is_zero(a.s); 1928 } 1929 1930 float32 QEMU_FLATTEN 1931 float32_div(float32 a, float32 b, float_status *s) 1932 { 1933 return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, 1934 f32_div_pre, f32_div_post); 1935 } 1936 1937 float64 QEMU_FLATTEN 1938 float64_div(float64 a, float64 b, float_status *s) 1939 { 1940 return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, 1941 f64_div_pre, f64_div_post); 1942 } 1943 1944 /* 1945 * Returns the result of dividing the bfloat16 1946 * value `a' by the corresponding value `b'. 
1947 */ 1948 1949 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status) 1950 { 1951 FloatParts64 pa, pb, pr; 1952 1953 bfloat16_unpack_canonical(&pa, a, status); 1954 bfloat16_unpack_canonical(&pb, b, status); 1955 pr = div_floats(pa, pb, status); 1956 1957 return bfloat16_round_pack_canonical(&pr, status); 1958 } 1959 1960 /* 1961 * Float to Float conversions 1962 * 1963 * Returns the result of converting one float format to another. The 1964 * conversion is performed according to the IEC/IEEE Standard for 1965 * Binary Floating-Point Arithmetic. 1966 * 1967 * The float_to_float helper only needs to take care of raising 1968 * invalid exceptions and handling the conversion on NaNs. 1969 */ 1970 1971 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf, 1972 float_status *s) 1973 { 1974 if (dstf->arm_althp) { 1975 switch (a.cls) { 1976 case float_class_qnan: 1977 case float_class_snan: 1978 /* There is no NaN in the destination format. Raise Invalid 1979 * and return a zero with the sign of the input NaN. 1980 */ 1981 float_raise(float_flag_invalid, s); 1982 a.cls = float_class_zero; 1983 a.frac = 0; 1984 a.exp = 0; 1985 break; 1986 1987 case float_class_inf: 1988 /* There is no Inf in the destination format. Raise Invalid 1989 * and return the maximum normal with the correct sign. 1990 */ 1991 float_raise(float_flag_invalid, s); 1992 a.cls = float_class_normal; 1993 a.exp = dstf->exp_max; 1994 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1995 break; 1996 1997 default: 1998 break; 1999 } 2000 } else if (is_nan(a.cls)) { 2001 parts_return_nan(&a, s); 2002 } 2003 return a; 2004 } 2005 2006 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 2007 { 2008 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 2009 FloatParts64 pa, pr; 2010 2011 float16a_unpack_canonical(&pa, a, s, fmt16); 2012 pr = float_to_float(pa, &float32_params, s); 2013 return float32_round_pack_canonical(&pr, s); 2014 } 2015 2016 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 2017 { 2018 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2019 FloatParts64 pa, pr; 2020 2021 float16a_unpack_canonical(&pa, a, s, fmt16); 2022 pr = float_to_float(pa, &float64_params, s); 2023 return float64_round_pack_canonical(&pr, s); 2024 } 2025 2026 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 2027 { 2028 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2029 FloatParts64 pa, pr; 2030 2031 float32_unpack_canonical(&pa, a, s); 2032 pr = float_to_float(pa, fmt16, s); 2033 return float16a_round_pack_canonical(&pr, s, fmt16); 2034 } 2035 2036 static float64 QEMU_SOFTFLOAT_ATTR 2037 soft_float32_to_float64(float32 a, float_status *s) 2038 { 2039 FloatParts64 pa, pr; 2040 2041 float32_unpack_canonical(&pa, a, s); 2042 pr = float_to_float(pa, &float64_params, s); 2043 return float64_round_pack_canonical(&pr, s); 2044 } 2045 2046 float64 float32_to_float64(float32 a, float_status *s) 2047 { 2048 if (likely(float32_is_normal(a))) { 2049 /* Widening conversion can never produce inexact results. */ 2050 union_float32 uf; 2051 union_float64 ud; 2052 uf.s = a; 2053 ud.h = uf.h; 2054 return ud.s; 2055 } else if (float32_is_zero(a)) { 2056 return float64_set_sign(float64_zero, float32_is_neg(a)); 2057 } else { 2058 return soft_float32_to_float64(a, s); 2059 } 2060 } 2061 2062 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 2063 { 2064 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 2065 FloatParts64 pa, pr; 2066 2067 float64_unpack_canonical(&pa, a, s); 2068 pr = float_to_float(pa, fmt16, s); 2069 return float16a_round_pack_canonical(&pr, s, fmt16); 2070 } 2071 2072 float32 float64_to_float32(float64 a, float_status *s) 2073 { 2074 FloatParts64 pa, pr; 2075 2076 float64_unpack_canonical(&pa, a, s); 2077 pr = float_to_float(pa, &float32_params, s); 2078 return float32_round_pack_canonical(&pr, s); 2079 } 2080 2081 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2082 { 2083 FloatParts64 pa, pr; 2084 2085 bfloat16_unpack_canonical(&pa, a, s); 2086 pr = float_to_float(pa, &float32_params, s); 2087 return float32_round_pack_canonical(&pr, s); 2088 } 2089 2090 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2091 { 2092 FloatParts64 pa, pr; 2093 2094 bfloat16_unpack_canonical(&pa, a, s); 2095 pr = float_to_float(pa, &float64_params, s); 2096 return float64_round_pack_canonical(&pr, s); 2097 } 2098 2099 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2100 { 2101 FloatParts64 pa, pr; 2102 2103 float32_unpack_canonical(&pa, a, s); 2104 pr = float_to_float(pa, &bfloat16_params, s); 2105 return bfloat16_round_pack_canonical(&pr, s); 2106 } 2107 2108 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2109 { 2110 FloatParts64 pa, pr; 2111 2112 float64_unpack_canonical(&pa, a, s); 2113 pr = float_to_float(pa, &bfloat16_params, s); 2114 return bfloat16_round_pack_canonical(&pr, s); 2115 } 2116 2117 /* 2118 * Rounds the floating-point value `a' to an integer, and returns the 2119 * result as a floating-point value. The operation is performed 2120 * according to the IEC/IEEE Standard for Binary Floating-Point 2121 * Arithmetic. 
/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 *
 * @rmode is the rounding mode to apply.  @scale is clamped to
 * [-0x10000, 0x10000] and added to the exponent before rounding.
 * float_flag_inexact is raised when fraction bits are discarded.
 */

static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            /* Decide whether the result is one (true) or zero (false). */
            switch (rmode) {
            case float_round_nearest_even:
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /*
             * frac_lsb:      the implicit bit shifted right by exp
             * frac_lsbm1:    the bit just below frac_lsb
             * rnd_mask:      every bit below frac_lsb
             * rnd_even_mask: rnd_mask plus frac_lsb itself
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* No increment when lsb is 0 and the rest is exactly half. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            /* Only touch the value when fraction bits are actually set. */
            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the top bit: shift right, bump exp. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
rnd_mask : 0; 2201 break; 2202 case float_round_to_odd: 2203 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2204 break; 2205 default: 2206 g_assert_not_reached(); 2207 } 2208 2209 if (a.frac & rnd_mask) { 2210 float_raise(float_flag_inexact, s); 2211 if (uadd64_overflow(a.frac, inc, &a.frac)) { 2212 a.frac >>= 1; 2213 a.frac |= DECOMPOSED_IMPLICIT_BIT; 2214 a.exp++; 2215 } 2216 a.frac &= ~rnd_mask; 2217 } 2218 } 2219 break; 2220 default: 2221 g_assert_not_reached(); 2222 } 2223 return a; 2224 } 2225 2226 float16 float16_round_to_int(float16 a, float_status *s) 2227 { 2228 FloatParts64 pa, pr; 2229 2230 float16_unpack_canonical(&pa, a, s); 2231 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2232 return float16_round_pack_canonical(&pr, s); 2233 } 2234 2235 float32 float32_round_to_int(float32 a, float_status *s) 2236 { 2237 FloatParts64 pa, pr; 2238 2239 float32_unpack_canonical(&pa, a, s); 2240 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2241 return float32_round_pack_canonical(&pr, s); 2242 } 2243 2244 float64 float64_round_to_int(float64 a, float_status *s) 2245 { 2246 FloatParts64 pa, pr; 2247 2248 float64_unpack_canonical(&pa, a, s); 2249 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2250 return float64_round_pack_canonical(&pr, s); 2251 } 2252 2253 /* 2254 * Rounds the bfloat16 value `a' to an integer, and returns the 2255 * result as a bfloat16 value. 2256 */ 2257 2258 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2259 { 2260 FloatParts64 pa, pr; 2261 2262 bfloat16_unpack_canonical(&pa, a, s); 2263 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2264 return bfloat16_round_pack_canonical(&pr, s); 2265 } 2266 2267 /* 2268 * Returns the result of converting the floating-point value `a' to 2269 * the two's complement integer format. 
The conversion is performed 2270 * according to the IEC/IEEE Standard for Binary Floating-Point 2271 * Arithmetic---which means in particular that the conversion is 2272 * rounded according to the current rounding mode. If `a' is a NaN, 2273 * the largest positive integer is returned. Otherwise, if the 2274 * conversion overflows, the largest integer with the same sign as `a' 2275 * is returned. 2276 */ 2277 2278 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode, 2279 int scale, int64_t min, int64_t max, 2280 float_status *s) 2281 { 2282 uint64_t r; 2283 int orig_flags = get_float_exception_flags(s); 2284 FloatParts64 p = round_to_int(in, rmode, scale, s); 2285 2286 switch (p.cls) { 2287 case float_class_snan: 2288 case float_class_qnan: 2289 s->float_exception_flags = orig_flags | float_flag_invalid; 2290 return max; 2291 case float_class_inf: 2292 s->float_exception_flags = orig_flags | float_flag_invalid; 2293 return p.sign ? min : max; 2294 case float_class_zero: 2295 return 0; 2296 case float_class_normal: 2297 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2298 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2299 } else { 2300 r = UINT64_MAX; 2301 } 2302 if (p.sign) { 2303 if (r <= -(uint64_t) min) { 2304 return -r; 2305 } else { 2306 s->float_exception_flags = orig_flags | float_flag_invalid; 2307 return min; 2308 } 2309 } else { 2310 if (r <= max) { 2311 return r; 2312 } else { 2313 s->float_exception_flags = orig_flags | float_flag_invalid; 2314 return max; 2315 } 2316 } 2317 default: 2318 g_assert_not_reached(); 2319 } 2320 } 2321 2322 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2323 float_status *s) 2324 { 2325 FloatParts64 p; 2326 2327 float16_unpack_canonical(&p, a, s); 2328 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s); 2329 } 2330 2331 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2332 float_status *s) 2333 { 2334 FloatParts64 p; 2335 2336 
float16_unpack_canonical(&p, a, s); 2337 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2338 } 2339 2340 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2341 float_status *s) 2342 { 2343 FloatParts64 p; 2344 2345 float16_unpack_canonical(&p, a, s); 2346 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2347 } 2348 2349 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2350 float_status *s) 2351 { 2352 FloatParts64 p; 2353 2354 float16_unpack_canonical(&p, a, s); 2355 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2356 } 2357 2358 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2359 float_status *s) 2360 { 2361 FloatParts64 p; 2362 2363 float32_unpack_canonical(&p, a, s); 2364 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2365 } 2366 2367 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2368 float_status *s) 2369 { 2370 FloatParts64 p; 2371 2372 float32_unpack_canonical(&p, a, s); 2373 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2374 } 2375 2376 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2377 float_status *s) 2378 { 2379 FloatParts64 p; 2380 2381 float32_unpack_canonical(&p, a, s); 2382 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2383 } 2384 2385 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2386 float_status *s) 2387 { 2388 FloatParts64 p; 2389 2390 float64_unpack_canonical(&p, a, s); 2391 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2392 } 2393 2394 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2395 float_status *s) 2396 { 2397 FloatParts64 p; 2398 2399 float64_unpack_canonical(&p, a, s); 2400 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2401 } 2402 2403 int64_t 
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2404 float_status *s) 2405 { 2406 FloatParts64 p; 2407 2408 float64_unpack_canonical(&p, a, s); 2409 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2410 } 2411 2412 int8_t float16_to_int8(float16 a, float_status *s) 2413 { 2414 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2415 } 2416 2417 int16_t float16_to_int16(float16 a, float_status *s) 2418 { 2419 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2420 } 2421 2422 int32_t float16_to_int32(float16 a, float_status *s) 2423 { 2424 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2425 } 2426 2427 int64_t float16_to_int64(float16 a, float_status *s) 2428 { 2429 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2430 } 2431 2432 int16_t float32_to_int16(float32 a, float_status *s) 2433 { 2434 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2435 } 2436 2437 int32_t float32_to_int32(float32 a, float_status *s) 2438 { 2439 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2440 } 2441 2442 int64_t float32_to_int64(float32 a, float_status *s) 2443 { 2444 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2445 } 2446 2447 int16_t float64_to_int16(float64 a, float_status *s) 2448 { 2449 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2450 } 2451 2452 int32_t float64_to_int32(float64 a, float_status *s) 2453 { 2454 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2455 } 2456 2457 int64_t float64_to_int64(float64 a, float_status *s) 2458 { 2459 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2460 } 2461 2462 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2463 { 2464 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2465 } 2466 2467 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2468 { 2469 return float16_to_int32_scalbn(a, 
float_round_to_zero, 0, s); 2470 } 2471 2472 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2473 { 2474 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2475 } 2476 2477 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2478 { 2479 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2480 } 2481 2482 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2483 { 2484 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2485 } 2486 2487 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2488 { 2489 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2490 } 2491 2492 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2493 { 2494 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2495 } 2496 2497 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2498 { 2499 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2500 } 2501 2502 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2503 { 2504 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2505 } 2506 2507 /* 2508 * Returns the result of converting the floating-point value `a' to 2509 * the two's complement integer format. 
2510 */ 2511 2512 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2513 float_status *s) 2514 { 2515 FloatParts64 p; 2516 2517 bfloat16_unpack_canonical(&p, a, s); 2518 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2519 } 2520 2521 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2522 float_status *s) 2523 { 2524 FloatParts64 p; 2525 2526 bfloat16_unpack_canonical(&p, a, s); 2527 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2528 } 2529 2530 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2531 float_status *s) 2532 { 2533 FloatParts64 p; 2534 2535 bfloat16_unpack_canonical(&p, a, s); 2536 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2537 } 2538 2539 int16_t bfloat16_to_int16(bfloat16 a, float_status *s) 2540 { 2541 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2542 } 2543 2544 int32_t bfloat16_to_int32(bfloat16 a, float_status *s) 2545 { 2546 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2547 } 2548 2549 int64_t bfloat16_to_int64(bfloat16 a, float_status *s) 2550 { 2551 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2552 } 2553 2554 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s) 2555 { 2556 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2557 } 2558 2559 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s) 2560 { 2561 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2562 } 2563 2564 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s) 2565 { 2566 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2567 } 2568 2569 /* 2570 * Returns the result of converting the floating-point value `a' to 2571 * the unsigned integer format. 
The conversion is performed according 2572 * to the IEC/IEEE Standard for Binary Floating-Point 2573 * Arithmetic---which means in particular that the conversion is 2574 * rounded according to the current rounding mode. If `a' is a NaN, 2575 * the largest unsigned integer is returned. Otherwise, if the 2576 * conversion overflows, the largest unsigned integer is returned. If 2577 * the 'a' is negative, the result is rounded and zero is returned; 2578 * values that do not round to zero will raise the inexact exception 2579 * flag. 2580 */ 2581 2582 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode, 2583 int scale, uint64_t max, 2584 float_status *s) 2585 { 2586 int orig_flags = get_float_exception_flags(s); 2587 FloatParts64 p = round_to_int(in, rmode, scale, s); 2588 uint64_t r; 2589 2590 switch (p.cls) { 2591 case float_class_snan: 2592 case float_class_qnan: 2593 s->float_exception_flags = orig_flags | float_flag_invalid; 2594 return max; 2595 case float_class_inf: 2596 s->float_exception_flags = orig_flags | float_flag_invalid; 2597 return p.sign ? 0 : max; 2598 case float_class_zero: 2599 return 0; 2600 case float_class_normal: 2601 if (p.sign) { 2602 s->float_exception_flags = orig_flags | float_flag_invalid; 2603 return 0; 2604 } 2605 2606 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2607 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2608 } else { 2609 s->float_exception_flags = orig_flags | float_flag_invalid; 2610 return max; 2611 } 2612 2613 /* For uint64 this will never trip, but if p.exp is too large 2614 * to shift a decomposed fraction we shall have exited via the 2615 * 3rd leg above. 
2616 */ 2617 if (r > max) { 2618 s->float_exception_flags = orig_flags | float_flag_invalid; 2619 return max; 2620 } 2621 return r; 2622 default: 2623 g_assert_not_reached(); 2624 } 2625 } 2626 2627 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2628 float_status *s) 2629 { 2630 FloatParts64 p; 2631 2632 float16_unpack_canonical(&p, a, s); 2633 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s); 2634 } 2635 2636 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2637 float_status *s) 2638 { 2639 FloatParts64 p; 2640 2641 float16_unpack_canonical(&p, a, s); 2642 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2643 } 2644 2645 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2646 float_status *s) 2647 { 2648 FloatParts64 p; 2649 2650 float16_unpack_canonical(&p, a, s); 2651 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2652 } 2653 2654 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2655 float_status *s) 2656 { 2657 FloatParts64 p; 2658 2659 float16_unpack_canonical(&p, a, s); 2660 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2661 } 2662 2663 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2664 float_status *s) 2665 { 2666 FloatParts64 p; 2667 2668 float32_unpack_canonical(&p, a, s); 2669 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2670 } 2671 2672 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2673 float_status *s) 2674 { 2675 FloatParts64 p; 2676 2677 float32_unpack_canonical(&p, a, s); 2678 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2679 } 2680 2681 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2682 float_status *s) 2683 { 2684 FloatParts64 p; 2685 2686 float32_unpack_canonical(&p, a, s); 2687 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2688 } 
2689 2690 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2691 float_status *s) 2692 { 2693 FloatParts64 p; 2694 2695 float64_unpack_canonical(&p, a, s); 2696 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2697 } 2698 2699 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2700 float_status *s) 2701 { 2702 FloatParts64 p; 2703 2704 float64_unpack_canonical(&p, a, s); 2705 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2706 } 2707 2708 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2709 float_status *s) 2710 { 2711 FloatParts64 p; 2712 2713 float64_unpack_canonical(&p, a, s); 2714 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2715 } 2716 2717 uint8_t float16_to_uint8(float16 a, float_status *s) 2718 { 2719 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2720 } 2721 2722 uint16_t float16_to_uint16(float16 a, float_status *s) 2723 { 2724 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2725 } 2726 2727 uint32_t float16_to_uint32(float16 a, float_status *s) 2728 { 2729 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2730 } 2731 2732 uint64_t float16_to_uint64(float16 a, float_status *s) 2733 { 2734 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2735 } 2736 2737 uint16_t float32_to_uint16(float32 a, float_status *s) 2738 { 2739 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2740 } 2741 2742 uint32_t float32_to_uint32(float32 a, float_status *s) 2743 { 2744 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2745 } 2746 2747 uint64_t float32_to_uint64(float32 a, float_status *s) 2748 { 2749 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2750 } 2751 2752 uint16_t float64_to_uint16(float64 a, float_status *s) 2753 { 2754 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2755 } 2756 2757 uint32_t 
float64_to_uint32(float64 a, float_status *s) 2758 { 2759 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2760 } 2761 2762 uint64_t float64_to_uint64(float64 a, float_status *s) 2763 { 2764 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2765 } 2766 2767 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2768 { 2769 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2770 } 2771 2772 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2773 { 2774 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2775 } 2776 2777 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2778 { 2779 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2780 } 2781 2782 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2783 { 2784 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2785 } 2786 2787 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2788 { 2789 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2790 } 2791 2792 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2793 { 2794 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2795 } 2796 2797 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2798 { 2799 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2800 } 2801 2802 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2803 { 2804 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2805 } 2806 2807 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2808 { 2809 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2810 } 2811 2812 /* 2813 * Returns the result of converting the bfloat16 value `a' to 2814 * the unsigned integer format. 
2815 */ 2816 2817 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2818 int scale, float_status *s) 2819 { 2820 FloatParts64 p; 2821 2822 bfloat16_unpack_canonical(&p, a, s); 2823 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2824 } 2825 2826 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2827 int scale, float_status *s) 2828 { 2829 FloatParts64 p; 2830 2831 bfloat16_unpack_canonical(&p, a, s); 2832 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2833 } 2834 2835 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2836 int scale, float_status *s) 2837 { 2838 FloatParts64 p; 2839 2840 bfloat16_unpack_canonical(&p, a, s); 2841 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2842 } 2843 2844 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2845 { 2846 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2847 } 2848 2849 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2850 { 2851 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2852 } 2853 2854 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2855 { 2856 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2857 } 2858 2859 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2860 { 2861 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2862 } 2863 2864 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2865 { 2866 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2867 } 2868 2869 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 2870 { 2871 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2872 } 2873 2874 /* 2875 * Integer to float conversions 2876 * 2877 * Returns the result of converting the two's complement integer `a' 2878 * to the floating-point format. 
The conversion is performed according 2879 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2880 */ 2881 2882 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status) 2883 { 2884 FloatParts64 r = { .sign = false }; 2885 2886 if (a == 0) { 2887 r.cls = float_class_zero; 2888 } else { 2889 uint64_t f = a; 2890 int shift; 2891 2892 r.cls = float_class_normal; 2893 if (a < 0) { 2894 f = -f; 2895 r.sign = true; 2896 } 2897 shift = clz64(f); 2898 scale = MIN(MAX(scale, -0x10000), 0x10000); 2899 2900 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2901 r.frac = f << shift; 2902 } 2903 2904 return r; 2905 } 2906 2907 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2908 { 2909 FloatParts64 pa = int_to_float(a, scale, status); 2910 return float16_round_pack_canonical(&pa, status); 2911 } 2912 2913 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2914 { 2915 return int64_to_float16_scalbn(a, scale, status); 2916 } 2917 2918 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2919 { 2920 return int64_to_float16_scalbn(a, scale, status); 2921 } 2922 2923 float16 int64_to_float16(int64_t a, float_status *status) 2924 { 2925 return int64_to_float16_scalbn(a, 0, status); 2926 } 2927 2928 float16 int32_to_float16(int32_t a, float_status *status) 2929 { 2930 return int64_to_float16_scalbn(a, 0, status); 2931 } 2932 2933 float16 int16_to_float16(int16_t a, float_status *status) 2934 { 2935 return int64_to_float16_scalbn(a, 0, status); 2936 } 2937 2938 float16 int8_to_float16(int8_t a, float_status *status) 2939 { 2940 return int64_to_float16_scalbn(a, 0, status); 2941 } 2942 2943 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2944 { 2945 FloatParts64 pa = int_to_float(a, scale, status); 2946 return float32_round_pack_canonical(&pa, status); 2947 } 2948 2949 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2950 { 
2951 return int64_to_float32_scalbn(a, scale, status); 2952 } 2953 2954 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2955 { 2956 return int64_to_float32_scalbn(a, scale, status); 2957 } 2958 2959 float32 int64_to_float32(int64_t a, float_status *status) 2960 { 2961 return int64_to_float32_scalbn(a, 0, status); 2962 } 2963 2964 float32 int32_to_float32(int32_t a, float_status *status) 2965 { 2966 return int64_to_float32_scalbn(a, 0, status); 2967 } 2968 2969 float32 int16_to_float32(int16_t a, float_status *status) 2970 { 2971 return int64_to_float32_scalbn(a, 0, status); 2972 } 2973 2974 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2975 { 2976 FloatParts64 pa = int_to_float(a, scale, status); 2977 return float64_round_pack_canonical(&pa, status); 2978 } 2979 2980 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2981 { 2982 return int64_to_float64_scalbn(a, scale, status); 2983 } 2984 2985 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2986 { 2987 return int64_to_float64_scalbn(a, scale, status); 2988 } 2989 2990 float64 int64_to_float64(int64_t a, float_status *status) 2991 { 2992 return int64_to_float64_scalbn(a, 0, status); 2993 } 2994 2995 float64 int32_to_float64(int32_t a, float_status *status) 2996 { 2997 return int64_to_float64_scalbn(a, 0, status); 2998 } 2999 3000 float64 int16_to_float64(int16_t a, float_status *status) 3001 { 3002 return int64_to_float64_scalbn(a, 0, status); 3003 } 3004 3005 /* 3006 * Returns the result of converting the two's complement integer `a' 3007 * to the bfloat16 format. 
3008 */ 3009 3010 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) 3011 { 3012 FloatParts64 pa = int_to_float(a, scale, status); 3013 return bfloat16_round_pack_canonical(&pa, status); 3014 } 3015 3016 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status) 3017 { 3018 return int64_to_bfloat16_scalbn(a, scale, status); 3019 } 3020 3021 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status) 3022 { 3023 return int64_to_bfloat16_scalbn(a, scale, status); 3024 } 3025 3026 bfloat16 int64_to_bfloat16(int64_t a, float_status *status) 3027 { 3028 return int64_to_bfloat16_scalbn(a, 0, status); 3029 } 3030 3031 bfloat16 int32_to_bfloat16(int32_t a, float_status *status) 3032 { 3033 return int64_to_bfloat16_scalbn(a, 0, status); 3034 } 3035 3036 bfloat16 int16_to_bfloat16(int16_t a, float_status *status) 3037 { 3038 return int64_to_bfloat16_scalbn(a, 0, status); 3039 } 3040 3041 /* 3042 * Unsigned Integer to float conversions 3043 * 3044 * Returns the result of converting the unsigned integer `a' to the 3045 * floating-point format. The conversion is performed according to the 3046 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3047 */ 3048 3049 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status) 3050 { 3051 FloatParts64 r = { .sign = false }; 3052 int shift; 3053 3054 if (a == 0) { 3055 r.cls = float_class_zero; 3056 } else { 3057 scale = MIN(MAX(scale, -0x10000), 0x10000); 3058 shift = clz64(a); 3059 r.cls = float_class_normal; 3060 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3061 r.frac = a << shift; 3062 } 3063 3064 return r; 3065 } 3066 3067 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3068 { 3069 FloatParts64 pa = uint_to_float(a, scale, status); 3070 return float16_round_pack_canonical(&pa, status); 3071 } 3072 3073 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3074 { 3075 return uint64_to_float16_scalbn(a, scale, status); 3076 } 3077 3078 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3079 { 3080 return uint64_to_float16_scalbn(a, scale, status); 3081 } 3082 3083 float16 uint64_to_float16(uint64_t a, float_status *status) 3084 { 3085 return uint64_to_float16_scalbn(a, 0, status); 3086 } 3087 3088 float16 uint32_to_float16(uint32_t a, float_status *status) 3089 { 3090 return uint64_to_float16_scalbn(a, 0, status); 3091 } 3092 3093 float16 uint16_to_float16(uint16_t a, float_status *status) 3094 { 3095 return uint64_to_float16_scalbn(a, 0, status); 3096 } 3097 3098 float16 uint8_to_float16(uint8_t a, float_status *status) 3099 { 3100 return uint64_to_float16_scalbn(a, 0, status); 3101 } 3102 3103 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3104 { 3105 FloatParts64 pa = uint_to_float(a, scale, status); 3106 return float32_round_pack_canonical(&pa, status); 3107 } 3108 3109 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3110 { 3111 return uint64_to_float32_scalbn(a, scale, status); 3112 } 3113 3114 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3115 { 3116 return 
uint64_to_float32_scalbn(a, scale, status); 3117 } 3118 3119 float32 uint64_to_float32(uint64_t a, float_status *status) 3120 { 3121 return uint64_to_float32_scalbn(a, 0, status); 3122 } 3123 3124 float32 uint32_to_float32(uint32_t a, float_status *status) 3125 { 3126 return uint64_to_float32_scalbn(a, 0, status); 3127 } 3128 3129 float32 uint16_to_float32(uint16_t a, float_status *status) 3130 { 3131 return uint64_to_float32_scalbn(a, 0, status); 3132 } 3133 3134 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3135 { 3136 FloatParts64 pa = uint_to_float(a, scale, status); 3137 return float64_round_pack_canonical(&pa, status); 3138 } 3139 3140 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3141 { 3142 return uint64_to_float64_scalbn(a, scale, status); 3143 } 3144 3145 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3146 { 3147 return uint64_to_float64_scalbn(a, scale, status); 3148 } 3149 3150 float64 uint64_to_float64(uint64_t a, float_status *status) 3151 { 3152 return uint64_to_float64_scalbn(a, 0, status); 3153 } 3154 3155 float64 uint32_to_float64(uint32_t a, float_status *status) 3156 { 3157 return uint64_to_float64_scalbn(a, 0, status); 3158 } 3159 3160 float64 uint16_to_float64(uint16_t a, float_status *status) 3161 { 3162 return uint64_to_float64_scalbn(a, 0, status); 3163 } 3164 3165 /* 3166 * Returns the result of converting the unsigned integer `a' to the 3167 * bfloat16 format. 
3168 */ 3169 3170 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3171 { 3172 FloatParts64 pa = uint_to_float(a, scale, status); 3173 return bfloat16_round_pack_canonical(&pa, status); 3174 } 3175 3176 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3177 { 3178 return uint64_to_bfloat16_scalbn(a, scale, status); 3179 } 3180 3181 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3182 { 3183 return uint64_to_bfloat16_scalbn(a, scale, status); 3184 } 3185 3186 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3187 { 3188 return uint64_to_bfloat16_scalbn(a, 0, status); 3189 } 3190 3191 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3192 { 3193 return uint64_to_bfloat16_scalbn(a, 0, status); 3194 } 3195 3196 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3197 { 3198 return uint64_to_bfloat16_scalbn(a, 0, status); 3199 } 3200 3201 /* Float Min/Max */ 3202 /* min() and max() functions. These can't be implemented as 3203 * 'compare and pick one input' because that would mishandle 3204 * NaNs and +0 vs -0. 3205 * 3206 * minnum() and maxnum() functions. These are similar to the min() 3207 * and max() functions but if one of the arguments is a QNaN and 3208 * the other is numerical then the numerical argument is returned. 3209 * SNaNs will get quietened before being returned. 3210 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3211 * and maxNum() operations. min() and max() are the typical min/max 3212 * semantics provided by many CPUs which predate that specification. 3213 * 3214 * minnummag() and maxnummag() functions correspond to minNumMag() 3215 * and minNumMag() from the IEEE-754 2008. 
*/

/*
 * Shared implementation of the min/max family on canonical parts.
 *   ismin - true to select the smaller operand, false for the larger
 *   ieee  - true for the minnum/maxnum rule where a single quiet NaN
 *           operand yields the other operand
 *   ismag - true to compare magnitudes first (minnummag/maxnummag)
 * Returns a copy of the selected operand, or the NaN chosen by
 * parts_pick_nan().
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* Only `a' is a (quiet) NaN: the number wins. */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                /* Only `b' is a (quiet) NaN: the number wins. */
                return a;
            }
            /* Both operands are quiet NaNs: fall through. */
        }
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /*
         * Build an exponent usable for ordering magnitudes across
         * classes: zero sorts below every normal, infinity above.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /*
         * Magnitude variants: when the magnitudes differ the sign is
         * ignored.  Equal magnitudes fall through to the signed
         * comparison below.
         */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /*
             * Same sign: compare magnitudes, and let the sign bit
             * invert the ordering for negative operands.
             */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Opposite signs: the negative operand is the smaller. */
            return a.sign ^ ismin ? b : a;
        }
    }
}

/*
 * Defines float<sz>_<name>(): unpack both operands, select one with
 * minmax_floats() using the given flags, then round and pack it.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

/* Arguments are: size, name, ismin, isiee, ismag. */
MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

/* Same as MINMAX above, for the bfloat16 type and helper names. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX

/* Floating point compare
*/ 3342 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet, 3343 float_status *s) 3344 { 3345 if (is_nan(a.cls) || is_nan(b.cls)) { 3346 if (!is_quiet || 3347 a.cls == float_class_snan || 3348 b.cls == float_class_snan) { 3349 float_raise(float_flag_invalid, s); 3350 } 3351 return float_relation_unordered; 3352 } 3353 3354 if (a.cls == float_class_zero) { 3355 if (b.cls == float_class_zero) { 3356 return float_relation_equal; 3357 } 3358 return b.sign ? float_relation_greater : float_relation_less; 3359 } else if (b.cls == float_class_zero) { 3360 return a.sign ? float_relation_less : float_relation_greater; 3361 } 3362 3363 /* The only really important thing about infinity is its sign. If 3364 * both are infinities the sign marks the smallest of the two. 3365 */ 3366 if (a.cls == float_class_inf) { 3367 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 3368 return float_relation_equal; 3369 } 3370 return a.sign ? float_relation_less : float_relation_greater; 3371 } else if (b.cls == float_class_inf) { 3372 return b.sign ? float_relation_greater : float_relation_less; 3373 } 3374 3375 if (a.sign != b.sign) { 3376 return a.sign ? float_relation_less : float_relation_greater; 3377 } 3378 3379 if (a.exp == b.exp) { 3380 if (a.frac == b.frac) { 3381 return float_relation_equal; 3382 } 3383 if (a.sign) { 3384 return a.frac > b.frac ? 3385 float_relation_less : float_relation_greater; 3386 } else { 3387 return a.frac > b.frac ? 3388 float_relation_greater : float_relation_less; 3389 } 3390 } else { 3391 if (a.sign) { 3392 return a.exp > b.exp ? float_relation_less : float_relation_greater; 3393 } else { 3394 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 3395 } 3396 } 3397 } 3398 3399 #define COMPARE(name, attr, sz) \ 3400 static int attr \ 3401 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 3402 { \ 3403 FloatParts64 pa, pb; \ 3404 float ## sz ## _unpack_canonical(&pa, a, s); \ 3405 float ## sz ## _unpack_canonical(&pb, b, s); \ 3406 return compare_floats(pa, pb, is_quiet, s); \ 3407 } 3408 3409 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 3410 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 3411 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 3412 3413 #undef COMPARE 3414 3415 FloatRelation float16_compare(float16 a, float16 b, float_status *s) 3416 { 3417 return soft_f16_compare(a, b, false, s); 3418 } 3419 3420 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s) 3421 { 3422 return soft_f16_compare(a, b, true, s); 3423 } 3424 3425 static FloatRelation QEMU_FLATTEN 3426 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 3427 { 3428 union_float32 ua, ub; 3429 3430 ua.s = xa; 3431 ub.s = xb; 3432 3433 if (QEMU_NO_HARDFLOAT) { 3434 goto soft; 3435 } 3436 3437 float32_input_flush2(&ua.s, &ub.s, s); 3438 if (isgreaterequal(ua.h, ub.h)) { 3439 if (isgreater(ua.h, ub.h)) { 3440 return float_relation_greater; 3441 } 3442 return float_relation_equal; 3443 } 3444 if (likely(isless(ua.h, ub.h))) { 3445 return float_relation_less; 3446 } 3447 /* The only condition remaining is unordered. 3448 * Fall through to set flags. 
3449 */ 3450 soft: 3451 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 3452 } 3453 3454 FloatRelation float32_compare(float32 a, float32 b, float_status *s) 3455 { 3456 return f32_compare(a, b, false, s); 3457 } 3458 3459 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s) 3460 { 3461 return f32_compare(a, b, true, s); 3462 } 3463 3464 static FloatRelation QEMU_FLATTEN 3465 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 3466 { 3467 union_float64 ua, ub; 3468 3469 ua.s = xa; 3470 ub.s = xb; 3471 3472 if (QEMU_NO_HARDFLOAT) { 3473 goto soft; 3474 } 3475 3476 float64_input_flush2(&ua.s, &ub.s, s); 3477 if (isgreaterequal(ua.h, ub.h)) { 3478 if (isgreater(ua.h, ub.h)) { 3479 return float_relation_greater; 3480 } 3481 return float_relation_equal; 3482 } 3483 if (likely(isless(ua.h, ub.h))) { 3484 return float_relation_less; 3485 } 3486 /* The only condition remaining is unordered. 3487 * Fall through to set flags. 3488 */ 3489 soft: 3490 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3491 } 3492 3493 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3494 { 3495 return f64_compare(a, b, false, s); 3496 } 3497 3498 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3499 { 3500 return f64_compare(a, b, true, s); 3501 } 3502 3503 static FloatRelation QEMU_FLATTEN 3504 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s) 3505 { 3506 FloatParts64 pa, pb; 3507 3508 bfloat16_unpack_canonical(&pa, a, s); 3509 bfloat16_unpack_canonical(&pb, b, s); 3510 return compare_floats(pa, pb, is_quiet, s); 3511 } 3512 3513 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s) 3514 { 3515 return soft_bf16_compare(a, b, false, s); 3516 } 3517 3518 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s) 3519 { 3520 return soft_bf16_compare(a, b, true, s); 3521 } 3522 3523 /* Multiply A by 2 raised to the power N. 
*/ 3524 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3525 { 3526 if (unlikely(is_nan(a.cls))) { 3527 parts_return_nan(&a, s); 3528 } 3529 if (a.cls == float_class_normal) { 3530 /* The largest float type (even though not supported by FloatParts64) 3531 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3532 * still allows rounding to infinity, without allowing overflow 3533 * within the int32_t that backs FloatParts64.exp. 3534 */ 3535 n = MIN(MAX(n, -0x10000), 0x10000); 3536 a.exp += n; 3537 } 3538 return a; 3539 } 3540 3541 float16 float16_scalbn(float16 a, int n, float_status *status) 3542 { 3543 FloatParts64 pa, pr; 3544 3545 float16_unpack_canonical(&pa, a, status); 3546 pr = scalbn_decomposed(pa, n, status); 3547 return float16_round_pack_canonical(&pr, status); 3548 } 3549 3550 float32 float32_scalbn(float32 a, int n, float_status *status) 3551 { 3552 FloatParts64 pa, pr; 3553 3554 float32_unpack_canonical(&pa, a, status); 3555 pr = scalbn_decomposed(pa, n, status); 3556 return float32_round_pack_canonical(&pr, status); 3557 } 3558 3559 float64 float64_scalbn(float64 a, int n, float_status *status) 3560 { 3561 FloatParts64 pa, pr; 3562 3563 float64_unpack_canonical(&pa, a, status); 3564 pr = scalbn_decomposed(pa, n, status); 3565 return float64_round_pack_canonical(&pr, status); 3566 } 3567 3568 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3569 { 3570 FloatParts64 pa, pr; 3571 3572 bfloat16_unpack_canonical(&pa, a, status); 3573 pr = scalbn_decomposed(pa, n, status); 3574 return bfloat16_round_pack_canonical(&pr, status); 3575 } 3576 3577 /* 3578 * Square Root 3579 * 3580 * The old softfloat code did an approximation step before zeroing in 3581 * on the final result. However for simpleness we just compute the 3582 * square root by iterating down from the implicit bit to enough extra 3583 * bits to ensure we get a correctly rounded result. 
3584 * 3585 * This does mean however the calculation is slower than before, 3586 * especially for 64 bit floats. 3587 */ 3588 3589 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p) 3590 { 3591 uint64_t a_frac, r_frac, s_frac; 3592 int bit, last_bit; 3593 3594 if (is_nan(a.cls)) { 3595 parts_return_nan(&a, s); 3596 return a; 3597 } 3598 if (a.cls == float_class_zero) { 3599 return a; /* sqrt(+-0) = +-0 */ 3600 } 3601 if (a.sign) { 3602 float_raise(float_flag_invalid, s); 3603 parts_default_nan(&a, s); 3604 return a; 3605 } 3606 if (a.cls == float_class_inf) { 3607 return a; /* sqrt(+inf) = +inf */ 3608 } 3609 3610 assert(a.cls == float_class_normal); 3611 3612 /* We need two overflow bits at the top. Adding room for that is a 3613 * right shift. If the exponent is odd, we can discard the low bit 3614 * by multiplying the fraction by 2; that's a left shift. Combine 3615 * those and we shift right by 1 if the exponent is odd, otherwise 2. 3616 */ 3617 a_frac = a.frac >> (2 - (a.exp & 1)); 3618 a.exp >>= 1; 3619 3620 /* Bit-by-bit computation of sqrt. */ 3621 r_frac = 0; 3622 s_frac = 0; 3623 3624 /* Iterate from implicit bit down to the 3 extra bits to compute a 3625 * properly rounded result. Remember we've inserted two more bits 3626 * at the top, so these positions are two less. 3627 */ 3628 bit = DECOMPOSED_BINARY_POINT - 2; 3629 last_bit = MAX(p->frac_shift - 4, 0); 3630 do { 3631 uint64_t q = 1ULL << bit; 3632 uint64_t t_frac = s_frac + q; 3633 if (t_frac <= a_frac) { 3634 s_frac = t_frac + q; 3635 a_frac -= t_frac; 3636 r_frac += q; 3637 } 3638 a_frac <<= 1; 3639 } while (--bit >= last_bit); 3640 3641 /* Undo the right shift done above. If there is any remaining 3642 * fraction, the result is inexact. Set the sticky bit. 
3643 */ 3644 a.frac = (r_frac << 2) + (a_frac != 0); 3645 3646 return a; 3647 } 3648 3649 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3650 { 3651 FloatParts64 pa, pr; 3652 3653 float16_unpack_canonical(&pa, a, status); 3654 pr = sqrt_float(pa, status, &float16_params); 3655 return float16_round_pack_canonical(&pr, status); 3656 } 3657 3658 static float32 QEMU_SOFTFLOAT_ATTR 3659 soft_f32_sqrt(float32 a, float_status *status) 3660 { 3661 FloatParts64 pa, pr; 3662 3663 float32_unpack_canonical(&pa, a, status); 3664 pr = sqrt_float(pa, status, &float32_params); 3665 return float32_round_pack_canonical(&pr, status); 3666 } 3667 3668 static float64 QEMU_SOFTFLOAT_ATTR 3669 soft_f64_sqrt(float64 a, float_status *status) 3670 { 3671 FloatParts64 pa, pr; 3672 3673 float64_unpack_canonical(&pa, a, status); 3674 pr = sqrt_float(pa, status, &float64_params); 3675 return float64_round_pack_canonical(&pr, status); 3676 } 3677 3678 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3679 { 3680 union_float32 ua, ur; 3681 3682 ua.s = xa; 3683 if (unlikely(!can_use_fpu(s))) { 3684 goto soft; 3685 } 3686 3687 float32_input_flush1(&ua.s, s); 3688 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3689 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3690 fpclassify(ua.h) == FP_ZERO) || 3691 signbit(ua.h))) { 3692 goto soft; 3693 } 3694 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3695 float32_is_neg(ua.s))) { 3696 goto soft; 3697 } 3698 ur.h = sqrtf(ua.h); 3699 return ur.s; 3700 3701 soft: 3702 return soft_f32_sqrt(ua.s, s); 3703 } 3704 3705 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3706 { 3707 union_float64 ua, ur; 3708 3709 ua.s = xa; 3710 if (unlikely(!can_use_fpu(s))) { 3711 goto soft; 3712 } 3713 3714 float64_input_flush1(&ua.s, s); 3715 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3716 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3717 fpclassify(ua.h) == FP_ZERO) || 3718 signbit(ua.h))) { 3719 goto soft; 3720 } 3721 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3722 float64_is_neg(ua.s))) { 3723 goto soft; 3724 } 3725 ur.h = sqrt(ua.h); 3726 return ur.s; 3727 3728 soft: 3729 return soft_f64_sqrt(ua.s, s); 3730 } 3731 3732 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3733 { 3734 FloatParts64 pa, pr; 3735 3736 bfloat16_unpack_canonical(&pa, a, status); 3737 pr = sqrt_float(pa, status, &bfloat16_params); 3738 return bfloat16_round_pack_canonical(&pr, status); 3739 } 3740 3741 /*---------------------------------------------------------------------------- 3742 | The pattern for a default generated NaN. 3743 *----------------------------------------------------------------------------*/ 3744 3745 float16 float16_default_nan(float_status *status) 3746 { 3747 FloatParts64 p; 3748 3749 parts_default_nan(&p, status); 3750 p.frac >>= float16_params.frac_shift; 3751 return float16_pack_raw(&p); 3752 } 3753 3754 float32 float32_default_nan(float_status *status) 3755 { 3756 FloatParts64 p; 3757 3758 parts_default_nan(&p, status); 3759 p.frac >>= float32_params.frac_shift; 3760 return float32_pack_raw(&p); 3761 } 3762 3763 float64 float64_default_nan(float_status *status) 3764 { 3765 FloatParts64 p; 3766 3767 parts_default_nan(&p, status); 3768 p.frac >>= float64_params.frac_shift; 3769 return float64_pack_raw(&p); 3770 } 3771 3772 float128 float128_default_nan(float_status *status) 3773 { 3774 FloatParts128 p; 3775 3776 parts_default_nan(&p, status); 3777 frac_shr(&p, float128_params.frac_shift); 3778 return float128_pack_raw(&p); 3779 } 3780 3781 bfloat16 bfloat16_default_nan(float_status *status) 3782 { 3783 FloatParts64 p; 3784 3785 parts_default_nan(&p, status); 3786 p.frac >>= bfloat16_params.frac_shift; 3787 return bfloat16_pack_raw(&p); 3788 } 3789 3790 /*---------------------------------------------------------------------------- 3791 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3792 *----------------------------------------------------------------------------*/ 3793 3794 float16 float16_silence_nan(float16 a, float_status *status) 3795 { 3796 FloatParts64 p; 3797 3798 float16_unpack_raw(&p, a); 3799 p.frac <<= float16_params.frac_shift; 3800 parts_silence_nan(&p, status); 3801 p.frac >>= float16_params.frac_shift; 3802 return float16_pack_raw(&p); 3803 } 3804 3805 float32 float32_silence_nan(float32 a, float_status *status) 3806 { 3807 FloatParts64 p; 3808 3809 float32_unpack_raw(&p, a); 3810 p.frac <<= float32_params.frac_shift; 3811 parts_silence_nan(&p, status); 3812 p.frac >>= float32_params.frac_shift; 3813 return float32_pack_raw(&p); 3814 } 3815 3816 float64 float64_silence_nan(float64 a, float_status *status) 3817 { 3818 FloatParts64 p; 3819 3820 float64_unpack_raw(&p, a); 3821 p.frac <<= float64_params.frac_shift; 3822 parts_silence_nan(&p, status); 3823 p.frac >>= float64_params.frac_shift; 3824 return float64_pack_raw(&p); 3825 } 3826 3827 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3828 { 3829 FloatParts64 p; 3830 3831 bfloat16_unpack_raw(&p, a); 3832 p.frac <<= bfloat16_params.frac_shift; 3833 parts_silence_nan(&p, status); 3834 p.frac >>= bfloat16_params.frac_shift; 3835 return bfloat16_pack_raw(&p); 3836 } 3837 3838 float128 float128_silence_nan(float128 a, float_status *status) 3839 { 3840 FloatParts128 p; 3841 3842 float128_unpack_raw(&p, a); 3843 frac_shl(&p, float128_params.frac_shift); 3844 parts_silence_nan(&p, status); 3845 frac_shr(&p, float128_params.frac_shift); 3846 return float128_pack_raw(&p); 3847 } 3848 3849 /*---------------------------------------------------------------------------- 3850 | If `a' is denormal and we are in flush-to-zero mode then set the 3851 | input-denormal exception and return zero. Otherwise just return the value. 
3852 *----------------------------------------------------------------------------*/ 3853 3854 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3855 { 3856 if (p.exp == 0 && p.frac != 0) { 3857 float_raise(float_flag_input_denormal, status); 3858 return true; 3859 } 3860 3861 return false; 3862 } 3863 3864 float16 float16_squash_input_denormal(float16 a, float_status *status) 3865 { 3866 if (status->flush_inputs_to_zero) { 3867 FloatParts64 p; 3868 3869 float16_unpack_raw(&p, a); 3870 if (parts_squash_denormal(p, status)) { 3871 return float16_set_sign(float16_zero, p.sign); 3872 } 3873 } 3874 return a; 3875 } 3876 3877 float32 float32_squash_input_denormal(float32 a, float_status *status) 3878 { 3879 if (status->flush_inputs_to_zero) { 3880 FloatParts64 p; 3881 3882 float32_unpack_raw(&p, a); 3883 if (parts_squash_denormal(p, status)) { 3884 return float32_set_sign(float32_zero, p.sign); 3885 } 3886 } 3887 return a; 3888 } 3889 3890 float64 float64_squash_input_denormal(float64 a, float_status *status) 3891 { 3892 if (status->flush_inputs_to_zero) { 3893 FloatParts64 p; 3894 3895 float64_unpack_raw(&p, a); 3896 if (parts_squash_denormal(p, status)) { 3897 return float64_set_sign(float64_zero, p.sign); 3898 } 3899 } 3900 return a; 3901 } 3902 3903 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3904 { 3905 if (status->flush_inputs_to_zero) { 3906 FloatParts64 p; 3907 3908 bfloat16_unpack_raw(&p, a); 3909 if (parts_squash_denormal(p, status)) { 3910 return bfloat16_set_sign(bfloat16_zero, p.sign); 3911 } 3912 } 3913 return a; 3914 } 3915 3916 /*---------------------------------------------------------------------------- 3917 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3918 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3919 | input. If `zSign' is 1, the input is negated before being converted to an 3920 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input 3921 | is simply rounded to an integer, with the inexact exception raised if the 3922 | input cannot be represented exactly as an integer. However, if the fixed- 3923 | point input is too large, the invalid exception is raised and the largest 3924 | positive or negative integer is returned. 3925 *----------------------------------------------------------------------------*/ 3926 3927 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3928 float_status *status) 3929 { 3930 int8_t roundingMode; 3931 bool roundNearestEven; 3932 int8_t roundIncrement, roundBits; 3933 int32_t z; 3934 3935 roundingMode = status->float_rounding_mode; 3936 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3937 switch (roundingMode) { 3938 case float_round_nearest_even: 3939 case float_round_ties_away: 3940 roundIncrement = 0x40; 3941 break; 3942 case float_round_to_zero: 3943 roundIncrement = 0; 3944 break; 3945 case float_round_up: 3946 roundIncrement = zSign ? 0 : 0x7f; 3947 break; 3948 case float_round_down: 3949 roundIncrement = zSign ? 0x7f : 0; 3950 break; 3951 case float_round_to_odd: 3952 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3953 break; 3954 default: 3955 abort(); 3956 } 3957 roundBits = absZ & 0x7F; 3958 absZ = ( absZ + roundIncrement )>>7; 3959 if (!(roundBits ^ 0x40) && roundNearestEven) { 3960 absZ &= ~1; 3961 } 3962 z = absZ; 3963 if ( zSign ) z = - z; 3964 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3965 float_raise(float_flag_invalid, status); 3966 return zSign ? 
INT32_MIN : INT32_MAX; 3967 } 3968 if (roundBits) { 3969 float_raise(float_flag_inexact, status); 3970 } 3971 return z; 3972 3973 } 3974 3975 /*---------------------------------------------------------------------------- 3976 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3977 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3978 | and returns the properly rounded 64-bit integer corresponding to the input. 3979 | If `zSign' is 1, the input is negated before being converted to an integer. 3980 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3981 | the inexact exception raised if the input cannot be represented exactly as 3982 | an integer. However, if the fixed-point input is too large, the invalid 3983 | exception is raised and the largest positive or negative integer is 3984 | returned. 3985 *----------------------------------------------------------------------------*/ 3986 3987 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 3988 float_status *status) 3989 { 3990 int8_t roundingMode; 3991 bool roundNearestEven, increment; 3992 int64_t z; 3993 3994 roundingMode = status->float_rounding_mode; 3995 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3996 switch (roundingMode) { 3997 case float_round_nearest_even: 3998 case float_round_ties_away: 3999 increment = ((int64_t) absZ1 < 0); 4000 break; 4001 case float_round_to_zero: 4002 increment = 0; 4003 break; 4004 case float_round_up: 4005 increment = !zSign && absZ1; 4006 break; 4007 case float_round_down: 4008 increment = zSign && absZ1; 4009 break; 4010 case float_round_to_odd: 4011 increment = !(absZ0 & 1) && absZ1; 4012 break; 4013 default: 4014 abort(); 4015 } 4016 if ( increment ) { 4017 ++absZ0; 4018 if ( absZ0 == 0 ) goto overflow; 4019 if (!(absZ1 << 1) && roundNearestEven) { 4020 absZ0 &= ~1; 4021 } 4022 } 4023 z = absZ0; 4024 if ( zSign ) z = - z; 4025 if ( z && ( ( z < 0 ) ^ zSign 
) ) { 4026 overflow: 4027 float_raise(float_flag_invalid, status); 4028 return zSign ? INT64_MIN : INT64_MAX; 4029 } 4030 if (absZ1) { 4031 float_raise(float_flag_inexact, status); 4032 } 4033 return z; 4034 4035 } 4036 4037 /*---------------------------------------------------------------------------- 4038 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4039 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4040 | and returns the properly rounded 64-bit unsigned integer corresponding to the 4041 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 4042 | with the inexact exception raised if the input cannot be represented exactly 4043 | as an integer. However, if the fixed-point input is too large, the invalid 4044 | exception is raised and the largest unsigned integer is returned. 4045 *----------------------------------------------------------------------------*/ 4046 4047 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 4048 uint64_t absZ1, float_status *status) 4049 { 4050 int8_t roundingMode; 4051 bool roundNearestEven, increment; 4052 4053 roundingMode = status->float_rounding_mode; 4054 roundNearestEven = (roundingMode == float_round_nearest_even); 4055 switch (roundingMode) { 4056 case float_round_nearest_even: 4057 case float_round_ties_away: 4058 increment = ((int64_t)absZ1 < 0); 4059 break; 4060 case float_round_to_zero: 4061 increment = 0; 4062 break; 4063 case float_round_up: 4064 increment = !zSign && absZ1; 4065 break; 4066 case float_round_down: 4067 increment = zSign && absZ1; 4068 break; 4069 case float_round_to_odd: 4070 increment = !(absZ0 & 1) && absZ1; 4071 break; 4072 default: 4073 abort(); 4074 } 4075 if (increment) { 4076 ++absZ0; 4077 if (absZ0 == 0) { 4078 float_raise(float_flag_invalid, status); 4079 return UINT64_MAX; 4080 } 4081 if (!(absZ1 << 1) && roundNearestEven) { 4082 absZ0 &= ~1; 4083 } 4084 } 4085 4086 if (zSign && absZ0) { 4087 
float_raise(float_flag_invalid, status); 4088 return 0; 4089 } 4090 4091 if (absZ1) { 4092 float_raise(float_flag_inexact, status); 4093 } 4094 return absZ0; 4095 } 4096 4097 /*---------------------------------------------------------------------------- 4098 | Normalizes the subnormal single-precision floating-point value represented 4099 | by the denormalized significand `aSig'. The normalized exponent and 4100 | significand are stored at the locations pointed to by `zExpPtr' and 4101 | `zSigPtr', respectively. 4102 *----------------------------------------------------------------------------*/ 4103 4104 static void 4105 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4106 { 4107 int8_t shiftCount; 4108 4109 shiftCount = clz32(aSig) - 8; 4110 *zSigPtr = aSig<<shiftCount; 4111 *zExpPtr = 1 - shiftCount; 4112 4113 } 4114 4115 /*---------------------------------------------------------------------------- 4116 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4117 | and significand `zSig', and returns the proper single-precision floating- 4118 | point value corresponding to the abstract input. Ordinarily, the abstract 4119 | value is simply rounded and packed into the single-precision format, with 4120 | the inexact exception raised if the abstract input cannot be represented 4121 | exactly. However, if the abstract value is too large, the overflow and 4122 | inexact exceptions are raised and an infinity or maximal finite value is 4123 | returned. If the abstract value is too small, the input value is rounded to 4124 | a subnormal number, and the underflow and inexact exceptions are raised if 4125 | the abstract input cannot be represented exactly as a subnormal single- 4126 | precision floating-point number. 4127 | The input significand `zSig' has its binary point between bits 30 4128 | and 29, which is 7 bits to the left of the usual location. This shifted 4129 | significand must be normalized or smaller. 
If `zSig' is not normalized, 4130 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4131 | and it must not require rounding. In the usual case that `zSig' is 4132 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4133 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4134 | Binary Floating-Point Arithmetic. 4135 *----------------------------------------------------------------------------*/ 4136 4137 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4138 float_status *status) 4139 { 4140 int8_t roundingMode; 4141 bool roundNearestEven; 4142 int8_t roundIncrement, roundBits; 4143 bool isTiny; 4144 4145 roundingMode = status->float_rounding_mode; 4146 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4147 switch (roundingMode) { 4148 case float_round_nearest_even: 4149 case float_round_ties_away: 4150 roundIncrement = 0x40; 4151 break; 4152 case float_round_to_zero: 4153 roundIncrement = 0; 4154 break; 4155 case float_round_up: 4156 roundIncrement = zSign ? 0 : 0x7f; 4157 break; 4158 case float_round_down: 4159 roundIncrement = zSign ? 0x7f : 0; 4160 break; 4161 case float_round_to_odd: 4162 roundIncrement = zSig & 0x80 ? 
0 : 0x7f; 4163 break; 4164 default: 4165 abort(); 4166 break; 4167 } 4168 roundBits = zSig & 0x7F; 4169 if ( 0xFD <= (uint16_t) zExp ) { 4170 if ( ( 0xFD < zExp ) 4171 || ( ( zExp == 0xFD ) 4172 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 4173 ) { 4174 bool overflow_to_inf = roundingMode != float_round_to_odd && 4175 roundIncrement != 0; 4176 float_raise(float_flag_overflow | float_flag_inexact, status); 4177 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 4178 } 4179 if ( zExp < 0 ) { 4180 if (status->flush_to_zero) { 4181 float_raise(float_flag_output_denormal, status); 4182 return packFloat32(zSign, 0, 0); 4183 } 4184 isTiny = status->tininess_before_rounding 4185 || (zExp < -1) 4186 || (zSig + roundIncrement < 0x80000000); 4187 shift32RightJamming( zSig, - zExp, &zSig ); 4188 zExp = 0; 4189 roundBits = zSig & 0x7F; 4190 if (isTiny && roundBits) { 4191 float_raise(float_flag_underflow, status); 4192 } 4193 if (roundingMode == float_round_to_odd) { 4194 /* 4195 * For round-to-odd case, the roundIncrement depends on 4196 * zSig which just changed. 4197 */ 4198 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 4199 } 4200 } 4201 } 4202 if (roundBits) { 4203 float_raise(float_flag_inexact, status); 4204 } 4205 zSig = ( zSig + roundIncrement )>>7; 4206 if (!(roundBits ^ 0x40) && roundNearestEven) { 4207 zSig &= ~1; 4208 } 4209 if ( zSig == 0 ) zExp = 0; 4210 return packFloat32( zSign, zExp, zSig ); 4211 4212 } 4213 4214 /*---------------------------------------------------------------------------- 4215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4216 | and significand `zSig', and returns the proper single-precision floating- 4217 | point value corresponding to the abstract input. This routine is just like 4218 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 4219 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4220 | floating-point exponent. 
4221 *----------------------------------------------------------------------------*/ 4222 4223 static float32 4224 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4225 float_status *status) 4226 { 4227 int8_t shiftCount; 4228 4229 shiftCount = clz32(zSig) - 1; 4230 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4231 status); 4232 4233 } 4234 4235 /*---------------------------------------------------------------------------- 4236 | Normalizes the subnormal double-precision floating-point value represented 4237 | by the denormalized significand `aSig'. The normalized exponent and 4238 | significand are stored at the locations pointed to by `zExpPtr' and 4239 | `zSigPtr', respectively. 4240 *----------------------------------------------------------------------------*/ 4241 4242 static void 4243 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4244 { 4245 int8_t shiftCount; 4246 4247 shiftCount = clz64(aSig) - 11; 4248 *zSigPtr = aSig<<shiftCount; 4249 *zExpPtr = 1 - shiftCount; 4250 4251 } 4252 4253 /*---------------------------------------------------------------------------- 4254 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4255 | double-precision floating-point value, returning the result. After being 4256 | shifted into the proper positions, the three fields are simply added 4257 | together to form the result. This means that any integer portion of `zSig' 4258 | will be added into the exponent. Since a properly normalized significand 4259 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4260 | than the desired result exponent whenever `zSig' is a complete, normalized 4261 | significand. 
4262 *----------------------------------------------------------------------------*/ 4263 4264 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4265 { 4266 4267 return make_float64( 4268 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4269 4270 } 4271 4272 /*---------------------------------------------------------------------------- 4273 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4274 | and significand `zSig', and returns the proper double-precision floating- 4275 | point value corresponding to the abstract input. Ordinarily, the abstract 4276 | value is simply rounded and packed into the double-precision format, with 4277 | the inexact exception raised if the abstract input cannot be represented 4278 | exactly. However, if the abstract value is too large, the overflow and 4279 | inexact exceptions are raised and an infinity or maximal finite value is 4280 | returned. If the abstract value is too small, the input value is rounded to 4281 | a subnormal number, and the underflow and inexact exceptions are raised if 4282 | the abstract input cannot be represented exactly as a subnormal double- 4283 | precision floating-point number. 4284 | The input significand `zSig' has its binary point between bits 62 4285 | and 61, which is 10 bits to the left of the usual location. This shifted 4286 | significand must be normalized or smaller. If `zSig' is not normalized, 4287 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4288 | and it must not require rounding. In the usual case that `zSig' is 4289 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4290 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4291 | Binary Floating-Point Arithmetic. 
4292 *----------------------------------------------------------------------------*/ 4293 4294 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4295 float_status *status) 4296 { 4297 int8_t roundingMode; 4298 bool roundNearestEven; 4299 int roundIncrement, roundBits; 4300 bool isTiny; 4301 4302 roundingMode = status->float_rounding_mode; 4303 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4304 switch (roundingMode) { 4305 case float_round_nearest_even: 4306 case float_round_ties_away: 4307 roundIncrement = 0x200; 4308 break; 4309 case float_round_to_zero: 4310 roundIncrement = 0; 4311 break; 4312 case float_round_up: 4313 roundIncrement = zSign ? 0 : 0x3ff; 4314 break; 4315 case float_round_down: 4316 roundIncrement = zSign ? 0x3ff : 0; 4317 break; 4318 case float_round_to_odd: 4319 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4320 break; 4321 default: 4322 abort(); 4323 } 4324 roundBits = zSig & 0x3FF; 4325 if ( 0x7FD <= (uint16_t) zExp ) { 4326 if ( ( 0x7FD < zExp ) 4327 || ( ( zExp == 0x7FD ) 4328 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 4329 ) { 4330 bool overflow_to_inf = roundingMode != float_round_to_odd && 4331 roundIncrement != 0; 4332 float_raise(float_flag_overflow | float_flag_inexact, status); 4333 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 4334 } 4335 if ( zExp < 0 ) { 4336 if (status->flush_to_zero) { 4337 float_raise(float_flag_output_denormal, status); 4338 return packFloat64(zSign, 0, 0); 4339 } 4340 isTiny = status->tininess_before_rounding 4341 || (zExp < -1) 4342 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 4343 shift64RightJamming( zSig, - zExp, &zSig ); 4344 zExp = 0; 4345 roundBits = zSig & 0x3FF; 4346 if (isTiny && roundBits) { 4347 float_raise(float_flag_underflow, status); 4348 } 4349 if (roundingMode == float_round_to_odd) { 4350 /* 4351 * For round-to-odd case, the roundIncrement depends on 4352 * zSig which just changed. 
4353 */ 4354 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4355 } 4356 } 4357 } 4358 if (roundBits) { 4359 float_raise(float_flag_inexact, status); 4360 } 4361 zSig = ( zSig + roundIncrement )>>10; 4362 if (!(roundBits ^ 0x200) && roundNearestEven) { 4363 zSig &= ~1; 4364 } 4365 if ( zSig == 0 ) zExp = 0; 4366 return packFloat64( zSign, zExp, zSig ); 4367 4368 } 4369 4370 /*---------------------------------------------------------------------------- 4371 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4372 | and significand `zSig', and returns the proper double-precision floating- 4373 | point value corresponding to the abstract input. This routine is just like 4374 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 4375 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4376 | floating-point exponent. 4377 *----------------------------------------------------------------------------*/ 4378 4379 static float64 4380 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4381 float_status *status) 4382 { 4383 int8_t shiftCount; 4384 4385 shiftCount = clz64(zSig) - 1; 4386 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 4387 status); 4388 4389 } 4390 4391 /*---------------------------------------------------------------------------- 4392 | Normalizes the subnormal extended double-precision floating-point value 4393 | represented by the denormalized significand `aSig'. The normalized exponent 4394 | and significand are stored at the locations pointed to by `zExpPtr' and 4395 | `zSigPtr', respectively. 
4396 *----------------------------------------------------------------------------*/ 4397 4398 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4399 uint64_t *zSigPtr) 4400 { 4401 int8_t shiftCount; 4402 4403 shiftCount = clz64(aSig); 4404 *zSigPtr = aSig<<shiftCount; 4405 *zExpPtr = 1 - shiftCount; 4406 } 4407 4408 /*---------------------------------------------------------------------------- 4409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4410 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4411 | and returns the proper extended double-precision floating-point value 4412 | corresponding to the abstract input. Ordinarily, the abstract value is 4413 | rounded and packed into the extended double-precision format, with the 4414 | inexact exception raised if the abstract input cannot be represented 4415 | exactly. However, if the abstract value is too large, the overflow and 4416 | inexact exceptions are raised and an infinity or maximal finite value is 4417 | returned. If the abstract value is too small, the input value is rounded to 4418 | a subnormal number, and the underflow and inexact exceptions are raised if 4419 | the abstract input cannot be represented exactly as a subnormal extended 4420 | double-precision floating-point number. 4421 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4422 | number of bits as single or double precision, respectively. Otherwise, the 4423 | result is rounded to the full precision of the extended double-precision 4424 | format. 4425 | The input significand must be normalized or smaller. If the input 4426 | significand is not normalized, `zExp' must be 0; in that case, the result 4427 | returned is a subnormal number, and it must not require rounding. The 4428 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4429 | Floating-Point Arithmetic. 
4430 *----------------------------------------------------------------------------*/ 4431 4432 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 4433 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 4434 float_status *status) 4435 { 4436 int8_t roundingMode; 4437 bool roundNearestEven, increment, isTiny; 4438 int64_t roundIncrement, roundMask, roundBits; 4439 4440 roundingMode = status->float_rounding_mode; 4441 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4442 if ( roundingPrecision == 80 ) goto precision80; 4443 if ( roundingPrecision == 64 ) { 4444 roundIncrement = UINT64_C(0x0000000000000400); 4445 roundMask = UINT64_C(0x00000000000007FF); 4446 } 4447 else if ( roundingPrecision == 32 ) { 4448 roundIncrement = UINT64_C(0x0000008000000000); 4449 roundMask = UINT64_C(0x000000FFFFFFFFFF); 4450 } 4451 else { 4452 goto precision80; 4453 } 4454 zSig0 |= ( zSig1 != 0 ); 4455 switch (roundingMode) { 4456 case float_round_nearest_even: 4457 case float_round_ties_away: 4458 break; 4459 case float_round_to_zero: 4460 roundIncrement = 0; 4461 break; 4462 case float_round_up: 4463 roundIncrement = zSign ? 0 : roundMask; 4464 break; 4465 case float_round_down: 4466 roundIncrement = zSign ? 
roundMask : 0; 4467 break; 4468 default: 4469 abort(); 4470 } 4471 roundBits = zSig0 & roundMask; 4472 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4473 if ( ( 0x7FFE < zExp ) 4474 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 4475 ) { 4476 goto overflow; 4477 } 4478 if ( zExp <= 0 ) { 4479 if (status->flush_to_zero) { 4480 float_raise(float_flag_output_denormal, status); 4481 return packFloatx80(zSign, 0, 0); 4482 } 4483 isTiny = status->tininess_before_rounding 4484 || (zExp < 0 ) 4485 || (zSig0 <= zSig0 + roundIncrement); 4486 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 4487 zExp = 0; 4488 roundBits = zSig0 & roundMask; 4489 if (isTiny && roundBits) { 4490 float_raise(float_flag_underflow, status); 4491 } 4492 if (roundBits) { 4493 float_raise(float_flag_inexact, status); 4494 } 4495 zSig0 += roundIncrement; 4496 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4497 roundIncrement = roundMask + 1; 4498 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4499 roundMask |= roundIncrement; 4500 } 4501 zSig0 &= ~ roundMask; 4502 return packFloatx80( zSign, zExp, zSig0 ); 4503 } 4504 } 4505 if (roundBits) { 4506 float_raise(float_flag_inexact, status); 4507 } 4508 zSig0 += roundIncrement; 4509 if ( zSig0 < roundIncrement ) { 4510 ++zExp; 4511 zSig0 = UINT64_C(0x8000000000000000); 4512 } 4513 roundIncrement = roundMask + 1; 4514 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4515 roundMask |= roundIncrement; 4516 } 4517 zSig0 &= ~ roundMask; 4518 if ( zSig0 == 0 ) zExp = 0; 4519 return packFloatx80( zSign, zExp, zSig0 ); 4520 precision80: 4521 switch (roundingMode) { 4522 case float_round_nearest_even: 4523 case float_round_ties_away: 4524 increment = ((int64_t)zSig1 < 0); 4525 break; 4526 case float_round_to_zero: 4527 increment = 0; 4528 break; 4529 case float_round_up: 4530 increment = !zSign && zSig1; 4531 break; 4532 case float_round_down: 4533 increment = zSign && zSig1; 4534 break; 4535 default: 4536 abort(); 4537 } 4538 if 
( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4539 if ( ( 0x7FFE < zExp ) 4540 || ( ( zExp == 0x7FFE ) 4541 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 4542 && increment 4543 ) 4544 ) { 4545 roundMask = 0; 4546 overflow: 4547 float_raise(float_flag_overflow | float_flag_inexact, status); 4548 if ( ( roundingMode == float_round_to_zero ) 4549 || ( zSign && ( roundingMode == float_round_up ) ) 4550 || ( ! zSign && ( roundingMode == float_round_down ) ) 4551 ) { 4552 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 4553 } 4554 return packFloatx80(zSign, 4555 floatx80_infinity_high, 4556 floatx80_infinity_low); 4557 } 4558 if ( zExp <= 0 ) { 4559 isTiny = status->tininess_before_rounding 4560 || (zExp < 0) 4561 || !increment 4562 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 4563 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 4564 zExp = 0; 4565 if (isTiny && zSig1) { 4566 float_raise(float_flag_underflow, status); 4567 } 4568 if (zSig1) { 4569 float_raise(float_flag_inexact, status); 4570 } 4571 switch (roundingMode) { 4572 case float_round_nearest_even: 4573 case float_round_ties_away: 4574 increment = ((int64_t)zSig1 < 0); 4575 break; 4576 case float_round_to_zero: 4577 increment = 0; 4578 break; 4579 case float_round_up: 4580 increment = !zSign && zSig1; 4581 break; 4582 case float_round_down: 4583 increment = zSign && zSig1; 4584 break; 4585 default: 4586 abort(); 4587 } 4588 if ( increment ) { 4589 ++zSig0; 4590 if (!(zSig1 << 1) && roundNearestEven) { 4591 zSig0 &= ~1; 4592 } 4593 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4594 } 4595 return packFloatx80( zSign, zExp, zSig0 ); 4596 } 4597 } 4598 if (zSig1) { 4599 float_raise(float_flag_inexact, status); 4600 } 4601 if ( increment ) { 4602 ++zSig0; 4603 if ( zSig0 == 0 ) { 4604 ++zExp; 4605 zSig0 = UINT64_C(0x8000000000000000); 4606 } 4607 else { 4608 if (!(zSig1 << 1) && roundNearestEven) { 4609 zSig0 &= ~1; 4610 } 4611 } 4612 } 4613 else { 4614 if ( zSig0 == 0 ) zExp = 0; 4615 } 4616 return 
packFloatx80( zSign, zExp, zSig0 ); 4617 4618 } 4619 4620 /*---------------------------------------------------------------------------- 4621 | Takes an abstract floating-point value having sign `zSign', exponent 4622 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4623 | and returns the proper extended double-precision floating-point value 4624 | corresponding to the abstract input. This routine is just like 4625 | `roundAndPackFloatx80' except that the input significand does not have to be 4626 | normalized. 4627 *----------------------------------------------------------------------------*/ 4628 4629 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4630 bool zSign, int32_t zExp, 4631 uint64_t zSig0, uint64_t zSig1, 4632 float_status *status) 4633 { 4634 int8_t shiftCount; 4635 4636 if ( zSig0 == 0 ) { 4637 zSig0 = zSig1; 4638 zSig1 = 0; 4639 zExp -= 64; 4640 } 4641 shiftCount = clz64(zSig0); 4642 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4643 zExp -= shiftCount; 4644 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4645 zSig0, zSig1, status); 4646 4647 } 4648 4649 /*---------------------------------------------------------------------------- 4650 | Returns the least-significant 64 fraction bits of the quadruple-precision 4651 | floating-point value `a'. 4652 *----------------------------------------------------------------------------*/ 4653 4654 static inline uint64_t extractFloat128Frac1( float128 a ) 4655 { 4656 4657 return a.low; 4658 4659 } 4660 4661 /*---------------------------------------------------------------------------- 4662 | Returns the most-significant 48 fraction bits of the quadruple-precision 4663 | floating-point value `a'. 
4664 *----------------------------------------------------------------------------*/ 4665 4666 static inline uint64_t extractFloat128Frac0( float128 a ) 4667 { 4668 4669 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4670 4671 } 4672 4673 /*---------------------------------------------------------------------------- 4674 | Returns the exponent bits of the quadruple-precision floating-point value 4675 | `a'. 4676 *----------------------------------------------------------------------------*/ 4677 4678 static inline int32_t extractFloat128Exp( float128 a ) 4679 { 4680 4681 return ( a.high>>48 ) & 0x7FFF; 4682 4683 } 4684 4685 /*---------------------------------------------------------------------------- 4686 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4687 *----------------------------------------------------------------------------*/ 4688 4689 static inline bool extractFloat128Sign(float128 a) 4690 { 4691 return a.high >> 63; 4692 } 4693 4694 /*---------------------------------------------------------------------------- 4695 | Normalizes the subnormal quadruple-precision floating-point value 4696 | represented by the denormalized significand formed by the concatenation of 4697 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4698 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4699 | significand are stored at the location pointed to by `zSig0Ptr', and the 4700 | least significant 64 bits of the normalized significand are stored at the 4701 | location pointed to by `zSig1Ptr'. 
4702 *----------------------------------------------------------------------------*/ 4703 4704 static void 4705 normalizeFloat128Subnormal( 4706 uint64_t aSig0, 4707 uint64_t aSig1, 4708 int32_t *zExpPtr, 4709 uint64_t *zSig0Ptr, 4710 uint64_t *zSig1Ptr 4711 ) 4712 { 4713 int8_t shiftCount; 4714 4715 if ( aSig0 == 0 ) { 4716 shiftCount = clz64(aSig1) - 15; 4717 if ( shiftCount < 0 ) { 4718 *zSig0Ptr = aSig1>>( - shiftCount ); 4719 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4720 } 4721 else { 4722 *zSig0Ptr = aSig1<<shiftCount; 4723 *zSig1Ptr = 0; 4724 } 4725 *zExpPtr = - shiftCount - 63; 4726 } 4727 else { 4728 shiftCount = clz64(aSig0) - 15; 4729 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4730 *zExpPtr = 1 - shiftCount; 4731 } 4732 4733 } 4734 4735 /*---------------------------------------------------------------------------- 4736 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4737 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4738 | floating-point value, returning the result. After being shifted into the 4739 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4740 | added together to form the most significant 32 bits of the result. This 4741 | means that any integer portion of `zSig0' will be added into the exponent. 4742 | Since a properly normalized significand will have an integer portion equal 4743 | to 1, the `zExp' input should be 1 less than the desired result exponent 4744 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4745 | significand. 
4746 *----------------------------------------------------------------------------*/ 4747 4748 static inline float128 4749 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4750 { 4751 float128 z; 4752 4753 z.low = zSig1; 4754 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4755 return z; 4756 } 4757 4758 /*---------------------------------------------------------------------------- 4759 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4760 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4761 | and `zSig2', and returns the proper quadruple-precision floating-point value 4762 | corresponding to the abstract input. Ordinarily, the abstract value is 4763 | simply rounded and packed into the quadruple-precision format, with the 4764 | inexact exception raised if the abstract input cannot be represented 4765 | exactly. However, if the abstract value is too large, the overflow and 4766 | inexact exceptions are raised and an infinity or maximal finite value is 4767 | returned. If the abstract value is too small, the input value is rounded to 4768 | a subnormal number, and the underflow and inexact exceptions are raised if 4769 | the abstract input cannot be represented exactly as a subnormal quadruple- 4770 | precision floating-point number. 4771 | The input significand must be normalized or smaller. If the input 4772 | significand is not normalized, `zExp' must be 0; in that case, the result 4773 | returned is a subnormal number, and it must not require rounding. In the 4774 | usual case that the input significand is normalized, `zExp' must be 1 less 4775 | than the ``true'' floating-point exponent. The handling of underflow and 4776 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4777 *----------------------------------------------------------------------------*/ 4778 4779 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4780 uint64_t zSig0, uint64_t zSig1, 4781 uint64_t zSig2, float_status *status) 4782 { 4783 int8_t roundingMode; 4784 bool roundNearestEven, increment, isTiny; 4785 4786 roundingMode = status->float_rounding_mode; 4787 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4788 switch (roundingMode) { 4789 case float_round_nearest_even: 4790 case float_round_ties_away: 4791 increment = ((int64_t)zSig2 < 0); 4792 break; 4793 case float_round_to_zero: 4794 increment = 0; 4795 break; 4796 case float_round_up: 4797 increment = !zSign && zSig2; 4798 break; 4799 case float_round_down: 4800 increment = zSign && zSig2; 4801 break; 4802 case float_round_to_odd: 4803 increment = !(zSig1 & 0x1) && zSig2; 4804 break; 4805 default: 4806 abort(); 4807 } 4808 if ( 0x7FFD <= (uint32_t) zExp ) { 4809 if ( ( 0x7FFD < zExp ) 4810 || ( ( zExp == 0x7FFD ) 4811 && eq128( 4812 UINT64_C(0x0001FFFFFFFFFFFF), 4813 UINT64_C(0xFFFFFFFFFFFFFFFF), 4814 zSig0, 4815 zSig1 4816 ) 4817 && increment 4818 ) 4819 ) { 4820 float_raise(float_flag_overflow | float_flag_inexact, status); 4821 if ( ( roundingMode == float_round_to_zero ) 4822 || ( zSign && ( roundingMode == float_round_up ) ) 4823 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4824 || (roundingMode == float_round_to_odd) 4825 ) { 4826 return 4827 packFloat128( 4828 zSign, 4829 0x7FFE, 4830 UINT64_C(0x0000FFFFFFFFFFFF), 4831 UINT64_C(0xFFFFFFFFFFFFFFFF) 4832 ); 4833 } 4834 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4835 } 4836 if ( zExp < 0 ) { 4837 if (status->flush_to_zero) { 4838 float_raise(float_flag_output_denormal, status); 4839 return packFloat128(zSign, 0, 0, 0); 4840 } 4841 isTiny = status->tininess_before_rounding 4842 || (zExp < -1) 4843 || !increment 4844 || lt128(zSig0, zSig1, 4845 UINT64_C(0x0001FFFFFFFFFFFF), 4846 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4847 shift128ExtraRightJamming( 4848 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4849 zExp = 0; 4850 if (isTiny && zSig2) { 4851 float_raise(float_flag_underflow, status); 4852 } 4853 switch (roundingMode) { 4854 case float_round_nearest_even: 4855 case float_round_ties_away: 4856 increment = ((int64_t)zSig2 < 0); 4857 break; 4858 case float_round_to_zero: 4859 increment = 0; 4860 break; 4861 case float_round_up: 4862 increment = !zSign && zSig2; 4863 break; 4864 case float_round_down: 4865 increment = zSign && zSig2; 4866 break; 4867 case float_round_to_odd: 4868 increment = !(zSig1 & 0x1) && zSig2; 4869 break; 4870 default: 4871 abort(); 4872 } 4873 } 4874 } 4875 if (zSig2) { 4876 float_raise(float_flag_inexact, status); 4877 } 4878 if ( increment ) { 4879 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4880 if ((zSig2 + zSig2 == 0) && roundNearestEven) { 4881 zSig1 &= ~1; 4882 } 4883 } 4884 else { 4885 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4886 } 4887 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4888 4889 } 4890 4891 /*---------------------------------------------------------------------------- 4892 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4893 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4894 | returns the proper quadruple-precision floating-point 
value corresponding 4895 | to the abstract input. This routine is just like `roundAndPackFloat128' 4896 | except that the input significand has fewer bits and does not have to be 4897 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4898 | point exponent. 4899 *----------------------------------------------------------------------------*/ 4900 4901 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4902 uint64_t zSig0, uint64_t zSig1, 4903 float_status *status) 4904 { 4905 int8_t shiftCount; 4906 uint64_t zSig2; 4907 4908 if ( zSig0 == 0 ) { 4909 zSig0 = zSig1; 4910 zSig1 = 0; 4911 zExp -= 64; 4912 } 4913 shiftCount = clz64(zSig0) - 15; 4914 if ( 0 <= shiftCount ) { 4915 zSig2 = 0; 4916 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4917 } 4918 else { 4919 shift128ExtraRightJamming( 4920 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4921 } 4922 zExp -= shiftCount; 4923 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4924 4925 } 4926 4927 4928 /*---------------------------------------------------------------------------- 4929 | Returns the result of converting the 32-bit two's complement integer `a' 4930 | to the extended double-precision floating-point format. The conversion 4931 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4932 | Arithmetic. 4933 *----------------------------------------------------------------------------*/ 4934 4935 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4936 { 4937 bool zSign; 4938 uint32_t absA; 4939 int8_t shiftCount; 4940 uint64_t zSig; 4941 4942 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4943 zSign = ( a < 0 ); 4944 absA = zSign ? 
- a : a; 4945 shiftCount = clz32(absA) + 32; 4946 zSig = absA; 4947 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4948 4949 } 4950 4951 /*---------------------------------------------------------------------------- 4952 | Returns the result of converting the 32-bit two's complement integer `a' to 4953 | the quadruple-precision floating-point format. The conversion is performed 4954 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4955 *----------------------------------------------------------------------------*/ 4956 4957 float128 int32_to_float128(int32_t a, float_status *status) 4958 { 4959 bool zSign; 4960 uint32_t absA; 4961 int8_t shiftCount; 4962 uint64_t zSig0; 4963 4964 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4965 zSign = ( a < 0 ); 4966 absA = zSign ? - a : a; 4967 shiftCount = clz32(absA) + 17; 4968 zSig0 = absA; 4969 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4970 4971 } 4972 4973 /*---------------------------------------------------------------------------- 4974 | Returns the result of converting the 64-bit two's complement integer `a' 4975 | to the extended double-precision floating-point format. The conversion 4976 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4977 | Arithmetic. 4978 *----------------------------------------------------------------------------*/ 4979 4980 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4981 { 4982 bool zSign; 4983 uint64_t absA; 4984 int8_t shiftCount; 4985 4986 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4987 zSign = ( a < 0 ); 4988 absA = zSign ? 
- a : a; 4989 shiftCount = clz64(absA); 4990 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4991 4992 } 4993 4994 /*---------------------------------------------------------------------------- 4995 | Returns the result of converting the 64-bit two's complement integer `a' to 4996 | the quadruple-precision floating-point format. The conversion is performed 4997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4998 *----------------------------------------------------------------------------*/ 4999 5000 float128 int64_to_float128(int64_t a, float_status *status) 5001 { 5002 bool zSign; 5003 uint64_t absA; 5004 int8_t shiftCount; 5005 int32_t zExp; 5006 uint64_t zSig0, zSig1; 5007 5008 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5009 zSign = ( a < 0 ); 5010 absA = zSign ? - a : a; 5011 shiftCount = clz64(absA) + 49; 5012 zExp = 0x406E - shiftCount; 5013 if ( 64 <= shiftCount ) { 5014 zSig1 = 0; 5015 zSig0 = absA; 5016 shiftCount -= 64; 5017 } 5018 else { 5019 zSig1 = absA; 5020 zSig0 = 0; 5021 } 5022 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5023 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5024 5025 } 5026 5027 /*---------------------------------------------------------------------------- 5028 | Returns the result of converting the 64-bit unsigned integer `a' 5029 | to the quadruple-precision floating-point format. The conversion is performed 5030 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5031 *----------------------------------------------------------------------------*/ 5032 5033 float128 uint64_to_float128(uint64_t a, float_status *status) 5034 { 5035 if (a == 0) { 5036 return float128_zero; 5037 } 5038 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 5039 } 5040 5041 /*---------------------------------------------------------------------------- 5042 | Returns the result of converting the single-precision floating-point value 5043 | `a' to the extended double-precision floating-point format. The conversion 5044 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5045 | Arithmetic. 5046 *----------------------------------------------------------------------------*/ 5047 5048 floatx80 float32_to_floatx80(float32 a, float_status *status) 5049 { 5050 bool aSign; 5051 int aExp; 5052 uint32_t aSig; 5053 5054 a = float32_squash_input_denormal(a, status); 5055 aSig = extractFloat32Frac( a ); 5056 aExp = extractFloat32Exp( a ); 5057 aSign = extractFloat32Sign( a ); 5058 if ( aExp == 0xFF ) { 5059 if (aSig) { 5060 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 5061 status); 5062 return floatx80_silence_nan(res, status); 5063 } 5064 return packFloatx80(aSign, 5065 floatx80_infinity_high, 5066 floatx80_infinity_low); 5067 } 5068 if ( aExp == 0 ) { 5069 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5070 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5071 } 5072 aSig |= 0x00800000; 5073 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 5074 5075 } 5076 5077 /*---------------------------------------------------------------------------- 5078 | Returns the result of converting the single-precision floating-point value 5079 | `a' to the double-precision floating-point format. The conversion is 5080 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5081 | Arithmetic. 
5082 *----------------------------------------------------------------------------*/ 5083 5084 float128 float32_to_float128(float32 a, float_status *status) 5085 { 5086 bool aSign; 5087 int aExp; 5088 uint32_t aSig; 5089 5090 a = float32_squash_input_denormal(a, status); 5091 aSig = extractFloat32Frac( a ); 5092 aExp = extractFloat32Exp( a ); 5093 aSign = extractFloat32Sign( a ); 5094 if ( aExp == 0xFF ) { 5095 if (aSig) { 5096 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 5097 } 5098 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5099 } 5100 if ( aExp == 0 ) { 5101 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5102 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5103 --aExp; 5104 } 5105 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 5106 5107 } 5108 5109 /*---------------------------------------------------------------------------- 5110 | Returns the remainder of the single-precision floating-point value `a' 5111 | with respect to the corresponding value `b'. The operation is performed 5112 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5113 *----------------------------------------------------------------------------*/ 5114 5115 float32 float32_rem(float32 a, float32 b, float_status *status) 5116 { 5117 bool aSign, zSign; 5118 int aExp, bExp, expDiff; 5119 uint32_t aSig, bSig; 5120 uint32_t q; 5121 uint64_t aSig64, bSig64, q64; 5122 uint32_t alternateASig; 5123 int32_t sigMean; 5124 a = float32_squash_input_denormal(a, status); 5125 b = float32_squash_input_denormal(b, status); 5126 5127 aSig = extractFloat32Frac( a ); 5128 aExp = extractFloat32Exp( a ); 5129 aSign = extractFloat32Sign( a ); 5130 bSig = extractFloat32Frac( b ); 5131 bExp = extractFloat32Exp( b ); 5132 if ( aExp == 0xFF ) { 5133 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 5134 return propagateFloat32NaN(a, b, status); 5135 } 5136 float_raise(float_flag_invalid, status); 5137 return float32_default_nan(status); 5138 } 5139 if ( bExp == 0xFF ) { 5140 if (bSig) { 5141 return propagateFloat32NaN(a, b, status); 5142 } 5143 return a; 5144 } 5145 if ( bExp == 0 ) { 5146 if ( bSig == 0 ) { 5147 float_raise(float_flag_invalid, status); 5148 return float32_default_nan(status); 5149 } 5150 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 5151 } 5152 if ( aExp == 0 ) { 5153 if ( aSig == 0 ) return a; 5154 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5155 } 5156 expDiff = aExp - bExp; 5157 aSig |= 0x00800000; 5158 bSig |= 0x00800000; 5159 if ( expDiff < 32 ) { 5160 aSig <<= 8; 5161 bSig <<= 8; 5162 if ( expDiff < 0 ) { 5163 if ( expDiff < -1 ) return a; 5164 aSig >>= 1; 5165 } 5166 q = ( bSig <= aSig ); 5167 if ( q ) aSig -= bSig; 5168 if ( 0 < expDiff ) { 5169 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 5170 q >>= 32 - expDiff; 5171 bSig >>= 2; 5172 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5173 } 5174 else { 5175 aSig >>= 2; 5176 bSig >>= 2; 5177 } 5178 } 5179 else { 5180 if ( bSig <= aSig ) aSig -= bSig; 5181 aSig64 = ( (uint64_t) aSig )<<40; 5182 bSig64 = ( (uint64_t) bSig )<<40; 5183 expDiff -= 64; 5184 while ( 0 < expDiff ) { 
5185 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 5186 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 5187 aSig64 = - ( ( bSig * q64 )<<38 ); 5188 expDiff -= 62; 5189 } 5190 expDiff += 64; 5191 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 5192 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 5193 q = q64>>( 64 - expDiff ); 5194 bSig <<= 6; 5195 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 5196 } 5197 do { 5198 alternateASig = aSig; 5199 ++q; 5200 aSig -= bSig; 5201 } while ( 0 <= (int32_t) aSig ); 5202 sigMean = aSig + alternateASig; 5203 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5204 aSig = alternateASig; 5205 } 5206 zSign = ( (int32_t) aSig < 0 ); 5207 if ( zSign ) aSig = - aSig; 5208 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 5209 } 5210 5211 5212 5213 /*---------------------------------------------------------------------------- 5214 | Returns the binary exponential of the single-precision floating-point value 5215 | `a'. The operation is performed according to the IEC/IEEE Standard for 5216 | Binary Floating-Point Arithmetic. 5217 | 5218 | Uses the following identities: 5219 | 5220 | 1. ------------------------------------------------------------------------- 5221 | x x*ln(2) 5222 | 2 = e 5223 | 5224 | 2. ------------------------------------------------------------------------- 5225 | 2 3 4 5 n 5226 | x x x x x x x 5227 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 5228 | 1! 2! 3! 4! 5! n! 
5229 *----------------------------------------------------------------------------*/ 5230 5231 static const float64 float32_exp2_coefficients[15] = 5232 { 5233 const_float64( 0x3ff0000000000000ll ), /* 1 */ 5234 const_float64( 0x3fe0000000000000ll ), /* 2 */ 5235 const_float64( 0x3fc5555555555555ll ), /* 3 */ 5236 const_float64( 0x3fa5555555555555ll ), /* 4 */ 5237 const_float64( 0x3f81111111111111ll ), /* 5 */ 5238 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 5239 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 5240 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 5241 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 5242 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 5243 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 5244 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 5245 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 5246 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 5247 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 5248 }; 5249 5250 float32 float32_exp2(float32 a, float_status *status) 5251 { 5252 bool aSign; 5253 int aExp; 5254 uint32_t aSig; 5255 float64 r, x, xn; 5256 int i; 5257 a = float32_squash_input_denormal(a, status); 5258 5259 aSig = extractFloat32Frac( a ); 5260 aExp = extractFloat32Exp( a ); 5261 aSign = extractFloat32Sign( a ); 5262 5263 if ( aExp == 0xFF) { 5264 if (aSig) { 5265 return propagateFloat32NaN(a, float32_zero, status); 5266 } 5267 return (aSign) ? 
float32_zero : a; 5268 } 5269 if (aExp == 0) { 5270 if (aSig == 0) return float32_one; 5271 } 5272 5273 float_raise(float_flag_inexact, status); 5274 5275 /* ******************************* */ 5276 /* using float64 for approximation */ 5277 /* ******************************* */ 5278 x = float32_to_float64(a, status); 5279 x = float64_mul(x, float64_ln2, status); 5280 5281 xn = x; 5282 r = float64_one; 5283 for (i = 0 ; i < 15 ; i++) { 5284 float64 f; 5285 5286 f = float64_mul(xn, float32_exp2_coefficients[i], status); 5287 r = float64_add(r, f, status); 5288 5289 xn = float64_mul(xn, x, status); 5290 } 5291 5292 return float64_to_float32(r, status); 5293 } 5294 5295 /*---------------------------------------------------------------------------- 5296 | Returns the binary log of the single-precision floating-point value `a'. 5297 | The operation is performed according to the IEC/IEEE Standard for Binary 5298 | Floating-Point Arithmetic. 5299 *----------------------------------------------------------------------------*/ 5300 float32 float32_log2(float32 a, float_status *status) 5301 { 5302 bool aSign, zSign; 5303 int aExp; 5304 uint32_t aSig, zSig, i; 5305 5306 a = float32_squash_input_denormal(a, status); 5307 aSig = extractFloat32Frac( a ); 5308 aExp = extractFloat32Exp( a ); 5309 aSign = extractFloat32Sign( a ); 5310 5311 if ( aExp == 0 ) { 5312 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 5313 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5314 } 5315 if ( aSign ) { 5316 float_raise(float_flag_invalid, status); 5317 return float32_default_nan(status); 5318 } 5319 if ( aExp == 0xFF ) { 5320 if (aSig) { 5321 return propagateFloat32NaN(a, float32_zero, status); 5322 } 5323 return a; 5324 } 5325 5326 aExp -= 0x7F; 5327 aSig |= 0x00800000; 5328 zSign = aExp < 0; 5329 zSig = aExp << 23; 5330 5331 for (i = 1 << 22; i > 0; i >>= 1) { 5332 aSig = ( (uint64_t)aSig * aSig ) >> 23; 5333 if ( aSig & 0x01000000 ) { 5334 aSig >>= 1; 5335 zSig |= i; 5336 } 5337 } 5338 
5339 if ( zSign ) 5340 zSig = -zSig; 5341 5342 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 5343 } 5344 5345 /*---------------------------------------------------------------------------- 5346 | Returns the result of converting the double-precision floating-point value 5347 | `a' to the extended double-precision floating-point format. The conversion 5348 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5349 | Arithmetic. 5350 *----------------------------------------------------------------------------*/ 5351 5352 floatx80 float64_to_floatx80(float64 a, float_status *status) 5353 { 5354 bool aSign; 5355 int aExp; 5356 uint64_t aSig; 5357 5358 a = float64_squash_input_denormal(a, status); 5359 aSig = extractFloat64Frac( a ); 5360 aExp = extractFloat64Exp( a ); 5361 aSign = extractFloat64Sign( a ); 5362 if ( aExp == 0x7FF ) { 5363 if (aSig) { 5364 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 5365 status); 5366 return floatx80_silence_nan(res, status); 5367 } 5368 return packFloatx80(aSign, 5369 floatx80_infinity_high, 5370 floatx80_infinity_low); 5371 } 5372 if ( aExp == 0 ) { 5373 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5374 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5375 } 5376 return 5377 packFloatx80( 5378 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5379 5380 } 5381 5382 /*---------------------------------------------------------------------------- 5383 | Returns the result of converting the double-precision floating-point value 5384 | `a' to the quadruple-precision floating-point format. The conversion is 5385 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5386 | Arithmetic. 
5387 *----------------------------------------------------------------------------*/ 5388 5389 float128 float64_to_float128(float64 a, float_status *status) 5390 { 5391 bool aSign; 5392 int aExp; 5393 uint64_t aSig, zSig0, zSig1; 5394 5395 a = float64_squash_input_denormal(a, status); 5396 aSig = extractFloat64Frac( a ); 5397 aExp = extractFloat64Exp( a ); 5398 aSign = extractFloat64Sign( a ); 5399 if ( aExp == 0x7FF ) { 5400 if (aSig) { 5401 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5402 } 5403 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5404 } 5405 if ( aExp == 0 ) { 5406 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5407 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5408 --aExp; 5409 } 5410 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5411 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5412 5413 } 5414 5415 5416 /*---------------------------------------------------------------------------- 5417 | Returns the remainder of the double-precision floating-point value `a' 5418 | with respect to the corresponding value `b'. The operation is performed 5419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5420 *----------------------------------------------------------------------------*/ 5421 5422 float64 float64_rem(float64 a, float64 b, float_status *status) 5423 { 5424 bool aSign, zSign; 5425 int aExp, bExp, expDiff; 5426 uint64_t aSig, bSig; 5427 uint64_t q, alternateASig; 5428 int64_t sigMean; 5429 5430 a = float64_squash_input_denormal(a, status); 5431 b = float64_squash_input_denormal(b, status); 5432 aSig = extractFloat64Frac( a ); 5433 aExp = extractFloat64Exp( a ); 5434 aSign = extractFloat64Sign( a ); 5435 bSig = extractFloat64Frac( b ); 5436 bExp = extractFloat64Exp( b ); 5437 if ( aExp == 0x7FF ) { 5438 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5439 return propagateFloat64NaN(a, b, status); 5440 } 5441 float_raise(float_flag_invalid, status); 5442 return float64_default_nan(status); 5443 } 5444 if ( bExp == 0x7FF ) { 5445 if (bSig) { 5446 return propagateFloat64NaN(a, b, status); 5447 } 5448 return a; 5449 } 5450 if ( bExp == 0 ) { 5451 if ( bSig == 0 ) { 5452 float_raise(float_flag_invalid, status); 5453 return float64_default_nan(status); 5454 } 5455 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5456 } 5457 if ( aExp == 0 ) { 5458 if ( aSig == 0 ) return a; 5459 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5460 } 5461 expDiff = aExp - bExp; 5462 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5463 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5464 if ( expDiff < 0 ) { 5465 if ( expDiff < -1 ) return a; 5466 aSig >>= 1; 5467 } 5468 q = ( bSig <= aSig ); 5469 if ( q ) aSig -= bSig; 5470 expDiff -= 64; 5471 while ( 0 < expDiff ) { 5472 q = estimateDiv128To64( aSig, 0, bSig ); 5473 q = ( 2 < q ) ? q - 2 : 0; 5474 aSig = - ( ( bSig>>2 ) * q ); 5475 expDiff -= 62; 5476 } 5477 expDiff += 64; 5478 if ( 0 < expDiff ) { 5479 q = estimateDiv128To64( aSig, 0, bSig ); 5480 q = ( 2 < q ) ? 
q - 2 : 0; 5481 q >>= 64 - expDiff; 5482 bSig >>= 2; 5483 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5484 } 5485 else { 5486 aSig >>= 2; 5487 bSig >>= 2; 5488 } 5489 do { 5490 alternateASig = aSig; 5491 ++q; 5492 aSig -= bSig; 5493 } while ( 0 <= (int64_t) aSig ); 5494 sigMean = aSig + alternateASig; 5495 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5496 aSig = alternateASig; 5497 } 5498 zSign = ( (int64_t) aSig < 0 ); 5499 if ( zSign ) aSig = - aSig; 5500 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5501 5502 } 5503 5504 /*---------------------------------------------------------------------------- 5505 | Returns the binary log of the double-precision floating-point value `a'. 5506 | The operation is performed according to the IEC/IEEE Standard for Binary 5507 | Floating-Point Arithmetic. 5508 *----------------------------------------------------------------------------*/ 5509 float64 float64_log2(float64 a, float_status *status) 5510 { 5511 bool aSign, zSign; 5512 int aExp; 5513 uint64_t aSig, aSig0, aSig1, zSig, i; 5514 a = float64_squash_input_denormal(a, status); 5515 5516 aSig = extractFloat64Frac( a ); 5517 aExp = extractFloat64Exp( a ); 5518 aSign = extractFloat64Sign( a ); 5519 5520 if ( aExp == 0 ) { 5521 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5522 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5523 } 5524 if ( aSign ) { 5525 float_raise(float_flag_invalid, status); 5526 return float64_default_nan(status); 5527 } 5528 if ( aExp == 0x7FF ) { 5529 if (aSig) { 5530 return propagateFloat64NaN(a, float64_zero, status); 5531 } 5532 return a; 5533 } 5534 5535 aExp -= 0x3FF; 5536 aSig |= UINT64_C(0x0010000000000000); 5537 zSign = aExp < 0; 5538 zSig = (uint64_t)aExp << 52; 5539 for (i = 1LL << 51; i > 0; i >>= 1) { 5540 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5541 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5542 if ( aSig & UINT64_C(0x0020000000000000) ) { 5543 aSig >>= 1; 5544 zSig |= i; 5545 } 
5546 } 5547 5548 if ( zSign ) 5549 zSig = -zSig; 5550 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5551 } 5552 5553 /*---------------------------------------------------------------------------- 5554 | Returns the result of converting the extended double-precision floating- 5555 | point value `a' to the 32-bit two's complement integer format. The 5556 | conversion is performed according to the IEC/IEEE Standard for Binary 5557 | Floating-Point Arithmetic---which means in particular that the conversion 5558 | is rounded according to the current rounding mode. If `a' is a NaN, the 5559 | largest positive integer is returned. Otherwise, if the conversion 5560 | overflows, the largest integer with the same sign as `a' is returned. 5561 *----------------------------------------------------------------------------*/ 5562 5563 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5564 { 5565 bool aSign; 5566 int32_t aExp, shiftCount; 5567 uint64_t aSig; 5568 5569 if (floatx80_invalid_encoding(a)) { 5570 float_raise(float_flag_invalid, status); 5571 return 1 << 31; 5572 } 5573 aSig = extractFloatx80Frac( a ); 5574 aExp = extractFloatx80Exp( a ); 5575 aSign = extractFloatx80Sign( a ); 5576 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5577 shiftCount = 0x4037 - aExp; 5578 if ( shiftCount <= 0 ) shiftCount = 1; 5579 shift64RightJamming( aSig, shiftCount, &aSig ); 5580 return roundAndPackInt32(aSign, aSig, status); 5581 5582 } 5583 5584 /*---------------------------------------------------------------------------- 5585 | Returns the result of converting the extended double-precision floating- 5586 | point value `a' to the 32-bit two's complement integer format. The 5587 | conversion is performed according to the IEC/IEEE Standard for Binary 5588 | Floating-Point Arithmetic, except that the conversion is always rounded 5589 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5590 | Otherwise, if the conversion overflows, the largest integer with the same 5591 | sign as `a' is returned. 5592 *----------------------------------------------------------------------------*/ 5593 5594 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5595 { 5596 bool aSign; 5597 int32_t aExp, shiftCount; 5598 uint64_t aSig, savedASig; 5599 int32_t z; 5600 5601 if (floatx80_invalid_encoding(a)) { 5602 float_raise(float_flag_invalid, status); 5603 return 1 << 31; 5604 } 5605 aSig = extractFloatx80Frac( a ); 5606 aExp = extractFloatx80Exp( a ); 5607 aSign = extractFloatx80Sign( a ); 5608 if ( 0x401E < aExp ) { 5609 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5610 goto invalid; 5611 } 5612 else if ( aExp < 0x3FFF ) { 5613 if (aExp || aSig) { 5614 float_raise(float_flag_inexact, status); 5615 } 5616 return 0; 5617 } 5618 shiftCount = 0x403E - aExp; 5619 savedASig = aSig; 5620 aSig >>= shiftCount; 5621 z = aSig; 5622 if ( aSign ) z = - z; 5623 if ( ( z < 0 ) ^ aSign ) { 5624 invalid: 5625 float_raise(float_flag_invalid, status); 5626 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5627 } 5628 if ( ( aSig<<shiftCount ) != savedASig ) { 5629 float_raise(float_flag_inexact, status); 5630 } 5631 return z; 5632 5633 } 5634 5635 /*---------------------------------------------------------------------------- 5636 | Returns the result of converting the extended double-precision floating- 5637 | point value `a' to the 64-bit two's complement integer format. The 5638 | conversion is performed according to the IEC/IEEE Standard for Binary 5639 | Floating-Point Arithmetic---which means in particular that the conversion 5640 | is rounded according to the current rounding mode. If `a' is a NaN, 5641 | the largest positive integer is returned. Otherwise, if the conversion 5642 | overflows, the largest integer with the same sign as `a' is returned. 
5643 *----------------------------------------------------------------------------*/ 5644 5645 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5646 { 5647 bool aSign; 5648 int32_t aExp, shiftCount; 5649 uint64_t aSig, aSigExtra; 5650 5651 if (floatx80_invalid_encoding(a)) { 5652 float_raise(float_flag_invalid, status); 5653 return 1ULL << 63; 5654 } 5655 aSig = extractFloatx80Frac( a ); 5656 aExp = extractFloatx80Exp( a ); 5657 aSign = extractFloatx80Sign( a ); 5658 shiftCount = 0x403E - aExp; 5659 if ( shiftCount <= 0 ) { 5660 if ( shiftCount ) { 5661 float_raise(float_flag_invalid, status); 5662 if (!aSign || floatx80_is_any_nan(a)) { 5663 return INT64_MAX; 5664 } 5665 return INT64_MIN; 5666 } 5667 aSigExtra = 0; 5668 } 5669 else { 5670 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5671 } 5672 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5673 5674 } 5675 5676 /*---------------------------------------------------------------------------- 5677 | Returns the result of converting the extended double-precision floating- 5678 | point value `a' to the 64-bit two's complement integer format. The 5679 | conversion is performed according to the IEC/IEEE Standard for Binary 5680 | Floating-Point Arithmetic, except that the conversion is always rounded 5681 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5682 | Otherwise, if the conversion overflows, the largest integer with the same 5683 | sign as `a' is returned. 
5684 *----------------------------------------------------------------------------*/ 5685 5686 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5687 { 5688 bool aSign; 5689 int32_t aExp, shiftCount; 5690 uint64_t aSig; 5691 int64_t z; 5692 5693 if (floatx80_invalid_encoding(a)) { 5694 float_raise(float_flag_invalid, status); 5695 return 1ULL << 63; 5696 } 5697 aSig = extractFloatx80Frac( a ); 5698 aExp = extractFloatx80Exp( a ); 5699 aSign = extractFloatx80Sign( a ); 5700 shiftCount = aExp - 0x403E; 5701 if ( 0 <= shiftCount ) { 5702 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5703 if ( ( a.high != 0xC03E ) || aSig ) { 5704 float_raise(float_flag_invalid, status); 5705 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5706 return INT64_MAX; 5707 } 5708 } 5709 return INT64_MIN; 5710 } 5711 else if ( aExp < 0x3FFF ) { 5712 if (aExp | aSig) { 5713 float_raise(float_flag_inexact, status); 5714 } 5715 return 0; 5716 } 5717 z = aSig>>( - shiftCount ); 5718 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5719 float_raise(float_flag_inexact, status); 5720 } 5721 if ( aSign ) z = - z; 5722 return z; 5723 5724 } 5725 5726 /*---------------------------------------------------------------------------- 5727 | Returns the result of converting the extended double-precision floating- 5728 | point value `a' to the single-precision floating-point format. The 5729 | conversion is performed according to the IEC/IEEE Standard for Binary 5730 | Floating-Point Arithmetic. 
5731 *----------------------------------------------------------------------------*/ 5732 5733 float32 floatx80_to_float32(floatx80 a, float_status *status) 5734 { 5735 bool aSign; 5736 int32_t aExp; 5737 uint64_t aSig; 5738 5739 if (floatx80_invalid_encoding(a)) { 5740 float_raise(float_flag_invalid, status); 5741 return float32_default_nan(status); 5742 } 5743 aSig = extractFloatx80Frac( a ); 5744 aExp = extractFloatx80Exp( a ); 5745 aSign = extractFloatx80Sign( a ); 5746 if ( aExp == 0x7FFF ) { 5747 if ( (uint64_t) ( aSig<<1 ) ) { 5748 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5749 status); 5750 return float32_silence_nan(res, status); 5751 } 5752 return packFloat32( aSign, 0xFF, 0 ); 5753 } 5754 shift64RightJamming( aSig, 33, &aSig ); 5755 if ( aExp || aSig ) aExp -= 0x3F81; 5756 return roundAndPackFloat32(aSign, aExp, aSig, status); 5757 5758 } 5759 5760 /*---------------------------------------------------------------------------- 5761 | Returns the result of converting the extended double-precision floating- 5762 | point value `a' to the double-precision floating-point format. The 5763 | conversion is performed according to the IEC/IEEE Standard for Binary 5764 | Floating-Point Arithmetic. 
5765 *----------------------------------------------------------------------------*/ 5766 5767 float64 floatx80_to_float64(floatx80 a, float_status *status) 5768 { 5769 bool aSign; 5770 int32_t aExp; 5771 uint64_t aSig, zSig; 5772 5773 if (floatx80_invalid_encoding(a)) { 5774 float_raise(float_flag_invalid, status); 5775 return float64_default_nan(status); 5776 } 5777 aSig = extractFloatx80Frac( a ); 5778 aExp = extractFloatx80Exp( a ); 5779 aSign = extractFloatx80Sign( a ); 5780 if ( aExp == 0x7FFF ) { 5781 if ( (uint64_t) ( aSig<<1 ) ) { 5782 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5783 status); 5784 return float64_silence_nan(res, status); 5785 } 5786 return packFloat64( aSign, 0x7FF, 0 ); 5787 } 5788 shift64RightJamming( aSig, 1, &zSig ); 5789 if ( aExp || aSig ) aExp -= 0x3C01; 5790 return roundAndPackFloat64(aSign, aExp, zSig, status); 5791 5792 } 5793 5794 /*---------------------------------------------------------------------------- 5795 | Returns the result of converting the extended double-precision floating- 5796 | point value `a' to the quadruple-precision floating-point format. The 5797 | conversion is performed according to the IEC/IEEE Standard for Binary 5798 | Floating-Point Arithmetic. 
5799 *----------------------------------------------------------------------------*/ 5800 5801 float128 floatx80_to_float128(floatx80 a, float_status *status) 5802 { 5803 bool aSign; 5804 int aExp; 5805 uint64_t aSig, zSig0, zSig1; 5806 5807 if (floatx80_invalid_encoding(a)) { 5808 float_raise(float_flag_invalid, status); 5809 return float128_default_nan(status); 5810 } 5811 aSig = extractFloatx80Frac( a ); 5812 aExp = extractFloatx80Exp( a ); 5813 aSign = extractFloatx80Sign( a ); 5814 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5815 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5816 status); 5817 return float128_silence_nan(res, status); 5818 } 5819 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5820 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5821 5822 } 5823 5824 /*---------------------------------------------------------------------------- 5825 | Rounds the extended double-precision floating-point value `a' 5826 | to the precision provided by floatx80_rounding_precision and returns the 5827 | result as an extended double-precision floating-point value. 5828 | The operation is performed according to the IEC/IEEE Standard for Binary 5829 | Floating-Point Arithmetic. 5830 *----------------------------------------------------------------------------*/ 5831 5832 floatx80 floatx80_round(floatx80 a, float_status *status) 5833 { 5834 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5835 extractFloatx80Sign(a), 5836 extractFloatx80Exp(a), 5837 extractFloatx80Frac(a), 0, status); 5838 } 5839 5840 /*---------------------------------------------------------------------------- 5841 | Rounds the extended double-precision floating-point value `a' to an integer, 5842 | and returns the result as an extended quadruple-precision floating-point 5843 | value. The operation is performed according to the IEC/IEEE Standard for 5844 | Binary Floating-Point Arithmetic. 
5845 *----------------------------------------------------------------------------*/ 5846 5847 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5848 { 5849 bool aSign; 5850 int32_t aExp; 5851 uint64_t lastBitMask, roundBitsMask; 5852 floatx80 z; 5853 5854 if (floatx80_invalid_encoding(a)) { 5855 float_raise(float_flag_invalid, status); 5856 return floatx80_default_nan(status); 5857 } 5858 aExp = extractFloatx80Exp( a ); 5859 if ( 0x403E <= aExp ) { 5860 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5861 return propagateFloatx80NaN(a, a, status); 5862 } 5863 return a; 5864 } 5865 if ( aExp < 0x3FFF ) { 5866 if ( ( aExp == 0 ) 5867 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5868 return a; 5869 } 5870 float_raise(float_flag_inexact, status); 5871 aSign = extractFloatx80Sign( a ); 5872 switch (status->float_rounding_mode) { 5873 case float_round_nearest_even: 5874 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5875 ) { 5876 return 5877 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5878 } 5879 break; 5880 case float_round_ties_away: 5881 if (aExp == 0x3FFE) { 5882 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5883 } 5884 break; 5885 case float_round_down: 5886 return 5887 aSign ? 5888 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5889 : packFloatx80( 0, 0, 0 ); 5890 case float_round_up: 5891 return 5892 aSign ? 
packFloatx80( 1, 0, 0 ) 5893 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5894 5895 case float_round_to_zero: 5896 break; 5897 default: 5898 g_assert_not_reached(); 5899 } 5900 return packFloatx80( aSign, 0, 0 ); 5901 } 5902 lastBitMask = 1; 5903 lastBitMask <<= 0x403E - aExp; 5904 roundBitsMask = lastBitMask - 1; 5905 z = a; 5906 switch (status->float_rounding_mode) { 5907 case float_round_nearest_even: 5908 z.low += lastBitMask>>1; 5909 if ((z.low & roundBitsMask) == 0) { 5910 z.low &= ~lastBitMask; 5911 } 5912 break; 5913 case float_round_ties_away: 5914 z.low += lastBitMask >> 1; 5915 break; 5916 case float_round_to_zero: 5917 break; 5918 case float_round_up: 5919 if (!extractFloatx80Sign(z)) { 5920 z.low += roundBitsMask; 5921 } 5922 break; 5923 case float_round_down: 5924 if (extractFloatx80Sign(z)) { 5925 z.low += roundBitsMask; 5926 } 5927 break; 5928 default: 5929 abort(); 5930 } 5931 z.low &= ~ roundBitsMask; 5932 if ( z.low == 0 ) { 5933 ++z.high; 5934 z.low = UINT64_C(0x8000000000000000); 5935 } 5936 if (z.low != a.low) { 5937 float_raise(float_flag_inexact, status); 5938 } 5939 return z; 5940 5941 } 5942 5943 /*---------------------------------------------------------------------------- 5944 | Returns the result of adding the absolute values of the extended double- 5945 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5946 | negated before being returned. `zSign' is ignored if the result is a NaN. 5947 | The addition is performed according to the IEC/IEEE Standard for Binary 5948 | Floating-Point Arithmetic. 
5949 *----------------------------------------------------------------------------*/ 5950 5951 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5952 float_status *status) 5953 { 5954 int32_t aExp, bExp, zExp; 5955 uint64_t aSig, bSig, zSig0, zSig1; 5956 int32_t expDiff; 5957 5958 aSig = extractFloatx80Frac( a ); 5959 aExp = extractFloatx80Exp( a ); 5960 bSig = extractFloatx80Frac( b ); 5961 bExp = extractFloatx80Exp( b ); 5962 expDiff = aExp - bExp; 5963 if ( 0 < expDiff ) { 5964 if ( aExp == 0x7FFF ) { 5965 if ((uint64_t)(aSig << 1)) { 5966 return propagateFloatx80NaN(a, b, status); 5967 } 5968 return a; 5969 } 5970 if ( bExp == 0 ) --expDiff; 5971 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5972 zExp = aExp; 5973 } 5974 else if ( expDiff < 0 ) { 5975 if ( bExp == 0x7FFF ) { 5976 if ((uint64_t)(bSig << 1)) { 5977 return propagateFloatx80NaN(a, b, status); 5978 } 5979 return packFloatx80(zSign, 5980 floatx80_infinity_high, 5981 floatx80_infinity_low); 5982 } 5983 if ( aExp == 0 ) ++expDiff; 5984 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5985 zExp = bExp; 5986 } 5987 else { 5988 if ( aExp == 0x7FFF ) { 5989 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5990 return propagateFloatx80NaN(a, b, status); 5991 } 5992 return a; 5993 } 5994 zSig1 = 0; 5995 zSig0 = aSig + bSig; 5996 if ( aExp == 0 ) { 5997 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) { 5998 /* At least one of the values is a pseudo-denormal, 5999 * and there is a carry out of the result. 
*/ 6000 zExp = 1; 6001 goto shiftRight1; 6002 } 6003 if (zSig0 == 0) { 6004 return packFloatx80(zSign, 0, 0); 6005 } 6006 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 6007 goto roundAndPack; 6008 } 6009 zExp = aExp; 6010 goto shiftRight1; 6011 } 6012 zSig0 = aSig + bSig; 6013 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 6014 shiftRight1: 6015 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6016 zSig0 |= UINT64_C(0x8000000000000000); 6017 ++zExp; 6018 roundAndPack: 6019 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6020 zSign, zExp, zSig0, zSig1, status); 6021 } 6022 6023 /*---------------------------------------------------------------------------- 6024 | Returns the result of subtracting the absolute values of the extended 6025 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 6026 | difference is negated before being returned. `zSign' is ignored if the 6027 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6028 | Standard for Binary Floating-Point Arithmetic. 
6029 *----------------------------------------------------------------------------*/ 6030 6031 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 6032 float_status *status) 6033 { 6034 int32_t aExp, bExp, zExp; 6035 uint64_t aSig, bSig, zSig0, zSig1; 6036 int32_t expDiff; 6037 6038 aSig = extractFloatx80Frac( a ); 6039 aExp = extractFloatx80Exp( a ); 6040 bSig = extractFloatx80Frac( b ); 6041 bExp = extractFloatx80Exp( b ); 6042 expDiff = aExp - bExp; 6043 if ( 0 < expDiff ) goto aExpBigger; 6044 if ( expDiff < 0 ) goto bExpBigger; 6045 if ( aExp == 0x7FFF ) { 6046 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6047 return propagateFloatx80NaN(a, b, status); 6048 } 6049 float_raise(float_flag_invalid, status); 6050 return floatx80_default_nan(status); 6051 } 6052 if ( aExp == 0 ) { 6053 aExp = 1; 6054 bExp = 1; 6055 } 6056 zSig1 = 0; 6057 if ( bSig < aSig ) goto aBigger; 6058 if ( aSig < bSig ) goto bBigger; 6059 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 6060 bExpBigger: 6061 if ( bExp == 0x7FFF ) { 6062 if ((uint64_t)(bSig << 1)) { 6063 return propagateFloatx80NaN(a, b, status); 6064 } 6065 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 6066 floatx80_infinity_low); 6067 } 6068 if ( aExp == 0 ) ++expDiff; 6069 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6070 bBigger: 6071 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 6072 zExp = bExp; 6073 zSign ^= 1; 6074 goto normalizeRoundAndPack; 6075 aExpBigger: 6076 if ( aExp == 0x7FFF ) { 6077 if ((uint64_t)(aSig << 1)) { 6078 return propagateFloatx80NaN(a, b, status); 6079 } 6080 return a; 6081 } 6082 if ( bExp == 0 ) --expDiff; 6083 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 6084 aBigger: 6085 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 6086 zExp = aExp; 6087 normalizeRoundAndPack: 6088 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6089 zSign, zExp, zSig0, zSig1, status); 6090 } 6091 6092 
/*---------------------------------------------------------------------------- 6093 | Returns the result of adding the extended double-precision floating-point 6094 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6095 | Standard for Binary Floating-Point Arithmetic. 6096 *----------------------------------------------------------------------------*/ 6097 6098 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6099 { 6100 bool aSign, bSign; 6101 6102 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6103 float_raise(float_flag_invalid, status); 6104 return floatx80_default_nan(status); 6105 } 6106 aSign = extractFloatx80Sign( a ); 6107 bSign = extractFloatx80Sign( b ); 6108 if ( aSign == bSign ) { 6109 return addFloatx80Sigs(a, b, aSign, status); 6110 } 6111 else { 6112 return subFloatx80Sigs(a, b, aSign, status); 6113 } 6114 6115 } 6116 6117 /*---------------------------------------------------------------------------- 6118 | Returns the result of subtracting the extended double-precision floating- 6119 | point values `a' and `b'. The operation is performed according to the 6120 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6121 *----------------------------------------------------------------------------*/ 6122 6123 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 6124 { 6125 bool aSign, bSign; 6126 6127 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6128 float_raise(float_flag_invalid, status); 6129 return floatx80_default_nan(status); 6130 } 6131 aSign = extractFloatx80Sign( a ); 6132 bSign = extractFloatx80Sign( b ); 6133 if ( aSign == bSign ) { 6134 return subFloatx80Sigs(a, b, aSign, status); 6135 } 6136 else { 6137 return addFloatx80Sigs(a, b, aSign, status); 6138 } 6139 6140 } 6141 6142 /*---------------------------------------------------------------------------- 6143 | Returns the result of multiplying the extended double-precision floating- 6144 | point values `a' and `b'. The operation is performed according to the 6145 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6146 *----------------------------------------------------------------------------*/ 6147 6148 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 6149 { 6150 bool aSign, bSign, zSign; 6151 int32_t aExp, bExp, zExp; 6152 uint64_t aSig, bSig, zSig0, zSig1; 6153 6154 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6155 float_raise(float_flag_invalid, status); 6156 return floatx80_default_nan(status); 6157 } 6158 aSig = extractFloatx80Frac( a ); 6159 aExp = extractFloatx80Exp( a ); 6160 aSign = extractFloatx80Sign( a ); 6161 bSig = extractFloatx80Frac( b ); 6162 bExp = extractFloatx80Exp( b ); 6163 bSign = extractFloatx80Sign( b ); 6164 zSign = aSign ^ bSign; 6165 if ( aExp == 0x7FFF ) { 6166 if ( (uint64_t) ( aSig<<1 ) 6167 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6168 return propagateFloatx80NaN(a, b, status); 6169 } 6170 if ( ( bExp | bSig ) == 0 ) goto invalid; 6171 return packFloatx80(zSign, floatx80_infinity_high, 6172 floatx80_infinity_low); 6173 } 6174 if ( bExp == 0x7FFF ) { 6175 if ((uint64_t)(bSig << 
1)) { 6176 return propagateFloatx80NaN(a, b, status); 6177 } 6178 if ( ( aExp | aSig ) == 0 ) { 6179 invalid: 6180 float_raise(float_flag_invalid, status); 6181 return floatx80_default_nan(status); 6182 } 6183 return packFloatx80(zSign, floatx80_infinity_high, 6184 floatx80_infinity_low); 6185 } 6186 if ( aExp == 0 ) { 6187 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6188 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6189 } 6190 if ( bExp == 0 ) { 6191 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6192 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6193 } 6194 zExp = aExp + bExp - 0x3FFE; 6195 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 6196 if ( 0 < (int64_t) zSig0 ) { 6197 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6198 --zExp; 6199 } 6200 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6201 zSign, zExp, zSig0, zSig1, status); 6202 } 6203 6204 /*---------------------------------------------------------------------------- 6205 | Returns the result of dividing the extended double-precision floating-point 6206 | value `a' by the corresponding value `b'. The operation is performed 6207 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6208 *----------------------------------------------------------------------------*/ 6209 6210 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6211 { 6212 bool aSign, bSign, zSign; 6213 int32_t aExp, bExp, zExp; 6214 uint64_t aSig, bSig, zSig0, zSig1; 6215 uint64_t rem0, rem1, rem2, term0, term1, term2; 6216 6217 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6218 float_raise(float_flag_invalid, status); 6219 return floatx80_default_nan(status); 6220 } 6221 aSig = extractFloatx80Frac( a ); 6222 aExp = extractFloatx80Exp( a ); 6223 aSign = extractFloatx80Sign( a ); 6224 bSig = extractFloatx80Frac( b ); 6225 bExp = extractFloatx80Exp( b ); 6226 bSign = extractFloatx80Sign( b ); 6227 zSign = aSign ^ bSign; 6228 if ( aExp == 0x7FFF ) { 6229 if ((uint64_t)(aSig << 1)) { 6230 return propagateFloatx80NaN(a, b, status); 6231 } 6232 if ( bExp == 0x7FFF ) { 6233 if ((uint64_t)(bSig << 1)) { 6234 return propagateFloatx80NaN(a, b, status); 6235 } 6236 goto invalid; 6237 } 6238 return packFloatx80(zSign, floatx80_infinity_high, 6239 floatx80_infinity_low); 6240 } 6241 if ( bExp == 0x7FFF ) { 6242 if ((uint64_t)(bSig << 1)) { 6243 return propagateFloatx80NaN(a, b, status); 6244 } 6245 return packFloatx80( zSign, 0, 0 ); 6246 } 6247 if ( bExp == 0 ) { 6248 if ( bSig == 0 ) { 6249 if ( ( aExp | aSig ) == 0 ) { 6250 invalid: 6251 float_raise(float_flag_invalid, status); 6252 return floatx80_default_nan(status); 6253 } 6254 float_raise(float_flag_divbyzero, status); 6255 return packFloatx80(zSign, floatx80_infinity_high, 6256 floatx80_infinity_low); 6257 } 6258 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6259 } 6260 if ( aExp == 0 ) { 6261 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6262 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6263 } 6264 zExp = aExp - bExp + 0x3FFE; 6265 rem1 = 0; 6266 if ( bSig <= aSig ) { 6267 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6268 ++zExp; 6269 } 6270 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 6271 mul64To128( bSig, zSig0, &term0, &term1 ); 6272 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6273 while ( (int64_t) rem0 < 0 ) { 6274 --zSig0; 6275 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6276 } 6277 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6278 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6279 mul64To128( bSig, zSig1, &term1, &term2 ); 6280 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6281 while ( (int64_t) rem1 < 0 ) { 6282 --zSig1; 6283 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6284 } 6285 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6286 } 6287 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6288 zSign, zExp, zSig0, zSig1, status); 6289 } 6290 6291 /*---------------------------------------------------------------------------- 6292 | Returns the remainder of the extended double-precision floating-point value 6293 | `a' with respect to the corresponding value `b'. The operation is performed 6294 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic, 6295 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating 6296 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of 6297 | the absolute value of the integer quotient. 
6298 *----------------------------------------------------------------------------*/ 6299 6300 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient, 6301 float_status *status) 6302 { 6303 bool aSign, zSign; 6304 int32_t aExp, bExp, expDiff, aExpOrig; 6305 uint64_t aSig0, aSig1, bSig; 6306 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6307 6308 *quotient = 0; 6309 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6310 float_raise(float_flag_invalid, status); 6311 return floatx80_default_nan(status); 6312 } 6313 aSig0 = extractFloatx80Frac( a ); 6314 aExpOrig = aExp = extractFloatx80Exp( a ); 6315 aSign = extractFloatx80Sign( a ); 6316 bSig = extractFloatx80Frac( b ); 6317 bExp = extractFloatx80Exp( b ); 6318 if ( aExp == 0x7FFF ) { 6319 if ( (uint64_t) ( aSig0<<1 ) 6320 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6321 return propagateFloatx80NaN(a, b, status); 6322 } 6323 goto invalid; 6324 } 6325 if ( bExp == 0x7FFF ) { 6326 if ((uint64_t)(bSig << 1)) { 6327 return propagateFloatx80NaN(a, b, status); 6328 } 6329 if (aExp == 0 && aSig0 >> 63) { 6330 /* 6331 * Pseudo-denormal argument must be returned in normalized 6332 * form. 6333 */ 6334 return packFloatx80(aSign, 1, aSig0); 6335 } 6336 return a; 6337 } 6338 if ( bExp == 0 ) { 6339 if ( bSig == 0 ) { 6340 invalid: 6341 float_raise(float_flag_invalid, status); 6342 return floatx80_default_nan(status); 6343 } 6344 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6345 } 6346 if ( aExp == 0 ) { 6347 if ( aSig0 == 0 ) return a; 6348 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6349 } 6350 zSign = aSign; 6351 expDiff = aExp - bExp; 6352 aSig1 = 0; 6353 if ( expDiff < 0 ) { 6354 if ( mod || expDiff < -1 ) { 6355 if (aExp == 1 && aExpOrig == 0) { 6356 /* 6357 * Pseudo-denormal argument must be returned in 6358 * normalized form. 
6359 */ 6360 return packFloatx80(aSign, aExp, aSig0); 6361 } 6362 return a; 6363 } 6364 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6365 expDiff = 0; 6366 } 6367 *quotient = q = ( bSig <= aSig0 ); 6368 if ( q ) aSig0 -= bSig; 6369 expDiff -= 64; 6370 while ( 0 < expDiff ) { 6371 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6372 q = ( 2 < q ) ? q - 2 : 0; 6373 mul64To128( bSig, q, &term0, &term1 ); 6374 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6375 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6376 expDiff -= 62; 6377 *quotient <<= 62; 6378 *quotient += q; 6379 } 6380 expDiff += 64; 6381 if ( 0 < expDiff ) { 6382 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6383 q = ( 2 < q ) ? q - 2 : 0; 6384 q >>= 64 - expDiff; 6385 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6386 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6387 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6388 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6389 ++q; 6390 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6391 } 6392 if (expDiff < 64) { 6393 *quotient <<= expDiff; 6394 } else { 6395 *quotient = 0; 6396 } 6397 *quotient += q; 6398 } 6399 else { 6400 term1 = 0; 6401 term0 = bSig; 6402 } 6403 if (!mod) { 6404 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6405 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6406 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6407 && ( q & 1 ) ) 6408 ) { 6409 aSig0 = alternateASig0; 6410 aSig1 = alternateASig1; 6411 zSign = ! zSign; 6412 ++*quotient; 6413 } 6414 } 6415 return 6416 normalizeRoundAndPackFloatx80( 6417 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6418 6419 } 6420 6421 /*---------------------------------------------------------------------------- 6422 | Returns the remainder of the extended double-precision floating-point value 6423 | `a' with respect to the corresponding value `b'. 
The operation is performed 6424 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6425 *----------------------------------------------------------------------------*/ 6426 6427 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6428 { 6429 uint64_t quotient; 6430 return floatx80_modrem(a, b, false, "ient, status); 6431 } 6432 6433 /*---------------------------------------------------------------------------- 6434 | Returns the remainder of the extended double-precision floating-point value 6435 | `a' with respect to the corresponding value `b', with the quotient truncated 6436 | toward zero. 6437 *----------------------------------------------------------------------------*/ 6438 6439 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6440 { 6441 uint64_t quotient; 6442 return floatx80_modrem(a, b, true, "ient, status); 6443 } 6444 6445 /*---------------------------------------------------------------------------- 6446 | Returns the square root of the extended double-precision floating-point 6447 | value `a'. The operation is performed according to the IEC/IEEE Standard 6448 | for Binary Floating-Point Arithmetic. 6449 *----------------------------------------------------------------------------*/ 6450 6451 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6452 { 6453 bool aSign; 6454 int32_t aExp, zExp; 6455 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6456 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6457 6458 if (floatx80_invalid_encoding(a)) { 6459 float_raise(float_flag_invalid, status); 6460 return floatx80_default_nan(status); 6461 } 6462 aSig0 = extractFloatx80Frac( a ); 6463 aExp = extractFloatx80Exp( a ); 6464 aSign = extractFloatx80Sign( a ); 6465 if ( aExp == 0x7FFF ) { 6466 if ((uint64_t)(aSig0 << 1)) { 6467 return propagateFloatx80NaN(a, a, status); 6468 } 6469 if ( ! 
aSign ) return a; 6470 goto invalid; 6471 } 6472 if ( aSign ) { 6473 if ( ( aExp | aSig0 ) == 0 ) return a; 6474 invalid: 6475 float_raise(float_flag_invalid, status); 6476 return floatx80_default_nan(status); 6477 } 6478 if ( aExp == 0 ) { 6479 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6480 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6481 } 6482 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6483 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6484 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6485 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6486 doubleZSig0 = zSig0<<1; 6487 mul64To128( zSig0, zSig0, &term0, &term1 ); 6488 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6489 while ( (int64_t) rem0 < 0 ) { 6490 --zSig0; 6491 doubleZSig0 -= 2; 6492 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6493 } 6494 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6495 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6496 if ( zSig1 == 0 ) zSig1 = 1; 6497 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6498 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6499 mul64To128( zSig1, zSig1, &term2, &term3 ); 6500 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6501 while ( (int64_t) rem1 < 0 ) { 6502 --zSig1; 6503 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6504 term3 |= 1; 6505 term2 |= doubleZSig0; 6506 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6507 } 6508 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6509 } 6510 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6511 zSig0 |= doubleZSig0; 6512 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6513 0, zExp, zSig0, zSig1, status); 6514 } 6515 6516 /*---------------------------------------------------------------------------- 6517 | Returns the result of converting the quadruple-precision floating-point 6518 | value `a' to the 32-bit two's complement integer format. 
The conversion 6519 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6520 | Arithmetic---which means in particular that the conversion is rounded 6521 | according to the current rounding mode. If `a' is a NaN, the largest 6522 | positive integer is returned. Otherwise, if the conversion overflows, the 6523 | largest integer with the same sign as `a' is returned. 6524 *----------------------------------------------------------------------------*/ 6525 6526 int32_t float128_to_int32(float128 a, float_status *status) 6527 { 6528 bool aSign; 6529 int32_t aExp, shiftCount; 6530 uint64_t aSig0, aSig1; 6531 6532 aSig1 = extractFloat128Frac1( a ); 6533 aSig0 = extractFloat128Frac0( a ); 6534 aExp = extractFloat128Exp( a ); 6535 aSign = extractFloat128Sign( a ); 6536 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6537 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6538 aSig0 |= ( aSig1 != 0 ); 6539 shiftCount = 0x4028 - aExp; 6540 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6541 return roundAndPackInt32(aSign, aSig0, status); 6542 6543 } 6544 6545 /*---------------------------------------------------------------------------- 6546 | Returns the result of converting the quadruple-precision floating-point 6547 | value `a' to the 32-bit two's complement integer format. The conversion 6548 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6549 | Arithmetic, except that the conversion is always rounded toward zero. If 6550 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6551 | conversion overflows, the largest integer with the same sign as `a' is 6552 | returned. 
6553 *----------------------------------------------------------------------------*/ 6554 6555 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6556 { 6557 bool aSign; 6558 int32_t aExp, shiftCount; 6559 uint64_t aSig0, aSig1, savedASig; 6560 int32_t z; 6561 6562 aSig1 = extractFloat128Frac1( a ); 6563 aSig0 = extractFloat128Frac0( a ); 6564 aExp = extractFloat128Exp( a ); 6565 aSign = extractFloat128Sign( a ); 6566 aSig0 |= ( aSig1 != 0 ); 6567 if ( 0x401E < aExp ) { 6568 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6569 goto invalid; 6570 } 6571 else if ( aExp < 0x3FFF ) { 6572 if (aExp || aSig0) { 6573 float_raise(float_flag_inexact, status); 6574 } 6575 return 0; 6576 } 6577 aSig0 |= UINT64_C(0x0001000000000000); 6578 shiftCount = 0x402F - aExp; 6579 savedASig = aSig0; 6580 aSig0 >>= shiftCount; 6581 z = aSig0; 6582 if ( aSign ) z = - z; 6583 if ( ( z < 0 ) ^ aSign ) { 6584 invalid: 6585 float_raise(float_flag_invalid, status); 6586 return aSign ? INT32_MIN : INT32_MAX; 6587 } 6588 if ( ( aSig0<<shiftCount ) != savedASig ) { 6589 float_raise(float_flag_inexact, status); 6590 } 6591 return z; 6592 6593 } 6594 6595 /*---------------------------------------------------------------------------- 6596 | Returns the result of converting the quadruple-precision floating-point 6597 | value `a' to the 64-bit two's complement integer format. The conversion 6598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6599 | Arithmetic---which means in particular that the conversion is rounded 6600 | according to the current rounding mode. If `a' is a NaN, the largest 6601 | positive integer is returned. Otherwise, if the conversion overflows, the 6602 | largest integer with the same sign as `a' is returned. 
6603 *----------------------------------------------------------------------------*/ 6604 6605 int64_t float128_to_int64(float128 a, float_status *status) 6606 { 6607 bool aSign; 6608 int32_t aExp, shiftCount; 6609 uint64_t aSig0, aSig1; 6610 6611 aSig1 = extractFloat128Frac1( a ); 6612 aSig0 = extractFloat128Frac0( a ); 6613 aExp = extractFloat128Exp( a ); 6614 aSign = extractFloat128Sign( a ); 6615 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6616 shiftCount = 0x402F - aExp; 6617 if ( shiftCount <= 0 ) { 6618 if ( 0x403E < aExp ) { 6619 float_raise(float_flag_invalid, status); 6620 if ( ! aSign 6621 || ( ( aExp == 0x7FFF ) 6622 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6623 ) 6624 ) { 6625 return INT64_MAX; 6626 } 6627 return INT64_MIN; 6628 } 6629 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6630 } 6631 else { 6632 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6633 } 6634 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6635 6636 } 6637 6638 /*---------------------------------------------------------------------------- 6639 | Returns the result of converting the quadruple-precision floating-point 6640 | value `a' to the 64-bit two's complement integer format. The conversion 6641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6642 | Arithmetic, except that the conversion is always rounded toward zero. 6643 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6644 | the conversion overflows, the largest integer with the same sign as `a' is 6645 | returned. 
6646 *----------------------------------------------------------------------------*/ 6647 6648 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6649 { 6650 bool aSign; 6651 int32_t aExp, shiftCount; 6652 uint64_t aSig0, aSig1; 6653 int64_t z; 6654 6655 aSig1 = extractFloat128Frac1( a ); 6656 aSig0 = extractFloat128Frac0( a ); 6657 aExp = extractFloat128Exp( a ); 6658 aSign = extractFloat128Sign( a ); 6659 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6660 shiftCount = aExp - 0x402F; 6661 if ( 0 < shiftCount ) { 6662 if ( 0x403E <= aExp ) { 6663 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6664 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6665 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6666 if (aSig1) { 6667 float_raise(float_flag_inexact, status); 6668 } 6669 } 6670 else { 6671 float_raise(float_flag_invalid, status); 6672 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6673 return INT64_MAX; 6674 } 6675 } 6676 return INT64_MIN; 6677 } 6678 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6679 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6680 float_raise(float_flag_inexact, status); 6681 } 6682 } 6683 else { 6684 if ( aExp < 0x3FFF ) { 6685 if ( aExp | aSig0 | aSig1 ) { 6686 float_raise(float_flag_inexact, status); 6687 } 6688 return 0; 6689 } 6690 z = aSig0>>( - shiftCount ); 6691 if ( aSig1 6692 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6693 float_raise(float_flag_inexact, status); 6694 } 6695 } 6696 if ( aSign ) z = - z; 6697 return z; 6698 6699 } 6700 6701 /*---------------------------------------------------------------------------- 6702 | Returns the result of converting the quadruple-precision floating-point value 6703 | `a' to the 64-bit unsigned integer format. 
The conversion is 6704 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6705 | Arithmetic---which means in particular that the conversion is rounded 6706 | according to the current rounding mode. If `a' is a NaN, the largest 6707 | positive integer is returned. If the conversion overflows, the 6708 | largest unsigned integer is returned. If 'a' is negative, the value is 6709 | rounded and zero is returned; negative values that do not round to zero 6710 | will raise the inexact exception. 6711 *----------------------------------------------------------------------------*/ 6712 6713 uint64_t float128_to_uint64(float128 a, float_status *status) 6714 { 6715 bool aSign; 6716 int aExp; 6717 int shiftCount; 6718 uint64_t aSig0, aSig1; 6719 6720 aSig0 = extractFloat128Frac0(a); 6721 aSig1 = extractFloat128Frac1(a); 6722 aExp = extractFloat128Exp(a); 6723 aSign = extractFloat128Sign(a); 6724 if (aSign && (aExp > 0x3FFE)) { 6725 float_raise(float_flag_invalid, status); 6726 if (float128_is_any_nan(a)) { 6727 return UINT64_MAX; 6728 } else { 6729 return 0; 6730 } 6731 } 6732 if (aExp) { 6733 aSig0 |= UINT64_C(0x0001000000000000); 6734 } 6735 shiftCount = 0x402F - aExp; 6736 if (shiftCount <= 0) { 6737 if (0x403E < aExp) { 6738 float_raise(float_flag_invalid, status); 6739 return UINT64_MAX; 6740 } 6741 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6742 } else { 6743 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6744 } 6745 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6746 } 6747 6748 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6749 { 6750 uint64_t v; 6751 signed char current_rounding_mode = status->float_rounding_mode; 6752 6753 set_float_rounding_mode(float_round_to_zero, status); 6754 v = float128_to_uint64(a, status); 6755 set_float_rounding_mode(current_rounding_mode, status); 6756 6757 return v; 6758 } 6759 6760 
/*---------------------------------------------------------------------------- 6761 | Returns the result of converting the quadruple-precision floating-point 6762 | value `a' to the 32-bit unsigned integer format. The conversion 6763 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6764 | Arithmetic except that the conversion is always rounded toward zero. 6765 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6766 | if the conversion overflows, the largest unsigned integer is returned. 6767 | If 'a' is negative, the value is rounded and zero is returned; negative 6768 | values that do not round to zero will raise the inexact exception. 6769 *----------------------------------------------------------------------------*/ 6770 6771 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6772 { 6773 uint64_t v; 6774 uint32_t res; 6775 int old_exc_flags = get_float_exception_flags(status); 6776 6777 v = float128_to_uint64_round_to_zero(a, status); 6778 if (v > 0xffffffff) { 6779 res = 0xffffffff; 6780 } else { 6781 return v; 6782 } 6783 set_float_exception_flags(old_exc_flags, status); 6784 float_raise(float_flag_invalid, status); 6785 return res; 6786 } 6787 6788 /*---------------------------------------------------------------------------- 6789 | Returns the result of converting the quadruple-precision floating-point value 6790 | `a' to the 32-bit unsigned integer format. The conversion is 6791 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6792 | Arithmetic---which means in particular that the conversion is rounded 6793 | according to the current rounding mode. If `a' is a NaN, the largest 6794 | positive integer is returned. If the conversion overflows, the 6795 | largest unsigned integer is returned. If 'a' is negative, the value is 6796 | rounded and zero is returned; negative values that do not round to zero 6797 | will raise the inexact exception. 
6798 *----------------------------------------------------------------------------*/ 6799 6800 uint32_t float128_to_uint32(float128 a, float_status *status) 6801 { 6802 uint64_t v; 6803 uint32_t res; 6804 int old_exc_flags = get_float_exception_flags(status); 6805 6806 v = float128_to_uint64(a, status); 6807 if (v > 0xffffffff) { 6808 res = 0xffffffff; 6809 } else { 6810 return v; 6811 } 6812 set_float_exception_flags(old_exc_flags, status); 6813 float_raise(float_flag_invalid, status); 6814 return res; 6815 } 6816 6817 /*---------------------------------------------------------------------------- 6818 | Returns the result of converting the quadruple-precision floating-point 6819 | value `a' to the single-precision floating-point format. The conversion 6820 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6821 | Arithmetic. 6822 *----------------------------------------------------------------------------*/ 6823 6824 float32 float128_to_float32(float128 a, float_status *status) 6825 { 6826 bool aSign; 6827 int32_t aExp; 6828 uint64_t aSig0, aSig1; 6829 uint32_t zSig; 6830 6831 aSig1 = extractFloat128Frac1( a ); 6832 aSig0 = extractFloat128Frac0( a ); 6833 aExp = extractFloat128Exp( a ); 6834 aSign = extractFloat128Sign( a ); 6835 if ( aExp == 0x7FFF ) { 6836 if ( aSig0 | aSig1 ) { 6837 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6838 } 6839 return packFloat32( aSign, 0xFF, 0 ); 6840 } 6841 aSig0 |= ( aSig1 != 0 ); 6842 shift64RightJamming( aSig0, 18, &aSig0 ); 6843 zSig = aSig0; 6844 if ( aExp || zSig ) { 6845 zSig |= 0x40000000; 6846 aExp -= 0x3F81; 6847 } 6848 return roundAndPackFloat32(aSign, aExp, zSig, status); 6849 6850 } 6851 6852 /*---------------------------------------------------------------------------- 6853 | Returns the result of converting the quadruple-precision floating-point 6854 | value `a' to the double-precision floating-point format. 
The conversion 6855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6856 | Arithmetic. 6857 *----------------------------------------------------------------------------*/ 6858 6859 float64 float128_to_float64(float128 a, float_status *status) 6860 { 6861 bool aSign; 6862 int32_t aExp; 6863 uint64_t aSig0, aSig1; 6864 6865 aSig1 = extractFloat128Frac1( a ); 6866 aSig0 = extractFloat128Frac0( a ); 6867 aExp = extractFloat128Exp( a ); 6868 aSign = extractFloat128Sign( a ); 6869 if ( aExp == 0x7FFF ) { 6870 if ( aSig0 | aSig1 ) { 6871 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6872 } 6873 return packFloat64( aSign, 0x7FF, 0 ); 6874 } 6875 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6876 aSig0 |= ( aSig1 != 0 ); 6877 if ( aExp || aSig0 ) { 6878 aSig0 |= UINT64_C(0x4000000000000000); 6879 aExp -= 0x3C01; 6880 } 6881 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6882 6883 } 6884 6885 /*---------------------------------------------------------------------------- 6886 | Returns the result of converting the quadruple-precision floating-point 6887 | value `a' to the extended double-precision floating-point format. The 6888 | conversion is performed according to the IEC/IEEE Standard for Binary 6889 | Floating-Point Arithmetic. 
6890 *----------------------------------------------------------------------------*/ 6891 6892 floatx80 float128_to_floatx80(float128 a, float_status *status) 6893 { 6894 bool aSign; 6895 int32_t aExp; 6896 uint64_t aSig0, aSig1; 6897 6898 aSig1 = extractFloat128Frac1( a ); 6899 aSig0 = extractFloat128Frac0( a ); 6900 aExp = extractFloat128Exp( a ); 6901 aSign = extractFloat128Sign( a ); 6902 if ( aExp == 0x7FFF ) { 6903 if ( aSig0 | aSig1 ) { 6904 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6905 status); 6906 return floatx80_silence_nan(res, status); 6907 } 6908 return packFloatx80(aSign, floatx80_infinity_high, 6909 floatx80_infinity_low); 6910 } 6911 if ( aExp == 0 ) { 6912 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6913 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6914 } 6915 else { 6916 aSig0 |= UINT64_C(0x0001000000000000); 6917 } 6918 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6919 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6920 6921 } 6922 6923 /*---------------------------------------------------------------------------- 6924 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6925 | returns the result as a quadruple-precision floating-point value. The 6926 | operation is performed according to the IEC/IEEE Standard for Binary 6927 | Floating-Point Arithmetic. 
6928 *----------------------------------------------------------------------------*/ 6929 6930 float128 float128_round_to_int(float128 a, float_status *status) 6931 { 6932 bool aSign; 6933 int32_t aExp; 6934 uint64_t lastBitMask, roundBitsMask; 6935 float128 z; 6936 6937 aExp = extractFloat128Exp( a ); 6938 if ( 0x402F <= aExp ) { 6939 if ( 0x406F <= aExp ) { 6940 if ( ( aExp == 0x7FFF ) 6941 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6942 ) { 6943 return propagateFloat128NaN(a, a, status); 6944 } 6945 return a; 6946 } 6947 lastBitMask = 1; 6948 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6949 roundBitsMask = lastBitMask - 1; 6950 z = a; 6951 switch (status->float_rounding_mode) { 6952 case float_round_nearest_even: 6953 if ( lastBitMask ) { 6954 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6955 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6956 } 6957 else { 6958 if ( (int64_t) z.low < 0 ) { 6959 ++z.high; 6960 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6961 } 6962 } 6963 break; 6964 case float_round_ties_away: 6965 if (lastBitMask) { 6966 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6967 } else { 6968 if ((int64_t) z.low < 0) { 6969 ++z.high; 6970 } 6971 } 6972 break; 6973 case float_round_to_zero: 6974 break; 6975 case float_round_up: 6976 if (!extractFloat128Sign(z)) { 6977 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6978 } 6979 break; 6980 case float_round_down: 6981 if (extractFloat128Sign(z)) { 6982 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6983 } 6984 break; 6985 case float_round_to_odd: 6986 /* 6987 * Note that if lastBitMask == 0, the last bit is the lsb 6988 * of high, and roundBitsMask == -1. 6989 */ 6990 if ((lastBitMask ? 
z.low & lastBitMask : z.high & 1) == 0) { 6991 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6992 } 6993 break; 6994 default: 6995 abort(); 6996 } 6997 z.low &= ~ roundBitsMask; 6998 } 6999 else { 7000 if ( aExp < 0x3FFF ) { 7001 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 7002 float_raise(float_flag_inexact, status); 7003 aSign = extractFloat128Sign( a ); 7004 switch (status->float_rounding_mode) { 7005 case float_round_nearest_even: 7006 if ( ( aExp == 0x3FFE ) 7007 && ( extractFloat128Frac0( a ) 7008 | extractFloat128Frac1( a ) ) 7009 ) { 7010 return packFloat128( aSign, 0x3FFF, 0, 0 ); 7011 } 7012 break; 7013 case float_round_ties_away: 7014 if (aExp == 0x3FFE) { 7015 return packFloat128(aSign, 0x3FFF, 0, 0); 7016 } 7017 break; 7018 case float_round_down: 7019 return 7020 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 7021 : packFloat128( 0, 0, 0, 0 ); 7022 case float_round_up: 7023 return 7024 aSign ? packFloat128( 1, 0, 0, 0 ) 7025 : packFloat128( 0, 0x3FFF, 0, 0 ); 7026 7027 case float_round_to_odd: 7028 return packFloat128(aSign, 0x3FFF, 0, 0); 7029 7030 case float_round_to_zero: 7031 break; 7032 } 7033 return packFloat128( aSign, 0, 0, 0 ); 7034 } 7035 lastBitMask = 1; 7036 lastBitMask <<= 0x402F - aExp; 7037 roundBitsMask = lastBitMask - 1; 7038 z.low = 0; 7039 z.high = a.high; 7040 switch (status->float_rounding_mode) { 7041 case float_round_nearest_even: 7042 z.high += lastBitMask>>1; 7043 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 7044 z.high &= ~ lastBitMask; 7045 } 7046 break; 7047 case float_round_ties_away: 7048 z.high += lastBitMask>>1; 7049 break; 7050 case float_round_to_zero: 7051 break; 7052 case float_round_up: 7053 if (!extractFloat128Sign(z)) { 7054 z.high |= ( a.low != 0 ); 7055 z.high += roundBitsMask; 7056 } 7057 break; 7058 case float_round_down: 7059 if (extractFloat128Sign(z)) { 7060 z.high |= (a.low != 0); 7061 z.high += roundBitsMask; 7062 } 7063 break; 7064 case float_round_to_odd: 7065 if ((z.high 
& lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special operands are handled before the significands are divided:
|   - a NaN operand is handed to propagateFloat128NaN();
|   - infinity / infinity and zero / zero raise float_flag_invalid and return
|     the default NaN;
|   - infinity / finite returns an infinity, finite / infinity returns a zero;
|   - nonzero finite / zero raises float_flag_divbyzero and returns an
|     infinity;
|   - zero / nonzero finite returns a zero.
| Infinite and zero results carry the sign `aSign ^ bSign'.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* a has the maximum exponent: it is either a NaN or an infinity. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        /* infinity / finite */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* a is finite from here on; b is either a NaN or an infinity. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* b is zero: zero / zero is invalid, anything else divides by 0 */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        /* b is a nonzero subnormal */
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        /* a is a nonzero subnormal */
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /*
     * OR bit 48 into the high word of each significand (the leading bit that
     * the fraction field does not hold) and shift both left by 15 bits.
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /*
     * When le128() reports b's significand <= a's, halve a's significand and
     * compensate by incrementing the result exponent.
     */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /*
     * Upper quotient word: take an estimate, form remainder = a - b * zSig0,
     * and while that remainder is negative lower the estimate by one and add
     * b back.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /*
     * Lower quotient word: the estimate is corrected the same way, but only
     * when its low 14 bits are <= 4.  In that case bit 0 is additionally ORed
     * with a flag telling whether the final remainder is nonzero.
     */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /*
     * Shift the quotient right by 15 bits into the three-word form that
     * roundAndPackFloat128() takes (zSig2 presumably receives the shifted-out
     * bits -- confirm against shift128ExtraRightJamming()).
     */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

/*
 * Special operands handled by float128_rem() before the reduction:
 *   - a NaN operand is handed to propagateFloat128NaN();
 *   - an infinite `a', or a zero `b', raises float_flag_invalid and returns
 *     the default NaN;
 *   - an infinite `b' (with finite `a') returns `a' unchanged;
 *   - a zero `a' returns `a' unchanged;
 *   - when the exponent of `a' is more than one below that of `b', `a' is
 *     returned unchanged.
 */
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* a is a NaN or an infinity */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* a is an infinity and b is not a NaN */
        goto invalid;
    }
    /* b is a NaN or an infinity; a is finite */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* remainder with respect to zero */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /*
     * OR bit 48 into the high word of each significand and shift left: b by
     * 15 bits, a by 15 bits, or by 14 when expDiff is negative (i.e. -1).
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* q starts as 1 when b's significand <= a's; b is then subtracted once. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Main reduction: each pass takes a quotient estimate lowered by 4
     * (clamped at 0), subtracts b * q from a with both sides shifted left by
     * 61 bits, and takes 61 off expDiff.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /*
         * Final partial step: the lowered estimate is shifted right by
         * -expDiff, b is shifted right by 12, and a is realigned by
         * expDiff + 52 before b * q is subtracted.
         */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No partial step: only bring both operands to the same alignment. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting b, counting in q, until the remainder turns negative.
     * alternateASig* holds the value from just before the last subtraction.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * sigMean is the sum of the two candidate remainders.  The earlier
     * (non-negative) candidate is chosen when the sum is negative, or when
     * the sum is zero and q is odd; otherwise the negative one is kept.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the sign of the result. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
7280 *----------------------------------------------------------------------------*/ 7281 7282 float128 float128_sqrt(float128 a, float_status *status) 7283 { 7284 bool aSign; 7285 int32_t aExp, zExp; 7286 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7287 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7288 7289 aSig1 = extractFloat128Frac1( a ); 7290 aSig0 = extractFloat128Frac0( a ); 7291 aExp = extractFloat128Exp( a ); 7292 aSign = extractFloat128Sign( a ); 7293 if ( aExp == 0x7FFF ) { 7294 if (aSig0 | aSig1) { 7295 return propagateFloat128NaN(a, a, status); 7296 } 7297 if ( ! aSign ) return a; 7298 goto invalid; 7299 } 7300 if ( aSign ) { 7301 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7302 invalid: 7303 float_raise(float_flag_invalid, status); 7304 return float128_default_nan(status); 7305 } 7306 if ( aExp == 0 ) { 7307 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7308 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7309 } 7310 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7311 aSig0 |= UINT64_C(0x0001000000000000); 7312 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7313 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7314 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7315 doubleZSig0 = zSig0<<1; 7316 mul64To128( zSig0, zSig0, &term0, &term1 ); 7317 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7318 while ( (int64_t) rem0 < 0 ) { 7319 --zSig0; 7320 doubleZSig0 -= 2; 7321 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7322 } 7323 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7324 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7325 if ( zSig1 == 0 ) zSig1 = 1; 7326 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7327 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7328 mul64To128( zSig1, zSig1, &term2, &term3 ); 7329 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7330 while ( (int64_t) rem1 < 0 ) { 7331 --zSig1; 7332 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7333 term3 |= 1; 7334 term2 |= doubleZSig0; 7335 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7336 } 7337 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7338 } 7339 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7340 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7341 7342 } 7343 7344 static inline FloatRelation 7345 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 7346 float_status *status) 7347 { 7348 bool aSign, bSign; 7349 7350 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7351 float_raise(float_flag_invalid, status); 7352 return float_relation_unordered; 7353 } 7354 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7355 ( extractFloatx80Frac( a )<<1 ) ) || 7356 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7357 ( extractFloatx80Frac( b )<<1 ) )) { 7358 if (!is_quiet || 7359 floatx80_is_signaling_nan(a, status) || 7360 floatx80_is_signaling_nan(b, status)) { 7361 float_raise(float_flag_invalid, status); 7362 } 7363 return float_relation_unordered; 7364 } 7365 aSign = extractFloatx80Sign( a ); 7366 bSign = extractFloatx80Sign( b ); 7367 if ( aSign != bSign ) { 7368 7369 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7370 ( ( a.low | b.low ) == 0 ) ) { 7371 /* zero case */ 7372 return float_relation_equal; 7373 } else { 7374 return 1 - (2 * aSign); 7375 } 7376 } else { 7377 /* Normalize pseudo-denormals before comparison. 
*/ 7378 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 7379 ++a.high; 7380 } 7381 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 7382 ++b.high; 7383 } 7384 if (a.low == b.low && a.high == b.high) { 7385 return float_relation_equal; 7386 } else { 7387 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7388 } 7389 } 7390 } 7391 7392 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7393 { 7394 return floatx80_compare_internal(a, b, 0, status); 7395 } 7396 7397 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 7398 float_status *status) 7399 { 7400 return floatx80_compare_internal(a, b, 1, status); 7401 } 7402 7403 static inline FloatRelation 7404 float128_compare_internal(float128 a, float128 b, bool is_quiet, 7405 float_status *status) 7406 { 7407 bool aSign, bSign; 7408 7409 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7410 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7411 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7412 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7413 if (!is_quiet || 7414 float128_is_signaling_nan(a, status) || 7415 float128_is_signaling_nan(b, status)) { 7416 float_raise(float_flag_invalid, status); 7417 } 7418 return float_relation_unordered; 7419 } 7420 aSign = extractFloat128Sign( a ); 7421 bSign = extractFloat128Sign( b ); 7422 if ( aSign != bSign ) { 7423 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7424 /* zero case */ 7425 return float_relation_equal; 7426 } else { 7427 return 1 - (2 * aSign); 7428 } 7429 } else { 7430 if (a.low == b.low && a.high == b.high) { 7431 return float_relation_equal; 7432 } else { 7433 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7434 } 7435 } 7436 } 7437 7438 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 7439 { 7440 return float128_compare_internal(a, b, 0, status); 7441 } 7442 7443 FloatRelation 
float128_compare_quiet(float128 a, float128 b, 7444 float_status *status) 7445 { 7446 return float128_compare_internal(a, b, 1, status); 7447 } 7448 7449 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7450 { 7451 bool aSign; 7452 int32_t aExp; 7453 uint64_t aSig; 7454 7455 if (floatx80_invalid_encoding(a)) { 7456 float_raise(float_flag_invalid, status); 7457 return floatx80_default_nan(status); 7458 } 7459 aSig = extractFloatx80Frac( a ); 7460 aExp = extractFloatx80Exp( a ); 7461 aSign = extractFloatx80Sign( a ); 7462 7463 if ( aExp == 0x7FFF ) { 7464 if ( aSig<<1 ) { 7465 return propagateFloatx80NaN(a, a, status); 7466 } 7467 return a; 7468 } 7469 7470 if (aExp == 0) { 7471 if (aSig == 0) { 7472 return a; 7473 } 7474 aExp++; 7475 } 7476 7477 if (n > 0x10000) { 7478 n = 0x10000; 7479 } else if (n < -0x10000) { 7480 n = -0x10000; 7481 } 7482 7483 aExp += n; 7484 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7485 aSign, aExp, aSig, 0, status); 7486 } 7487 7488 float128 float128_scalbn(float128 a, int n, float_status *status) 7489 { 7490 bool aSign; 7491 int32_t aExp; 7492 uint64_t aSig0, aSig1; 7493 7494 aSig1 = extractFloat128Frac1( a ); 7495 aSig0 = extractFloat128Frac0( a ); 7496 aExp = extractFloat128Exp( a ); 7497 aSign = extractFloat128Sign( a ); 7498 if ( aExp == 0x7FFF ) { 7499 if ( aSig0 | aSig1 ) { 7500 return propagateFloat128NaN(a, a, status); 7501 } 7502 return a; 7503 } 7504 if (aExp != 0) { 7505 aSig0 |= UINT64_C(0x0001000000000000); 7506 } else if (aSig0 == 0 && aSig1 == 0) { 7507 return a; 7508 } else { 7509 aExp++; 7510 } 7511 7512 if (n > 0x10000) { 7513 n = 0x10000; 7514 } else if (n < -0x10000) { 7515 n = -0x10000; 7516 } 7517 7518 aExp += n - 1; 7519 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7520 , status); 7521 7522 } 7523 7524 static void __attribute__((constructor)) softfloat_init(void) 7525 { 7526 union_float64 ua, ub, uc, ur; 7527 7528 if (QEMU_NO_HARDFLOAT) { 7529 return; 
7530 } 7531 /* 7532 * Test that the host's FMA is not obviously broken. For example, 7533 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7534 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7535 */ 7536 ua.s = 0x0020000000000001ULL; 7537 ub.s = 0x3ca0000000000000ULL; 7538 uc.s = 0x0020000000000000ULL; 7539 ur.h = fma(ua.h, ub.h, uc.h); 7540 if (ur.s != 0x0020000000000001ULL) { 7541 force_soft_fma = true; 7542 } 7543 } 7544