1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
128 */ 129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ 130 static inline void name(soft_t *a, float_status *s) \ 131 { \ 132 if (unlikely(soft_t ## _is_denormal(*a))) { \ 133 *a = soft_t ## _set_sign(soft_t ## _zero, \ 134 soft_t ## _is_neg(*a)); \ 135 float_raise(float_flag_input_denormal, s); \ 136 } \ 137 } 138 139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) 140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) 141 #undef GEN_INPUT_FLUSH__NOCHECK 142 143 #define GEN_INPUT_FLUSH1(name, soft_t) \ 144 static inline void name(soft_t *a, float_status *s) \ 145 { \ 146 if (likely(!s->flush_inputs_to_zero)) { \ 147 return; \ 148 } \ 149 soft_t ## _input_flush__nocheck(a, s); \ 150 } 151 152 GEN_INPUT_FLUSH1(float32_input_flush1, float32) 153 GEN_INPUT_FLUSH1(float64_input_flush1, float64) 154 #undef GEN_INPUT_FLUSH1 155 156 #define GEN_INPUT_FLUSH2(name, soft_t) \ 157 static inline void name(soft_t *a, soft_t *b, float_status *s) \ 158 { \ 159 if (likely(!s->flush_inputs_to_zero)) { \ 160 return; \ 161 } \ 162 soft_t ## _input_flush__nocheck(a, s); \ 163 soft_t ## _input_flush__nocheck(b, s); \ 164 } 165 166 GEN_INPUT_FLUSH2(float32_input_flush2, float32) 167 GEN_INPUT_FLUSH2(float64_input_flush2, float64) 168 #undef GEN_INPUT_FLUSH2 169 170 #define GEN_INPUT_FLUSH3(name, soft_t) \ 171 static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ 172 { \ 173 if (likely(!s->flush_inputs_to_zero)) { \ 174 return; \ 175 } \ 176 soft_t ## _input_flush__nocheck(a, s); \ 177 soft_t ## _input_flush__nocheck(b, s); \ 178 soft_t ## _input_flush__nocheck(c, s); \ 179 } 180 181 GEN_INPUT_FLUSH3(float32_input_flush3, float32) 182 GEN_INPUT_FLUSH3(float64_input_flush3, float64) 183 #undef GEN_INPUT_FLUSH3 184 185 /* 186 * Choose whether to use fpclassify or float32/64_* primitives in the generated 187 * hardfloat functions. Each combination of number of inputs and float size 188 * gets its own value. 
189 */ 190 #if defined(__x86_64__) 191 # define QEMU_HARDFLOAT_1F32_USE_FP 0 192 # define QEMU_HARDFLOAT_1F64_USE_FP 1 193 # define QEMU_HARDFLOAT_2F32_USE_FP 0 194 # define QEMU_HARDFLOAT_2F64_USE_FP 1 195 # define QEMU_HARDFLOAT_3F32_USE_FP 0 196 # define QEMU_HARDFLOAT_3F64_USE_FP 1 197 #else 198 # define QEMU_HARDFLOAT_1F32_USE_FP 0 199 # define QEMU_HARDFLOAT_1F64_USE_FP 0 200 # define QEMU_HARDFLOAT_2F32_USE_FP 0 201 # define QEMU_HARDFLOAT_2F64_USE_FP 0 202 # define QEMU_HARDFLOAT_3F32_USE_FP 0 203 # define QEMU_HARDFLOAT_3F64_USE_FP 0 204 #endif 205 206 /* 207 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over 208 * float{32,64}_is_infinity when !USE_FP. 209 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. 210 * On power64 however, using isinf() reduces fp-bench performance by up to 50%. 211 */ 212 #if defined(__x86_64__) || defined(__aarch64__) 213 # define QEMU_HARDFLOAT_USE_ISINF 1 214 #else 215 # define QEMU_HARDFLOAT_USE_ISINF 0 216 #endif 217 218 /* 219 * Some targets clear the FP flags before most FP operations. This prevents 220 * the use of hardfloat, since hardfloat relies on the inexact flag being 221 * already set. 222 */ 223 #if defined(TARGET_PPC) || defined(__FAST_MATH__) 224 # if defined(__FAST_MATH__) 225 # warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ 226 IEEE implementation 227 # endif 228 # define QEMU_NO_HARDFLOAT 1 229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN 230 #else 231 # define QEMU_NO_HARDFLOAT 0 232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) 233 #endif 234 235 static inline bool can_use_fpu(const float_status *s) 236 { 237 if (QEMU_NO_HARDFLOAT) { 238 return false; 239 } 240 return likely(s->float_exception_flags & float_flag_inexact && 241 s->float_rounding_mode == float_round_nearest_even); 242 } 243 244 /* 245 * Hardfloat generation functions. Each operation can have two flavors: 246 * either using softfloat primitives (e.g. 
float32_is_zero_or_normal) for 247 * most condition checks, or native ones (e.g. fpclassify). 248 * 249 * The flavor is chosen by the callers. Instead of using macros, we rely on the 250 * compiler to propagate constants and inline everything into the callers. 251 * 252 * We only generate functions for operations with two inputs, since only 253 * these are common enough to justify consolidating them into common code. 254 */ 255 256 typedef union { 257 float32 s; 258 float h; 259 } union_float32; 260 261 typedef union { 262 float64 s; 263 double h; 264 } union_float64; 265 266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); 267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); 268 269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); 270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); 271 typedef float (*hard_f32_op2_fn)(float a, float b); 272 typedef double (*hard_f64_op2_fn)(double a, double b); 273 274 /* 2-input is-zero-or-normal */ 275 static inline bool f32_is_zon2(union_float32 a, union_float32 b) 276 { 277 if (QEMU_HARDFLOAT_2F32_USE_FP) { 278 /* 279 * Not using a temp variable for consecutive fpclassify calls ends up 280 * generating faster code. 
281 */ 282 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 283 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 284 } 285 return float32_is_zero_or_normal(a.s) && 286 float32_is_zero_or_normal(b.s); 287 } 288 289 static inline bool f64_is_zon2(union_float64 a, union_float64 b) 290 { 291 if (QEMU_HARDFLOAT_2F64_USE_FP) { 292 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 293 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); 294 } 295 return float64_is_zero_or_normal(a.s) && 296 float64_is_zero_or_normal(b.s); 297 } 298 299 /* 3-input is-zero-or-normal */ 300 static inline 301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) 302 { 303 if (QEMU_HARDFLOAT_3F32_USE_FP) { 304 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 305 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 306 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 307 } 308 return float32_is_zero_or_normal(a.s) && 309 float32_is_zero_or_normal(b.s) && 310 float32_is_zero_or_normal(c.s); 311 } 312 313 static inline 314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) 315 { 316 if (QEMU_HARDFLOAT_3F64_USE_FP) { 317 return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && 318 (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && 319 (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); 320 } 321 return float64_is_zero_or_normal(a.s) && 322 float64_is_zero_or_normal(b.s) && 323 float64_is_zero_or_normal(c.s); 324 } 325 326 static inline bool f32_is_inf(union_float32 a) 327 { 328 if (QEMU_HARDFLOAT_USE_ISINF) { 329 return isinf(a.h); 330 } 331 return float32_is_infinity(a.s); 332 } 333 334 static inline bool f64_is_inf(union_float64 a) 335 { 336 if (QEMU_HARDFLOAT_USE_ISINF) { 337 return isinf(a.h); 338 } 339 return float64_is_infinity(a.s); 340 } 341 342 static inline float32 343 float32_gen2(float32 xa, float32 xb, 
float_status *s, 344 hard_f32_op2_fn hard, soft_f32_op2_fn soft, 345 f32_check_fn pre, f32_check_fn post) 346 { 347 union_float32 ua, ub, ur; 348 349 ua.s = xa; 350 ub.s = xb; 351 352 if (unlikely(!can_use_fpu(s))) { 353 goto soft; 354 } 355 356 float32_input_flush2(&ua.s, &ub.s, s); 357 if (unlikely(!pre(ua, ub))) { 358 goto soft; 359 } 360 361 ur.h = hard(ua.h, ub.h); 362 if (unlikely(f32_is_inf(ur))) { 363 float_raise(float_flag_overflow, s); 364 } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) { 365 goto soft; 366 } 367 return ur.s; 368 369 soft: 370 return soft(ua.s, ub.s, s); 371 } 372 373 static inline float64 374 float64_gen2(float64 xa, float64 xb, float_status *s, 375 hard_f64_op2_fn hard, soft_f64_op2_fn soft, 376 f64_check_fn pre, f64_check_fn post) 377 { 378 union_float64 ua, ub, ur; 379 380 ua.s = xa; 381 ub.s = xb; 382 383 if (unlikely(!can_use_fpu(s))) { 384 goto soft; 385 } 386 387 float64_input_flush2(&ua.s, &ub.s, s); 388 if (unlikely(!pre(ua, ub))) { 389 goto soft; 390 } 391 392 ur.h = hard(ua.h, ub.h); 393 if (unlikely(f64_is_inf(ur))) { 394 float_raise(float_flag_overflow, s); 395 } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) { 396 goto soft; 397 } 398 return ur.s; 399 400 soft: 401 return soft(ua.s, ub.s, s); 402 } 403 404 /*---------------------------------------------------------------------------- 405 | Returns the fraction bits of the single-precision floating-point value `a'. 406 *----------------------------------------------------------------------------*/ 407 408 static inline uint32_t extractFloat32Frac(float32 a) 409 { 410 return float32_val(a) & 0x007FFFFF; 411 } 412 413 /*---------------------------------------------------------------------------- 414 | Returns the exponent bits of the single-precision floating-point value `a'. 
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
461 */ 462 463 typedef enum __attribute__ ((__packed__)) { 464 float_class_unclassified, 465 float_class_zero, 466 float_class_normal, 467 float_class_inf, 468 float_class_qnan, /* all NaNs from here */ 469 float_class_snan, 470 } FloatClass; 471 472 #define float_cmask(bit) (1u << (bit)) 473 474 enum { 475 float_cmask_zero = float_cmask(float_class_zero), 476 float_cmask_normal = float_cmask(float_class_normal), 477 float_cmask_inf = float_cmask(float_class_inf), 478 float_cmask_qnan = float_cmask(float_class_qnan), 479 float_cmask_snan = float_cmask(float_class_snan), 480 481 float_cmask_infzero = float_cmask_zero | float_cmask_inf, 482 float_cmask_anynan = float_cmask_qnan | float_cmask_snan, 483 }; 484 485 486 /* Simple helpers for checking if, or what kind of, NaN we have */ 487 static inline __attribute__((unused)) bool is_nan(FloatClass c) 488 { 489 return unlikely(c >= float_class_qnan); 490 } 491 492 static inline __attribute__((unused)) bool is_snan(FloatClass c) 493 { 494 return c == float_class_snan; 495 } 496 497 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 498 { 499 return c == float_class_qnan; 500 } 501 502 /* 503 * Structure holding all of the decomposed parts of a float. 504 * The exponent is unbiased and the fraction is normalized. 505 * 506 * The fraction words are stored in big-endian word ordering, 507 * so that truncation from a larger format to a smaller format 508 * can be done simply by ignoring subsequent elements. 509 */ 510 511 typedef struct { 512 FloatClass cls; 513 bool sign; 514 int32_t exp; 515 union { 516 /* Routines that know the structure may reference the singular name. */ 517 uint64_t frac; 518 /* 519 * Routines expanded with multiple structures reference "hi" and "lo" 520 * depending on the operation. In FloatParts64, "hi" and "lo" are 521 * both the same word and aliased here. 
522 */ 523 uint64_t frac_hi; 524 uint64_t frac_lo; 525 }; 526 } FloatParts64; 527 528 typedef struct { 529 FloatClass cls; 530 bool sign; 531 int32_t exp; 532 uint64_t frac_hi; 533 uint64_t frac_lo; 534 } FloatParts128; 535 536 typedef struct { 537 FloatClass cls; 538 bool sign; 539 int32_t exp; 540 uint64_t frac_hi; 541 uint64_t frac_hm; /* high-middle */ 542 uint64_t frac_lm; /* low-middle */ 543 uint64_t frac_lo; 544 } FloatParts256; 545 546 /* These apply to the most significant word of each FloatPartsN. */ 547 #define DECOMPOSED_BINARY_POINT 63 548 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 549 550 /* Structure holding all of the relevant parameters for a format. 551 * exp_size: the size of the exponent field 552 * exp_bias: the offset applied to the exponent field 553 * exp_max: the maximum normalised exponent 554 * frac_size: the size of the fraction field 555 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 556 * The following are computed based the size of fraction 557 * frac_lsb: least significant bit of fraction 558 * frac_lsbm1: the bit below the least significant bit (for rounding) 559 * round_mask/roundeven_mask: masks used for rounding 560 * The following optional modifiers are available: 561 * arm_althp: handle ARM Alternative Half Precision 562 */ 563 typedef struct { 564 int exp_size; 565 int exp_bias; 566 int exp_max; 567 int frac_size; 568 int frac_shift; 569 uint64_t frac_lsb; 570 uint64_t frac_lsbm1; 571 uint64_t round_mask; 572 uint64_t roundeven_mask; 573 bool arm_althp; 574 } FloatFmt; 575 576 /* Expand fields based on the size of exponent and fraction */ 577 #define FLOAT_PARAMS(E, F) \ 578 .exp_size = E, \ 579 .exp_bias = ((1 << E) - 1) >> 1, \ 580 .exp_max = (1 << E) - 1, \ 581 .frac_size = F, \ 582 .frac_shift = (-F - 1) & 63, \ 583 .frac_lsb = 1ull << ((-F - 1) & 63), \ 584 .frac_lsbm1 = 1ull << ((-F - 2) & 63), \ 585 .round_mask = (1ull << ((-F - 1) & 63)) - 1, \ 586 .roundeven_mask = 
(2ull << ((-F - 1) & 63)) - 1 587 588 static const FloatFmt float16_params = { 589 FLOAT_PARAMS(5, 10) 590 }; 591 592 static const FloatFmt float16_params_ahp = { 593 FLOAT_PARAMS(5, 10), 594 .arm_althp = true 595 }; 596 597 static const FloatFmt bfloat16_params = { 598 FLOAT_PARAMS(8, 7) 599 }; 600 601 static const FloatFmt float32_params = { 602 FLOAT_PARAMS(8, 23) 603 }; 604 605 static const FloatFmt float64_params = { 606 FLOAT_PARAMS(11, 52) 607 }; 608 609 static const FloatFmt float128_params = { 610 FLOAT_PARAMS(15, 112) 611 }; 612 613 /* Unpack a float to parts, but do not canonicalize. */ 614 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw) 615 { 616 const int f_size = fmt->frac_size; 617 const int e_size = fmt->exp_size; 618 619 *r = (FloatParts64) { 620 .cls = float_class_unclassified, 621 .sign = extract64(raw, f_size + e_size, 1), 622 .exp = extract64(raw, f_size, e_size), 623 .frac = extract64(raw, 0, f_size) 624 }; 625 } 626 627 static inline void float16_unpack_raw(FloatParts64 *p, float16 f) 628 { 629 unpack_raw64(p, &float16_params, f); 630 } 631 632 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f) 633 { 634 unpack_raw64(p, &bfloat16_params, f); 635 } 636 637 static inline void float32_unpack_raw(FloatParts64 *p, float32 f) 638 { 639 unpack_raw64(p, &float32_params, f); 640 } 641 642 static inline void float64_unpack_raw(FloatParts64 *p, float64 f) 643 { 644 unpack_raw64(p, &float64_params, f); 645 } 646 647 static void float128_unpack_raw(FloatParts128 *p, float128 f) 648 { 649 const int f_size = float128_params.frac_size - 64; 650 const int e_size = float128_params.exp_size; 651 652 *p = (FloatParts128) { 653 .cls = float_class_unclassified, 654 .sign = extract64(f.high, f_size + e_size, 1), 655 .exp = extract64(f.high, f_size, e_size), 656 .frac_hi = extract64(f.high, 0, f_size), 657 .frac_lo = f.low, 658 }; 659 } 660 661 /* Pack a float from parts, but do not canonicalize. 
*/ 662 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt) 663 { 664 const int f_size = fmt->frac_size; 665 const int e_size = fmt->exp_size; 666 uint64_t ret; 667 668 ret = (uint64_t)p->sign << (f_size + e_size); 669 ret = deposit64(ret, f_size, e_size, p->exp); 670 ret = deposit64(ret, 0, f_size, p->frac); 671 return ret; 672 } 673 674 static inline float16 float16_pack_raw(const FloatParts64 *p) 675 { 676 return make_float16(pack_raw64(p, &float16_params)); 677 } 678 679 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p) 680 { 681 return pack_raw64(p, &bfloat16_params); 682 } 683 684 static inline float32 float32_pack_raw(const FloatParts64 *p) 685 { 686 return make_float32(pack_raw64(p, &float32_params)); 687 } 688 689 static inline float64 float64_pack_raw(const FloatParts64 *p) 690 { 691 return make_float64(pack_raw64(p, &float64_params)); 692 } 693 694 static float128 float128_pack_raw(const FloatParts128 *p) 695 { 696 const int f_size = float128_params.frac_size - 64; 697 const int e_size = float128_params.exp_size; 698 uint64_t hi; 699 700 hi = (uint64_t)p->sign << (f_size + e_size); 701 hi = deposit64(hi, f_size, e_size, p->exp); 702 hi = deposit64(hi, 0, f_size, p->frac_hi); 703 return make_float128(hi, p->frac_lo); 704 } 705 706 /*---------------------------------------------------------------------------- 707 | Functions and definitions to determine: (1) whether tininess for underflow 708 | is detected before or after rounding by default, (2) what (if anything) 709 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 710 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 711 | are propagated from function inputs to output. These details are target- 712 | specific. 
713 *----------------------------------------------------------------------------*/ 714 #include "softfloat-specialize.c.inc" 715 716 #define PARTS_GENERIC_64_128(NAME, P) \ 717 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME) 718 719 #define PARTS_GENERIC_64_128_256(NAME, P) \ 720 QEMU_GENERIC(P, (FloatParts256 *, parts256_##NAME), \ 721 (FloatParts128 *, parts128_##NAME), parts64_##NAME) 722 723 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S) 724 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S) 725 726 static void parts64_return_nan(FloatParts64 *a, float_status *s); 727 static void parts128_return_nan(FloatParts128 *a, float_status *s); 728 729 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S) 730 731 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b, 732 float_status *s); 733 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b, 734 float_status *s); 735 736 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S) 737 738 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b, 739 FloatParts64 *c, float_status *s, 740 int ab_mask, int abc_mask); 741 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a, 742 FloatParts128 *b, 743 FloatParts128 *c, 744 float_status *s, 745 int ab_mask, int abc_mask); 746 747 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \ 748 PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM) 749 750 static void parts64_canonicalize(FloatParts64 *p, float_status *status, 751 const FloatFmt *fmt); 752 static void parts128_canonicalize(FloatParts128 *p, float_status *status, 753 const FloatFmt *fmt); 754 755 #define parts_canonicalize(A, S, F) \ 756 PARTS_GENERIC_64_128(canonicalize, A)(A, S, F) 757 758 static void parts64_uncanon(FloatParts64 *p, float_status *status, 759 const FloatFmt *fmt); 760 static void parts128_uncanon(FloatParts128 
*p, float_status *status, 761 const FloatFmt *fmt); 762 763 #define parts_uncanon(A, S, F) \ 764 PARTS_GENERIC_64_128(uncanon, A)(A, S, F) 765 766 static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b); 767 static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b); 768 static void parts256_add_normal(FloatParts256 *a, FloatParts256 *b); 769 770 #define parts_add_normal(A, B) \ 771 PARTS_GENERIC_64_128_256(add_normal, A)(A, B) 772 773 static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b); 774 static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b); 775 static bool parts256_sub_normal(FloatParts256 *a, FloatParts256 *b); 776 777 #define parts_sub_normal(A, B) \ 778 PARTS_GENERIC_64_128_256(sub_normal, A)(A, B) 779 780 static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b, 781 float_status *s, bool subtract); 782 static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b, 783 float_status *s, bool subtract); 784 785 #define parts_addsub(A, B, S, Z) \ 786 PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z) 787 788 static FloatParts64 *parts64_mul(FloatParts64 *a, FloatParts64 *b, 789 float_status *s); 790 static FloatParts128 *parts128_mul(FloatParts128 *a, FloatParts128 *b, 791 float_status *s); 792 793 #define parts_mul(A, B, S) \ 794 PARTS_GENERIC_64_128(mul, A)(A, B, S) 795 796 static FloatParts64 *parts64_muladd(FloatParts64 *a, FloatParts64 *b, 797 FloatParts64 *c, int flags, 798 float_status *s); 799 static FloatParts128 *parts128_muladd(FloatParts128 *a, FloatParts128 *b, 800 FloatParts128 *c, int flags, 801 float_status *s); 802 803 #define parts_muladd(A, B, C, Z, S) \ 804 PARTS_GENERIC_64_128(muladd, A)(A, B, C, Z, S) 805 806 /* 807 * Helper functions for softfloat-parts.c.inc, per-size operations. 
 */

/*
 * Type-directed dispatch for the frac helpers below.
 * NOTE(review): QEMU_GENERIC is defined elsewhere; the argument list pairs
 * FloatParts128 * with frac128_NAME and lists frac64_NAME last, presumably
 * as the default selection.
 */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* Same dispatch with an additional FloatParts256 * / frac256_NAME pairing. */
#define FRAC_GENERIC_64_128_256(NAME, P) \
    QEMU_GENERIC(P, (FloatParts256 *, frac256_##NAME), \
                 (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* r->frac = a->frac + b->frac; returns what uadd64_overflow reports. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

/*
 * Two-word add, low word first.  The same flag c is handed to both
 * uadd64_carry calls and its final value is returned.
 */
static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

/* Four-word add, lo -> lm -> hm -> hi, threading c through every step. */
static bool frac256_add(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = uadd64_carry(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = uadd64_carry(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128_256(add, R)(R, A, B)

/* r->frac = a->frac + c for a 64-bit immediate c. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

/*
 * Add the immediate c to the low word.  c is then reused to hold the
 * overflow result of that add, which is in turn added to the high word.
 */
static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

/* Set every fraction bit (-1 converts to the all-ones uint64_t). */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

/* Three-way unsigned compare: 0 if equal, -1 if a < b, otherwise 1. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

/* Same contract; the low words only decide when the high words are equal. */
static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

/* Zero the fraction. */
static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

/* True when the fraction is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

/* Widening multiply: 64 x 64 -> 128 bits, stored in r->frac_hi:frac_lo. */
static void frac64_mulw(FloatParts128 *r, FloatParts64 *a, FloatParts64 *b)
{
    mulu64(&r->frac_lo, &r->frac_hi, a->frac, b->frac);
}

/* Widening multiply: 128 x 128 -> 256 bits across the four words of r. */
static void frac128_mulw(FloatParts256 *r, FloatParts128 *a, FloatParts128 *b)
{
    mul128To256(a->frac_hi, a->frac_lo, b->frac_hi, b->frac_lo,
                &r->frac_hi, &r->frac_hm, &r->frac_lm, &r->frac_lo);
}

/* Note: dispatches on the operand A, not on the (wider) result R. */
#define frac_mulw(R, A, B)  FRAC_GENERIC_64_128(mulw, A)(R, A, B)

/* In-place negation modulo 2^64. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

/* In-place 0 - a, low word first, with the borrow flag c shared by both steps. */
static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

static void frac256_neg(FloatParts256 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_lm = usub64_borrow(0, a->frac_lm, &c);
    a->frac_hm = usub64_borrow(0, a->frac_hm, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128_256(neg, A)(A)

/*
 * Shift the fraction left by clz64(frac) and return that shift count.
 * A zero fraction is left untouched and 64 is returned.
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

/*
 * 128-bit variant; returns the total left shift applied, or 128 for an
 * all-zero fraction.
 */
static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        /* shl == 0 is skipped: shr would be 64, an invalid shift count. */
        if (shl) {
            int shr = 64 - shl;
            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
            a->frac_lo = (a->frac_lo << shl);
        }
        return shl;
    } else if (a->frac_lo) {
        /* High word empty: the low word moves up a whole word plus shl. */
        int shl = clz64(a->frac_lo);
        a->frac_hi = (a->frac_lo << shl);
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

/*
 * 256-bit variant.  First moves whole 64-bit words up until a0 is
 * non-zero (ret counts 64 per word), then shifts by the remaining
 * clz64(a0) bits.  Returns the total shift, or 256 for an all-zero input.
 */
static int frac256_normalize(FloatParts256 *a)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    int ret, shl, shr;

    if (likely(a0)) {
        shl = clz64(a0);
        if (shl == 0) {
            /* Top bit already set: nothing changes, nothing to store. */
            return 0;
        }
        ret = shl;
    } else {
        if (a1) {
            ret = 64;
            a0 = a1, a1 = a2, a2 = a3, a3 = 0;
        } else if (a2) {
            ret = 128;
            a0 = a2, a1 = a3, a2 = 0, a3 = 0;
        } else if (a3) {
            ret = 192;
            a0 = a3, a1 = 0, a2 = 0, a3 = 0;
        } else {
            ret = 256;
            a0 = 0, a1 = 0, a2 = 0, a3 = 0;
            goto done;
        }
        shl = clz64(a0);
        if (shl == 0) {
            /* Word move only; still has to be written back. */
            goto done;
        }
        ret += shl;
    }

    /* shl is in 1..63 here, so shr == 64 - shl is a valid shift count. */
    shr = -shl & 63;
    a0 = (a0 << shl) | (a1 >> shr);
    a1 = (a1 << shl) | (a2 >> shr);
    a2 = (a2 << shl) | (a3 >> shr);
    a3 = (a3 << shl);

 done:
    a->frac_hi = a0;
    a->frac_hm = a1;
    a->frac_lm = a2;
    a->frac_lo = a3;
    return ret;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128_256(normalize, A)(A)

/* Plain left shift of the fraction by c bits. */
static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

/* Plain right shift of the fraction by c bits. */
static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

/* Right shift via the shift*RightJamming helpers (defined elsewhere). */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

/*
 * 256-bit right shift by c.  Every bit shifted out is collected in
 * `sticky'; if any was non-zero, bit 0 of the result is set.
 */
static void frac256_shrjam(FloatParts256 *a, int c)
{
    uint64_t a0 = a->frac_hi, a1 = a->frac_hm;
    uint64_t a2 = a->frac_lm, a3 = a->frac_lo;
    uint64_t sticky = 0;
    int invc;

    if (unlikely(c == 0)) {
        return;
    } else if (likely(c < 64)) {
        /* nothing */
    } else if (likely(c < 256)) {
        /* Whole-word moves first: by two words, then by one word. */
        if (unlikely(c & 128)) {
            sticky |= a2 | a3;
            a3 = a1, a2 = a0, a1 = 0, a0 = 0;
        }
        if (unlikely(c & 64)) {
            sticky |= a3;
            a3 = a2, a2 = a1, a1 = a0, a0 = 0;
        }
        c &= 63;
        if (c == 0) {
            goto done;
        }
    } else {
        /* Shift of 256 or more: everything is shifted out. */
        sticky = a0 | a1 | a2 | a3;
        a0 = a1 = a2 = a3 = 0;
        goto done;
    }

    /* c is in 1..63 here; invc == 64 - c. */
    invc = -c & 63;
    sticky |= a3 << invc;
    a3 = (a3 >> c) | (a2 << invc);
    a2 = (a2 >> c) | (a1 << invc);
    a1 = (a1 >> c) | (a0 << invc);
    a0 = (a0 >> c);

 done:
    a->frac_lo = a3 | (sticky != 0);
    a->frac_lm = a2;
    a->frac_hm = a1;
    a->frac_hi = a0;
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128_256(shrjam, A)(A, C)

/* r->frac = a->frac - b->frac; returns what usub64_overflow reports. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

/* Two-word subtract, low word first; returns the final borrow flag. */
static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

static bool frac256_sub(FloatParts256 *r, FloatParts256 *a, FloatParts256 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_lm = usub64_borrow(a->frac_lm, b->frac_lm, &c);
    r->frac_hm = usub64_borrow(a->frac_hm, b->frac_hm, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128_256(sub, R)(R, A, B)

/*
 * Narrow 128 -> 64 bits: keep the high word and set bit 0 if any bit of
 * the discarded low word was set.
 */
static void frac64_truncjam(FloatParts64 *r, FloatParts128 *a)
{
    r->frac = a->frac_hi | (a->frac_lo != 0);
}

/* Narrow 256 -> 128 bits; the two low words collapse into bit 0. */
static void frac128_truncjam(FloatParts128 *r, FloatParts256 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_lo = a->frac_hm | ((a->frac_lm | a->frac_lo) != 0);
}

#define frac_truncjam(R, A)  FRAC_GENERIC_64_128(truncjam, R)(R, A)

/* Widen 64 -> 128 bits: the value goes in the high word, low word is 0. */
static void frac64_widen(FloatParts128 *r, FloatParts64 *a)
{
    r->frac_hi = a->frac;
    r->frac_lo = 0;
}

/* Widen 128 -> 256 bits: value in the two high words, low words are 0. */
static void frac128_widen(FloatParts256 *r, FloatParts128 *a)
{
    r->frac_hi = a->frac_hi;
    r->frac_hm = a->frac_lo;
    r->frac_lm = 0;
    r->frac_lo = 0;
}

/* Note: dispatches on the narrower source operand B. */
#define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)

/*
 * The .c.inc files below are included once per width.  N is the width
 * being instantiated and W the next wider one; partsN()/FloatPartsN/
 * FloatPartsW paste those numbers into identifiers.
 */
#define partsN(NAME) glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN glue(FloatParts,N)
#define FloatPartsW glue(FloatParts,W)

#define N 64
#define W 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
#define N 128
#define W 256

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef N
#undef W
/* Only the add/sub part is instantiated for N == 256; W is not defined. */
#define N 256

#include "softfloat-parts-addsub.c.inc"

#undef N
#undef W
#undef partsN
#undef FloatPartsN
#undef FloatPartsW

/*
 * Pack/unpack routines with a specific FloatFmt.
1191 */ 1192 1193 static void float16a_unpack_canonical(FloatParts64 *p, float16 f, 1194 float_status *s, const FloatFmt *params) 1195 { 1196 float16_unpack_raw(p, f); 1197 parts_canonicalize(p, s, params); 1198 } 1199 1200 static void float16_unpack_canonical(FloatParts64 *p, float16 f, 1201 float_status *s) 1202 { 1203 float16a_unpack_canonical(p, f, s, &float16_params); 1204 } 1205 1206 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f, 1207 float_status *s) 1208 { 1209 bfloat16_unpack_raw(p, f); 1210 parts_canonicalize(p, s, &bfloat16_params); 1211 } 1212 1213 static float16 float16a_round_pack_canonical(FloatParts64 *p, 1214 float_status *s, 1215 const FloatFmt *params) 1216 { 1217 parts_uncanon(p, s, params); 1218 return float16_pack_raw(p); 1219 } 1220 1221 static float16 float16_round_pack_canonical(FloatParts64 *p, 1222 float_status *s) 1223 { 1224 return float16a_round_pack_canonical(p, s, &float16_params); 1225 } 1226 1227 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p, 1228 float_status *s) 1229 { 1230 parts_uncanon(p, s, &bfloat16_params); 1231 return bfloat16_pack_raw(p); 1232 } 1233 1234 static void float32_unpack_canonical(FloatParts64 *p, float32 f, 1235 float_status *s) 1236 { 1237 float32_unpack_raw(p, f); 1238 parts_canonicalize(p, s, &float32_params); 1239 } 1240 1241 static float32 float32_round_pack_canonical(FloatParts64 *p, 1242 float_status *s) 1243 { 1244 parts_uncanon(p, s, &float32_params); 1245 return float32_pack_raw(p); 1246 } 1247 1248 static void float64_unpack_canonical(FloatParts64 *p, float64 f, 1249 float_status *s) 1250 { 1251 float64_unpack_raw(p, f); 1252 parts_canonicalize(p, s, &float64_params); 1253 } 1254 1255 static float64 float64_round_pack_canonical(FloatParts64 *p, 1256 float_status *s) 1257 { 1258 parts_uncanon(p, s, &float64_params); 1259 return float64_pack_raw(p); 1260 } 1261 1262 static void float128_unpack_canonical(FloatParts128 *p, float128 f, 1263 float_status *s) 1264 { 1265 
float128_unpack_raw(p, f); 1266 parts_canonicalize(p, s, &float128_params); 1267 } 1268 1269 static float128 float128_round_pack_canonical(FloatParts128 *p, 1270 float_status *s) 1271 { 1272 parts_uncanon(p, s, &float128_params); 1273 return float128_pack_raw(p); 1274 } 1275 1276 /* 1277 * Addition and subtraction 1278 */ 1279 1280 static float16 QEMU_FLATTEN 1281 float16_addsub(float16 a, float16 b, float_status *status, bool subtract) 1282 { 1283 FloatParts64 pa, pb, *pr; 1284 1285 float16_unpack_canonical(&pa, a, status); 1286 float16_unpack_canonical(&pb, b, status); 1287 pr = parts_addsub(&pa, &pb, status, subtract); 1288 1289 return float16_round_pack_canonical(pr, status); 1290 } 1291 1292 float16 float16_add(float16 a, float16 b, float_status *status) 1293 { 1294 return float16_addsub(a, b, status, false); 1295 } 1296 1297 float16 float16_sub(float16 a, float16 b, float_status *status) 1298 { 1299 return float16_addsub(a, b, status, true); 1300 } 1301 1302 static float32 QEMU_SOFTFLOAT_ATTR 1303 soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract) 1304 { 1305 FloatParts64 pa, pb, *pr; 1306 1307 float32_unpack_canonical(&pa, a, status); 1308 float32_unpack_canonical(&pb, b, status); 1309 pr = parts_addsub(&pa, &pb, status, subtract); 1310 1311 return float32_round_pack_canonical(pr, status); 1312 } 1313 1314 static float32 soft_f32_add(float32 a, float32 b, float_status *status) 1315 { 1316 return soft_f32_addsub(a, b, status, false); 1317 } 1318 1319 static float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1320 { 1321 return soft_f32_addsub(a, b, status, true); 1322 } 1323 1324 static float64 QEMU_SOFTFLOAT_ATTR 1325 soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract) 1326 { 1327 FloatParts64 pa, pb, *pr; 1328 1329 float64_unpack_canonical(&pa, a, status); 1330 float64_unpack_canonical(&pb, b, status); 1331 pr = parts_addsub(&pa, &pb, status, subtract); 1332 1333 return 
float64_round_pack_canonical(pr, status); 1334 } 1335 1336 static float64 soft_f64_add(float64 a, float64 b, float_status *status) 1337 { 1338 return soft_f64_addsub(a, b, status, false); 1339 } 1340 1341 static float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1342 { 1343 return soft_f64_addsub(a, b, status, true); 1344 } 1345 1346 static float hard_f32_add(float a, float b) 1347 { 1348 return a + b; 1349 } 1350 1351 static float hard_f32_sub(float a, float b) 1352 { 1353 return a - b; 1354 } 1355 1356 static double hard_f64_add(double a, double b) 1357 { 1358 return a + b; 1359 } 1360 1361 static double hard_f64_sub(double a, double b) 1362 { 1363 return a - b; 1364 } 1365 1366 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1367 { 1368 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1369 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1370 } 1371 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1372 } 1373 1374 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1375 { 1376 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1377 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1378 } else { 1379 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1380 } 1381 } 1382 1383 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1384 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1385 { 1386 return float32_gen2(a, b, s, hard, soft, 1387 f32_is_zon2, f32_addsubmul_post); 1388 } 1389 1390 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1391 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1392 { 1393 return float64_gen2(a, b, s, hard, soft, 1394 f64_is_zon2, f64_addsubmul_post); 1395 } 1396 1397 float32 QEMU_FLATTEN 1398 float32_add(float32 a, float32 b, float_status *s) 1399 { 1400 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1401 } 1402 1403 float32 QEMU_FLATTEN 1404 float32_sub(float32 a, float32 b, float_status *s) 1405 { 1406 return float32_addsub(a, b, s, 
hard_f32_sub, soft_f32_sub); 1407 } 1408 1409 float64 QEMU_FLATTEN 1410 float64_add(float64 a, float64 b, float_status *s) 1411 { 1412 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1413 } 1414 1415 float64 QEMU_FLATTEN 1416 float64_sub(float64 a, float64 b, float_status *s) 1417 { 1418 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1419 } 1420 1421 static bfloat16 QEMU_FLATTEN 1422 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract) 1423 { 1424 FloatParts64 pa, pb, *pr; 1425 1426 bfloat16_unpack_canonical(&pa, a, status); 1427 bfloat16_unpack_canonical(&pb, b, status); 1428 pr = parts_addsub(&pa, &pb, status, subtract); 1429 1430 return bfloat16_round_pack_canonical(pr, status); 1431 } 1432 1433 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status) 1434 { 1435 return bfloat16_addsub(a, b, status, false); 1436 } 1437 1438 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status) 1439 { 1440 return bfloat16_addsub(a, b, status, true); 1441 } 1442 1443 static float128 QEMU_FLATTEN 1444 float128_addsub(float128 a, float128 b, float_status *status, bool subtract) 1445 { 1446 FloatParts128 pa, pb, *pr; 1447 1448 float128_unpack_canonical(&pa, a, status); 1449 float128_unpack_canonical(&pb, b, status); 1450 pr = parts_addsub(&pa, &pb, status, subtract); 1451 1452 return float128_round_pack_canonical(pr, status); 1453 } 1454 1455 float128 float128_add(float128 a, float128 b, float_status *status) 1456 { 1457 return float128_addsub(a, b, status, false); 1458 } 1459 1460 float128 float128_sub(float128 a, float128 b, float_status *status) 1461 { 1462 return float128_addsub(a, b, status, true); 1463 } 1464 1465 /* 1466 * Multiplication 1467 */ 1468 1469 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) 1470 { 1471 FloatParts64 pa, pb, *pr; 1472 1473 float16_unpack_canonical(&pa, a, status); 1474 float16_unpack_canonical(&pb, b, status); 1475 pr = parts_mul(&pa, &pb, status); 
1476 1477 return float16_round_pack_canonical(pr, status); 1478 } 1479 1480 static float32 QEMU_SOFTFLOAT_ATTR 1481 soft_f32_mul(float32 a, float32 b, float_status *status) 1482 { 1483 FloatParts64 pa, pb, *pr; 1484 1485 float32_unpack_canonical(&pa, a, status); 1486 float32_unpack_canonical(&pb, b, status); 1487 pr = parts_mul(&pa, &pb, status); 1488 1489 return float32_round_pack_canonical(pr, status); 1490 } 1491 1492 static float64 QEMU_SOFTFLOAT_ATTR 1493 soft_f64_mul(float64 a, float64 b, float_status *status) 1494 { 1495 FloatParts64 pa, pb, *pr; 1496 1497 float64_unpack_canonical(&pa, a, status); 1498 float64_unpack_canonical(&pb, b, status); 1499 pr = parts_mul(&pa, &pb, status); 1500 1501 return float64_round_pack_canonical(pr, status); 1502 } 1503 1504 static float hard_f32_mul(float a, float b) 1505 { 1506 return a * b; 1507 } 1508 1509 static double hard_f64_mul(double a, double b) 1510 { 1511 return a * b; 1512 } 1513 1514 float32 QEMU_FLATTEN 1515 float32_mul(float32 a, float32 b, float_status *s) 1516 { 1517 return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, 1518 f32_is_zon2, f32_addsubmul_post); 1519 } 1520 1521 float64 QEMU_FLATTEN 1522 float64_mul(float64 a, float64 b, float_status *s) 1523 { 1524 return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, 1525 f64_is_zon2, f64_addsubmul_post); 1526 } 1527 1528 bfloat16 QEMU_FLATTEN 1529 bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status) 1530 { 1531 FloatParts64 pa, pb, *pr; 1532 1533 bfloat16_unpack_canonical(&pa, a, status); 1534 bfloat16_unpack_canonical(&pb, b, status); 1535 pr = parts_mul(&pa, &pb, status); 1536 1537 return bfloat16_round_pack_canonical(pr, status); 1538 } 1539 1540 float128 QEMU_FLATTEN 1541 float128_mul(float128 a, float128 b, float_status *status) 1542 { 1543 FloatParts128 pa, pb, *pr; 1544 1545 float128_unpack_canonical(&pa, a, status); 1546 float128_unpack_canonical(&pb, b, status); 1547 pr = parts_mul(&pa, &pb, status); 1548 1549 return 
float128_round_pack_canonical(pr, status); 1550 } 1551 1552 /* 1553 * Fused multiply-add 1554 */ 1555 1556 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1557 int flags, float_status *status) 1558 { 1559 FloatParts64 pa, pb, pc, *pr; 1560 1561 float16_unpack_canonical(&pa, a, status); 1562 float16_unpack_canonical(&pb, b, status); 1563 float16_unpack_canonical(&pc, c, status); 1564 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1565 1566 return float16_round_pack_canonical(pr, status); 1567 } 1568 1569 static float32 QEMU_SOFTFLOAT_ATTR 1570 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1571 float_status *status) 1572 { 1573 FloatParts64 pa, pb, pc, *pr; 1574 1575 float32_unpack_canonical(&pa, a, status); 1576 float32_unpack_canonical(&pb, b, status); 1577 float32_unpack_canonical(&pc, c, status); 1578 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1579 1580 return float32_round_pack_canonical(pr, status); 1581 } 1582 1583 static float64 QEMU_SOFTFLOAT_ATTR 1584 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1585 float_status *status) 1586 { 1587 FloatParts64 pa, pb, pc, *pr; 1588 1589 float64_unpack_canonical(&pa, a, status); 1590 float64_unpack_canonical(&pb, b, status); 1591 float64_unpack_canonical(&pc, c, status); 1592 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1593 1594 return float64_round_pack_canonical(pr, status); 1595 } 1596 1597 static bool force_soft_fma; 1598 1599 float32 QEMU_FLATTEN 1600 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1601 { 1602 union_float32 ua, ub, uc, ur; 1603 1604 ua.s = xa; 1605 ub.s = xb; 1606 uc.s = xc; 1607 1608 if (unlikely(!can_use_fpu(s))) { 1609 goto soft; 1610 } 1611 if (unlikely(flags & float_muladd_halve_result)) { 1612 goto soft; 1613 } 1614 1615 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1616 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1617 goto soft; 1618 } 1619 1620 if (unlikely(force_soft_fma)) { 1621 goto soft; 1622 } 
1623 1624 /* 1625 * When (a || b) == 0, there's no need to check for under/over flow, 1626 * since we know the addend is (normal || 0) and the product is 0. 1627 */ 1628 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1629 union_float32 up; 1630 bool prod_sign; 1631 1632 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1633 prod_sign ^= !!(flags & float_muladd_negate_product); 1634 up.s = float32_set_sign(float32_zero, prod_sign); 1635 1636 if (flags & float_muladd_negate_c) { 1637 uc.h = -uc.h; 1638 } 1639 ur.h = up.h + uc.h; 1640 } else { 1641 union_float32 ua_orig = ua; 1642 union_float32 uc_orig = uc; 1643 1644 if (flags & float_muladd_negate_product) { 1645 ua.h = -ua.h; 1646 } 1647 if (flags & float_muladd_negate_c) { 1648 uc.h = -uc.h; 1649 } 1650 1651 ur.h = fmaf(ua.h, ub.h, uc.h); 1652 1653 if (unlikely(f32_is_inf(ur))) { 1654 float_raise(float_flag_overflow, s); 1655 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1656 ua = ua_orig; 1657 uc = uc_orig; 1658 goto soft; 1659 } 1660 } 1661 if (flags & float_muladd_negate_result) { 1662 return float32_chs(ur.s); 1663 } 1664 return ur.s; 1665 1666 soft: 1667 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1668 } 1669 1670 float64 QEMU_FLATTEN 1671 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1672 { 1673 union_float64 ua, ub, uc, ur; 1674 1675 ua.s = xa; 1676 ub.s = xb; 1677 uc.s = xc; 1678 1679 if (unlikely(!can_use_fpu(s))) { 1680 goto soft; 1681 } 1682 if (unlikely(flags & float_muladd_halve_result)) { 1683 goto soft; 1684 } 1685 1686 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1687 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1688 goto soft; 1689 } 1690 1691 if (unlikely(force_soft_fma)) { 1692 goto soft; 1693 } 1694 1695 /* 1696 * When (a || b) == 0, there's no need to check for under/over flow, 1697 * since we know the addend is (normal || 0) and the product is 0. 
1698 */ 1699 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1700 union_float64 up; 1701 bool prod_sign; 1702 1703 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1704 prod_sign ^= !!(flags & float_muladd_negate_product); 1705 up.s = float64_set_sign(float64_zero, prod_sign); 1706 1707 if (flags & float_muladd_negate_c) { 1708 uc.h = -uc.h; 1709 } 1710 ur.h = up.h + uc.h; 1711 } else { 1712 union_float64 ua_orig = ua; 1713 union_float64 uc_orig = uc; 1714 1715 if (flags & float_muladd_negate_product) { 1716 ua.h = -ua.h; 1717 } 1718 if (flags & float_muladd_negate_c) { 1719 uc.h = -uc.h; 1720 } 1721 1722 ur.h = fma(ua.h, ub.h, uc.h); 1723 1724 if (unlikely(f64_is_inf(ur))) { 1725 float_raise(float_flag_overflow, s); 1726 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1727 ua = ua_orig; 1728 uc = uc_orig; 1729 goto soft; 1730 } 1731 } 1732 if (flags & float_muladd_negate_result) { 1733 return float64_chs(ur.s); 1734 } 1735 return ur.s; 1736 1737 soft: 1738 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1739 } 1740 1741 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c, 1742 int flags, float_status *status) 1743 { 1744 FloatParts64 pa, pb, pc, *pr; 1745 1746 bfloat16_unpack_canonical(&pa, a, status); 1747 bfloat16_unpack_canonical(&pb, b, status); 1748 bfloat16_unpack_canonical(&pc, c, status); 1749 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1750 1751 return bfloat16_round_pack_canonical(pr, status); 1752 } 1753 1754 float128 QEMU_FLATTEN float128_muladd(float128 a, float128 b, float128 c, 1755 int flags, float_status *status) 1756 { 1757 FloatParts128 pa, pb, pc, *pr; 1758 1759 float128_unpack_canonical(&pa, a, status); 1760 float128_unpack_canonical(&pb, b, status); 1761 float128_unpack_canonical(&pc, c, status); 1762 pr = parts_muladd(&pa, &pb, &pc, flags, status); 1763 1764 return float128_round_pack_canonical(pr, status); 1765 } 1766 1767 /* 1768 * Returns the result of dividing the floating-point value `a' by the 
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        /* n1:n0 is the 128-bit numerator; q the quotient, r the remainder. */
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* Every class combination has been handled above. */
    g_assert_not_reached();
}

/* float16 division: unpack, div_floats, round and pack. */
float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat float32 division, used as the `soft' callback of float32_div. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat float64 division, used as the `soft' callback of float64_div. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU division, used as the `hard' callbacks. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * True when the dividend is zero or normal and the divisor is normal.
 * NOTE(review): passed to float32_gen2 ahead of f32_div_post, presumably
 * as its pre-operation check -- confirm against float32_gen2.
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
            fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

/* float64 version of f32_div_pre. */
static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
            fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* True when the dividend `a' is non-zero; `b' is not examined. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

/* float64 version of f32_div_post. */
static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
1935 */ 1936 1937 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status) 1938 { 1939 FloatParts64 pa, pb, pr; 1940 1941 bfloat16_unpack_canonical(&pa, a, status); 1942 bfloat16_unpack_canonical(&pb, b, status); 1943 pr = div_floats(pa, pb, status); 1944 1945 return bfloat16_round_pack_canonical(&pr, status); 1946 } 1947 1948 /* 1949 * Float to Float conversions 1950 * 1951 * Returns the result of converting one float format to another. The 1952 * conversion is performed according to the IEC/IEEE Standard for 1953 * Binary Floating-Point Arithmetic. 1954 * 1955 * The float_to_float helper only needs to take care of raising 1956 * invalid exceptions and handling the conversion on NaNs. 1957 */ 1958 1959 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf, 1960 float_status *s) 1961 { 1962 if (dstf->arm_althp) { 1963 switch (a.cls) { 1964 case float_class_qnan: 1965 case float_class_snan: 1966 /* There is no NaN in the destination format. Raise Invalid 1967 * and return a zero with the sign of the input NaN. 1968 */ 1969 float_raise(float_flag_invalid, s); 1970 a.cls = float_class_zero; 1971 a.frac = 0; 1972 a.exp = 0; 1973 break; 1974 1975 case float_class_inf: 1976 /* There is no Inf in the destination format. Raise Invalid 1977 * and return the maximum normal with the correct sign. 1978 */ 1979 float_raise(float_flag_invalid, s); 1980 a.cls = float_class_normal; 1981 a.exp = dstf->exp_max; 1982 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1983 break; 1984 1985 default: 1986 break; 1987 } 1988 } else if (is_nan(a.cls)) { 1989 parts_return_nan(&a, s); 1990 } 1991 return a; 1992 } 1993 1994 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1995 { 1996 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1997 FloatParts64 pa, pr; 1998 1999 float16a_unpack_canonical(&pa, a, s, fmt16); 2000 pr = float_to_float(pa, &float32_params, s); 2001 return float32_round_pack_canonical(&pr, s); 2002 } 2003 2004 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 2005 { 2006 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2007 FloatParts64 pa, pr; 2008 2009 float16a_unpack_canonical(&pa, a, s, fmt16); 2010 pr = float_to_float(pa, &float64_params, s); 2011 return float64_round_pack_canonical(&pr, s); 2012 } 2013 2014 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 2015 { 2016 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 2017 FloatParts64 pa, pr; 2018 2019 float32_unpack_canonical(&pa, a, s); 2020 pr = float_to_float(pa, fmt16, s); 2021 return float16a_round_pack_canonical(&pr, s, fmt16); 2022 } 2023 2024 static float64 QEMU_SOFTFLOAT_ATTR 2025 soft_float32_to_float64(float32 a, float_status *s) 2026 { 2027 FloatParts64 pa, pr; 2028 2029 float32_unpack_canonical(&pa, a, s); 2030 pr = float_to_float(pa, &float64_params, s); 2031 return float64_round_pack_canonical(&pr, s); 2032 } 2033 2034 float64 float32_to_float64(float32 a, float_status *s) 2035 { 2036 if (likely(float32_is_normal(a))) { 2037 /* Widening conversion can never produce inexact results. */ 2038 union_float32 uf; 2039 union_float64 ud; 2040 uf.s = a; 2041 ud.h = uf.h; 2042 return ud.s; 2043 } else if (float32_is_zero(a)) { 2044 return float64_set_sign(float64_zero, float32_is_neg(a)); 2045 } else { 2046 return soft_float32_to_float64(a, s); 2047 } 2048 } 2049 2050 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 2051 { 2052 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 2053 FloatParts64 pa, pr; 2054 2055 float64_unpack_canonical(&pa, a, s); 2056 pr = float_to_float(pa, fmt16, s); 2057 return float16a_round_pack_canonical(&pr, s, fmt16); 2058 } 2059 2060 float32 float64_to_float32(float64 a, float_status *s) 2061 { 2062 FloatParts64 pa, pr; 2063 2064 float64_unpack_canonical(&pa, a, s); 2065 pr = float_to_float(pa, &float32_params, s); 2066 return float32_round_pack_canonical(&pr, s); 2067 } 2068 2069 float32 bfloat16_to_float32(bfloat16 a, float_status *s) 2070 { 2071 FloatParts64 pa, pr; 2072 2073 bfloat16_unpack_canonical(&pa, a, s); 2074 pr = float_to_float(pa, &float32_params, s); 2075 return float32_round_pack_canonical(&pr, s); 2076 } 2077 2078 float64 bfloat16_to_float64(bfloat16 a, float_status *s) 2079 { 2080 FloatParts64 pa, pr; 2081 2082 bfloat16_unpack_canonical(&pa, a, s); 2083 pr = float_to_float(pa, &float64_params, s); 2084 return float64_round_pack_canonical(&pr, s); 2085 } 2086 2087 bfloat16 float32_to_bfloat16(float32 a, float_status *s) 2088 { 2089 FloatParts64 pa, pr; 2090 2091 float32_unpack_canonical(&pa, a, s); 2092 pr = float_to_float(pa, &bfloat16_params, s); 2093 return bfloat16_round_pack_canonical(&pr, s); 2094 } 2095 2096 bfloat16 float64_to_bfloat16(float64 a, float_status *s) 2097 { 2098 FloatParts64 pa, pr; 2099 2100 float64_unpack_canonical(&pa, a, s); 2101 pr = float_to_float(pa, &bfloat16_params, s); 2102 return bfloat16_round_pack_canonical(&pr, s); 2103 } 2104 2105 /* 2106 * Rounds the floating-point value `a' to an integer, and returns the 2107 * result as a floating-point value. The operation is performed 2108 * according to the IEC/IEEE Standard for Binary Floating-Point 2109 * Arithmetic. 
 */

static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp scale to +/-0x10000 before adding it to the exponent. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            /* `one' selects between the two possible results: 1 or 0. */
            bool one;
            /* all fractional */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* exp == -1 with only the implicit bit set is the tie 0.5 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /*
             * frac_lsb:      fraction bit with integer weight 1
             * frac_lsbm1:    the bit just below it (weight 1/2)
             * rnd_even_mask: frac_lsb and every bit below it
             * rnd_mask:      every bit below frac_lsb (the fractional part)
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* No increment for an exact tie whose integer bit is even. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            /* Only values with fractional bits set are touched. */
            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Overflow of the add: renormalize and bump exp. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/*
 * The wrappers below round with the status' current rounding mode
 * (s->float_rounding_mode) and a scale of 0.
 */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
The conversion is performed 2258 * according to the IEC/IEEE Standard for Binary Floating-Point 2259 * Arithmetic---which means in particular that the conversion is 2260 * rounded according to the current rounding mode. If `a' is a NaN, 2261 * the largest positive integer is returned. Otherwise, if the 2262 * conversion overflows, the largest integer with the same sign as `a' 2263 * is returned. 2264 */ 2265 2266 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode, 2267 int scale, int64_t min, int64_t max, 2268 float_status *s) 2269 { 2270 uint64_t r; 2271 int orig_flags = get_float_exception_flags(s); 2272 FloatParts64 p = round_to_int(in, rmode, scale, s); 2273 2274 switch (p.cls) { 2275 case float_class_snan: 2276 case float_class_qnan: 2277 s->float_exception_flags = orig_flags | float_flag_invalid; 2278 return max; 2279 case float_class_inf: 2280 s->float_exception_flags = orig_flags | float_flag_invalid; 2281 return p.sign ? min : max; 2282 case float_class_zero: 2283 return 0; 2284 case float_class_normal: 2285 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2286 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2287 } else { 2288 r = UINT64_MAX; 2289 } 2290 if (p.sign) { 2291 if (r <= -(uint64_t) min) { 2292 return -r; 2293 } else { 2294 s->float_exception_flags = orig_flags | float_flag_invalid; 2295 return min; 2296 } 2297 } else { 2298 if (r <= max) { 2299 return r; 2300 } else { 2301 s->float_exception_flags = orig_flags | float_flag_invalid; 2302 return max; 2303 } 2304 } 2305 default: 2306 g_assert_not_reached(); 2307 } 2308 } 2309 2310 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2311 float_status *s) 2312 { 2313 FloatParts64 p; 2314 2315 float16_unpack_canonical(&p, a, s); 2316 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s); 2317 } 2318 2319 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2320 float_status *s) 2321 { 2322 FloatParts64 p; 2323 2324 
float16_unpack_canonical(&p, a, s); 2325 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2326 } 2327 2328 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2329 float_status *s) 2330 { 2331 FloatParts64 p; 2332 2333 float16_unpack_canonical(&p, a, s); 2334 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2335 } 2336 2337 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2338 float_status *s) 2339 { 2340 FloatParts64 p; 2341 2342 float16_unpack_canonical(&p, a, s); 2343 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2344 } 2345 2346 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2347 float_status *s) 2348 { 2349 FloatParts64 p; 2350 2351 float32_unpack_canonical(&p, a, s); 2352 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2353 } 2354 2355 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2356 float_status *s) 2357 { 2358 FloatParts64 p; 2359 2360 float32_unpack_canonical(&p, a, s); 2361 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2362 } 2363 2364 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2365 float_status *s) 2366 { 2367 FloatParts64 p; 2368 2369 float32_unpack_canonical(&p, a, s); 2370 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2371 } 2372 2373 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2374 float_status *s) 2375 { 2376 FloatParts64 p; 2377 2378 float64_unpack_canonical(&p, a, s); 2379 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2380 } 2381 2382 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2383 float_status *s) 2384 { 2385 FloatParts64 p; 2386 2387 float64_unpack_canonical(&p, a, s); 2388 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2389 } 2390 2391 int64_t 
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2392 float_status *s) 2393 { 2394 FloatParts64 p; 2395 2396 float64_unpack_canonical(&p, a, s); 2397 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2398 } 2399 2400 int8_t float16_to_int8(float16 a, float_status *s) 2401 { 2402 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2403 } 2404 2405 int16_t float16_to_int16(float16 a, float_status *s) 2406 { 2407 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2408 } 2409 2410 int32_t float16_to_int32(float16 a, float_status *s) 2411 { 2412 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2413 } 2414 2415 int64_t float16_to_int64(float16 a, float_status *s) 2416 { 2417 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2418 } 2419 2420 int16_t float32_to_int16(float32 a, float_status *s) 2421 { 2422 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2423 } 2424 2425 int32_t float32_to_int32(float32 a, float_status *s) 2426 { 2427 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2428 } 2429 2430 int64_t float32_to_int64(float32 a, float_status *s) 2431 { 2432 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2433 } 2434 2435 int16_t float64_to_int16(float64 a, float_status *s) 2436 { 2437 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2438 } 2439 2440 int32_t float64_to_int32(float64 a, float_status *s) 2441 { 2442 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2443 } 2444 2445 int64_t float64_to_int64(float64 a, float_status *s) 2446 { 2447 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2448 } 2449 2450 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2451 { 2452 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2453 } 2454 2455 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2456 { 2457 return float16_to_int32_scalbn(a, 
float_round_to_zero, 0, s); 2458 } 2459 2460 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2461 { 2462 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2463 } 2464 2465 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2466 { 2467 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2468 } 2469 2470 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2471 { 2472 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2473 } 2474 2475 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2476 { 2477 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2478 } 2479 2480 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2481 { 2482 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2483 } 2484 2485 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2486 { 2487 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2488 } 2489 2490 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2491 { 2492 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2493 } 2494 2495 /* 2496 * Returns the result of converting the floating-point value `a' to 2497 * the two's complement integer format. 
2498 */ 2499 2500 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2501 float_status *s) 2502 { 2503 FloatParts64 p; 2504 2505 bfloat16_unpack_canonical(&p, a, s); 2506 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2507 } 2508 2509 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2510 float_status *s) 2511 { 2512 FloatParts64 p; 2513 2514 bfloat16_unpack_canonical(&p, a, s); 2515 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2516 } 2517 2518 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2519 float_status *s) 2520 { 2521 FloatParts64 p; 2522 2523 bfloat16_unpack_canonical(&p, a, s); 2524 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2525 } 2526 2527 int16_t bfloat16_to_int16(bfloat16 a, float_status *s) 2528 { 2529 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2530 } 2531 2532 int32_t bfloat16_to_int32(bfloat16 a, float_status *s) 2533 { 2534 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2535 } 2536 2537 int64_t bfloat16_to_int64(bfloat16 a, float_status *s) 2538 { 2539 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2540 } 2541 2542 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s) 2543 { 2544 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2545 } 2546 2547 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s) 2548 { 2549 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2550 } 2551 2552 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s) 2553 { 2554 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2555 } 2556 2557 /* 2558 * Returns the result of converting the floating-point value `a' to 2559 * the unsigned integer format. 
The conversion is performed according 2560 * to the IEC/IEEE Standard for Binary Floating-Point 2561 * Arithmetic---which means in particular that the conversion is 2562 * rounded according to the current rounding mode. If `a' is a NaN, 2563 * the largest unsigned integer is returned. Otherwise, if the 2564 * conversion overflows, the largest unsigned integer is returned. If 2565 * the 'a' is negative, the result is rounded and zero is returned; 2566 * values that do not round to zero will raise the inexact exception 2567 * flag. 2568 */ 2569 2570 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode, 2571 int scale, uint64_t max, 2572 float_status *s) 2573 { 2574 int orig_flags = get_float_exception_flags(s); 2575 FloatParts64 p = round_to_int(in, rmode, scale, s); 2576 uint64_t r; 2577 2578 switch (p.cls) { 2579 case float_class_snan: 2580 case float_class_qnan: 2581 s->float_exception_flags = orig_flags | float_flag_invalid; 2582 return max; 2583 case float_class_inf: 2584 s->float_exception_flags = orig_flags | float_flag_invalid; 2585 return p.sign ? 0 : max; 2586 case float_class_zero: 2587 return 0; 2588 case float_class_normal: 2589 if (p.sign) { 2590 s->float_exception_flags = orig_flags | float_flag_invalid; 2591 return 0; 2592 } 2593 2594 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2595 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2596 } else { 2597 s->float_exception_flags = orig_flags | float_flag_invalid; 2598 return max; 2599 } 2600 2601 /* For uint64 this will never trip, but if p.exp is too large 2602 * to shift a decomposed fraction we shall have exited via the 2603 * 3rd leg above. 
2604 */ 2605 if (r > max) { 2606 s->float_exception_flags = orig_flags | float_flag_invalid; 2607 return max; 2608 } 2609 return r; 2610 default: 2611 g_assert_not_reached(); 2612 } 2613 } 2614 2615 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2616 float_status *s) 2617 { 2618 FloatParts64 p; 2619 2620 float16_unpack_canonical(&p, a, s); 2621 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s); 2622 } 2623 2624 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2625 float_status *s) 2626 { 2627 FloatParts64 p; 2628 2629 float16_unpack_canonical(&p, a, s); 2630 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2631 } 2632 2633 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2634 float_status *s) 2635 { 2636 FloatParts64 p; 2637 2638 float16_unpack_canonical(&p, a, s); 2639 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2640 } 2641 2642 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2643 float_status *s) 2644 { 2645 FloatParts64 p; 2646 2647 float16_unpack_canonical(&p, a, s); 2648 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2649 } 2650 2651 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2652 float_status *s) 2653 { 2654 FloatParts64 p; 2655 2656 float32_unpack_canonical(&p, a, s); 2657 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2658 } 2659 2660 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2661 float_status *s) 2662 { 2663 FloatParts64 p; 2664 2665 float32_unpack_canonical(&p, a, s); 2666 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2667 } 2668 2669 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2670 float_status *s) 2671 { 2672 FloatParts64 p; 2673 2674 float32_unpack_canonical(&p, a, s); 2675 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2676 } 
2677 2678 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2679 float_status *s) 2680 { 2681 FloatParts64 p; 2682 2683 float64_unpack_canonical(&p, a, s); 2684 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2685 } 2686 2687 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2688 float_status *s) 2689 { 2690 FloatParts64 p; 2691 2692 float64_unpack_canonical(&p, a, s); 2693 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2694 } 2695 2696 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2697 float_status *s) 2698 { 2699 FloatParts64 p; 2700 2701 float64_unpack_canonical(&p, a, s); 2702 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2703 } 2704 2705 uint8_t float16_to_uint8(float16 a, float_status *s) 2706 { 2707 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2708 } 2709 2710 uint16_t float16_to_uint16(float16 a, float_status *s) 2711 { 2712 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2713 } 2714 2715 uint32_t float16_to_uint32(float16 a, float_status *s) 2716 { 2717 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2718 } 2719 2720 uint64_t float16_to_uint64(float16 a, float_status *s) 2721 { 2722 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2723 } 2724 2725 uint16_t float32_to_uint16(float32 a, float_status *s) 2726 { 2727 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2728 } 2729 2730 uint32_t float32_to_uint32(float32 a, float_status *s) 2731 { 2732 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2733 } 2734 2735 uint64_t float32_to_uint64(float32 a, float_status *s) 2736 { 2737 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2738 } 2739 2740 uint16_t float64_to_uint16(float64 a, float_status *s) 2741 { 2742 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2743 } 2744 2745 uint32_t 
float64_to_uint32(float64 a, float_status *s) 2746 { 2747 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2748 } 2749 2750 uint64_t float64_to_uint64(float64 a, float_status *s) 2751 { 2752 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2753 } 2754 2755 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2756 { 2757 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2758 } 2759 2760 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2761 { 2762 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2763 } 2764 2765 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2766 { 2767 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2768 } 2769 2770 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2771 { 2772 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2773 } 2774 2775 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2776 { 2777 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2778 } 2779 2780 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2781 { 2782 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2783 } 2784 2785 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2786 { 2787 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2788 } 2789 2790 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2791 { 2792 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2793 } 2794 2795 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2796 { 2797 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2798 } 2799 2800 /* 2801 * Returns the result of converting the bfloat16 value `a' to 2802 * the unsigned integer format. 
2803 */ 2804 2805 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2806 int scale, float_status *s) 2807 { 2808 FloatParts64 p; 2809 2810 bfloat16_unpack_canonical(&p, a, s); 2811 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2812 } 2813 2814 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2815 int scale, float_status *s) 2816 { 2817 FloatParts64 p; 2818 2819 bfloat16_unpack_canonical(&p, a, s); 2820 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2821 } 2822 2823 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2824 int scale, float_status *s) 2825 { 2826 FloatParts64 p; 2827 2828 bfloat16_unpack_canonical(&p, a, s); 2829 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2830 } 2831 2832 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2833 { 2834 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2835 } 2836 2837 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2838 { 2839 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2840 } 2841 2842 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2843 { 2844 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2845 } 2846 2847 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2848 { 2849 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2850 } 2851 2852 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2853 { 2854 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2855 } 2856 2857 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 2858 { 2859 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2860 } 2861 2862 /* 2863 * Integer to float conversions 2864 * 2865 * Returns the result of converting the two's complement integer `a' 2866 * to the floating-point format. 
The conversion is performed according 2867 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2868 */ 2869 2870 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status) 2871 { 2872 FloatParts64 r = { .sign = false }; 2873 2874 if (a == 0) { 2875 r.cls = float_class_zero; 2876 } else { 2877 uint64_t f = a; 2878 int shift; 2879 2880 r.cls = float_class_normal; 2881 if (a < 0) { 2882 f = -f; 2883 r.sign = true; 2884 } 2885 shift = clz64(f); 2886 scale = MIN(MAX(scale, -0x10000), 0x10000); 2887 2888 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2889 r.frac = f << shift; 2890 } 2891 2892 return r; 2893 } 2894 2895 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2896 { 2897 FloatParts64 pa = int_to_float(a, scale, status); 2898 return float16_round_pack_canonical(&pa, status); 2899 } 2900 2901 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2902 { 2903 return int64_to_float16_scalbn(a, scale, status); 2904 } 2905 2906 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2907 { 2908 return int64_to_float16_scalbn(a, scale, status); 2909 } 2910 2911 float16 int64_to_float16(int64_t a, float_status *status) 2912 { 2913 return int64_to_float16_scalbn(a, 0, status); 2914 } 2915 2916 float16 int32_to_float16(int32_t a, float_status *status) 2917 { 2918 return int64_to_float16_scalbn(a, 0, status); 2919 } 2920 2921 float16 int16_to_float16(int16_t a, float_status *status) 2922 { 2923 return int64_to_float16_scalbn(a, 0, status); 2924 } 2925 2926 float16 int8_to_float16(int8_t a, float_status *status) 2927 { 2928 return int64_to_float16_scalbn(a, 0, status); 2929 } 2930 2931 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2932 { 2933 FloatParts64 pa = int_to_float(a, scale, status); 2934 return float32_round_pack_canonical(&pa, status); 2935 } 2936 2937 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2938 { 
2939 return int64_to_float32_scalbn(a, scale, status); 2940 } 2941 2942 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2943 { 2944 return int64_to_float32_scalbn(a, scale, status); 2945 } 2946 2947 float32 int64_to_float32(int64_t a, float_status *status) 2948 { 2949 return int64_to_float32_scalbn(a, 0, status); 2950 } 2951 2952 float32 int32_to_float32(int32_t a, float_status *status) 2953 { 2954 return int64_to_float32_scalbn(a, 0, status); 2955 } 2956 2957 float32 int16_to_float32(int16_t a, float_status *status) 2958 { 2959 return int64_to_float32_scalbn(a, 0, status); 2960 } 2961 2962 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2963 { 2964 FloatParts64 pa = int_to_float(a, scale, status); 2965 return float64_round_pack_canonical(&pa, status); 2966 } 2967 2968 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2969 { 2970 return int64_to_float64_scalbn(a, scale, status); 2971 } 2972 2973 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2974 { 2975 return int64_to_float64_scalbn(a, scale, status); 2976 } 2977 2978 float64 int64_to_float64(int64_t a, float_status *status) 2979 { 2980 return int64_to_float64_scalbn(a, 0, status); 2981 } 2982 2983 float64 int32_to_float64(int32_t a, float_status *status) 2984 { 2985 return int64_to_float64_scalbn(a, 0, status); 2986 } 2987 2988 float64 int16_to_float64(int16_t a, float_status *status) 2989 { 2990 return int64_to_float64_scalbn(a, 0, status); 2991 } 2992 2993 /* 2994 * Returns the result of converting the two's complement integer `a' 2995 * to the bfloat16 format. 
2996 */ 2997 2998 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) 2999 { 3000 FloatParts64 pa = int_to_float(a, scale, status); 3001 return bfloat16_round_pack_canonical(&pa, status); 3002 } 3003 3004 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status) 3005 { 3006 return int64_to_bfloat16_scalbn(a, scale, status); 3007 } 3008 3009 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status) 3010 { 3011 return int64_to_bfloat16_scalbn(a, scale, status); 3012 } 3013 3014 bfloat16 int64_to_bfloat16(int64_t a, float_status *status) 3015 { 3016 return int64_to_bfloat16_scalbn(a, 0, status); 3017 } 3018 3019 bfloat16 int32_to_bfloat16(int32_t a, float_status *status) 3020 { 3021 return int64_to_bfloat16_scalbn(a, 0, status); 3022 } 3023 3024 bfloat16 int16_to_bfloat16(int16_t a, float_status *status) 3025 { 3026 return int64_to_bfloat16_scalbn(a, 0, status); 3027 } 3028 3029 /* 3030 * Unsigned Integer to float conversions 3031 * 3032 * Returns the result of converting the unsigned integer `a' to the 3033 * floating-point format. The conversion is performed according to the 3034 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3035 */ 3036 3037 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status) 3038 { 3039 FloatParts64 r = { .sign = false }; 3040 int shift; 3041 3042 if (a == 0) { 3043 r.cls = float_class_zero; 3044 } else { 3045 scale = MIN(MAX(scale, -0x10000), 0x10000); 3046 shift = clz64(a); 3047 r.cls = float_class_normal; 3048 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3049 r.frac = a << shift; 3050 } 3051 3052 return r; 3053 } 3054 3055 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3056 { 3057 FloatParts64 pa = uint_to_float(a, scale, status); 3058 return float16_round_pack_canonical(&pa, status); 3059 } 3060 3061 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3062 { 3063 return uint64_to_float16_scalbn(a, scale, status); 3064 } 3065 3066 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3067 { 3068 return uint64_to_float16_scalbn(a, scale, status); 3069 } 3070 3071 float16 uint64_to_float16(uint64_t a, float_status *status) 3072 { 3073 return uint64_to_float16_scalbn(a, 0, status); 3074 } 3075 3076 float16 uint32_to_float16(uint32_t a, float_status *status) 3077 { 3078 return uint64_to_float16_scalbn(a, 0, status); 3079 } 3080 3081 float16 uint16_to_float16(uint16_t a, float_status *status) 3082 { 3083 return uint64_to_float16_scalbn(a, 0, status); 3084 } 3085 3086 float16 uint8_to_float16(uint8_t a, float_status *status) 3087 { 3088 return uint64_to_float16_scalbn(a, 0, status); 3089 } 3090 3091 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3092 { 3093 FloatParts64 pa = uint_to_float(a, scale, status); 3094 return float32_round_pack_canonical(&pa, status); 3095 } 3096 3097 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3098 { 3099 return uint64_to_float32_scalbn(a, scale, status); 3100 } 3101 3102 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3103 { 3104 return 
uint64_to_float32_scalbn(a, scale, status); 3105 } 3106 3107 float32 uint64_to_float32(uint64_t a, float_status *status) 3108 { 3109 return uint64_to_float32_scalbn(a, 0, status); 3110 } 3111 3112 float32 uint32_to_float32(uint32_t a, float_status *status) 3113 { 3114 return uint64_to_float32_scalbn(a, 0, status); 3115 } 3116 3117 float32 uint16_to_float32(uint16_t a, float_status *status) 3118 { 3119 return uint64_to_float32_scalbn(a, 0, status); 3120 } 3121 3122 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3123 { 3124 FloatParts64 pa = uint_to_float(a, scale, status); 3125 return float64_round_pack_canonical(&pa, status); 3126 } 3127 3128 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3129 { 3130 return uint64_to_float64_scalbn(a, scale, status); 3131 } 3132 3133 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3134 { 3135 return uint64_to_float64_scalbn(a, scale, status); 3136 } 3137 3138 float64 uint64_to_float64(uint64_t a, float_status *status) 3139 { 3140 return uint64_to_float64_scalbn(a, 0, status); 3141 } 3142 3143 float64 uint32_to_float64(uint32_t a, float_status *status) 3144 { 3145 return uint64_to_float64_scalbn(a, 0, status); 3146 } 3147 3148 float64 uint16_to_float64(uint16_t a, float_status *status) 3149 { 3150 return uint64_to_float64_scalbn(a, 0, status); 3151 } 3152 3153 /* 3154 * Returns the result of converting the unsigned integer `a' to the 3155 * bfloat16 format. 
3156 */ 3157 3158 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3159 { 3160 FloatParts64 pa = uint_to_float(a, scale, status); 3161 return bfloat16_round_pack_canonical(&pa, status); 3162 } 3163 3164 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3165 { 3166 return uint64_to_bfloat16_scalbn(a, scale, status); 3167 } 3168 3169 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3170 { 3171 return uint64_to_bfloat16_scalbn(a, scale, status); 3172 } 3173 3174 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3175 { 3176 return uint64_to_bfloat16_scalbn(a, 0, status); 3177 } 3178 3179 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3180 { 3181 return uint64_to_bfloat16_scalbn(a, 0, status); 3182 } 3183 3184 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3185 { 3186 return uint64_to_bfloat16_scalbn(a, 0, status); 3187 } 3188 3189 /* Float Min/Max */ 3190 /* min() and max() functions. These can't be implemented as 3191 * 'compare and pick one input' because that would mishandle 3192 * NaNs and +0 vs -0. 3193 * 3194 * minnum() and maxnum() functions. These are similar to the min() 3195 * and max() functions but if one of the arguments is a QNaN and 3196 * the other is numerical then the numerical argument is returned. 3197 * SNaNs will get quietened before being returned. 3198 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3199 * and maxNum() operations. min() and max() are the typical min/max 3200 * semantics provided by many CPUs which predate that specification. 3201 * 3202 * minnummag() and maxnummag() functions correspond to minNumMag() 3203 * and minNumMag() from the IEEE-754 2008. 
3204 */ 3205 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin, 3206 bool ieee, bool ismag, float_status *s) 3207 { 3208 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 3209 if (ieee) { 3210 /* Takes two floating-point values `a' and `b', one of 3211 * which is a NaN, and returns the appropriate NaN 3212 * result. If either `a' or `b' is a signaling NaN, 3213 * the invalid exception is raised. 3214 */ 3215 if (is_snan(a.cls) || is_snan(b.cls)) { 3216 return *parts_pick_nan(&a, &b, s); 3217 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 3218 return b; 3219 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 3220 return a; 3221 } 3222 } 3223 return *parts_pick_nan(&a, &b, s); 3224 } else { 3225 int a_exp, b_exp; 3226 3227 switch (a.cls) { 3228 case float_class_normal: 3229 a_exp = a.exp; 3230 break; 3231 case float_class_inf: 3232 a_exp = INT_MAX; 3233 break; 3234 case float_class_zero: 3235 a_exp = INT_MIN; 3236 break; 3237 default: 3238 g_assert_not_reached(); 3239 break; 3240 } 3241 switch (b.cls) { 3242 case float_class_normal: 3243 b_exp = b.exp; 3244 break; 3245 case float_class_inf: 3246 b_exp = INT_MAX; 3247 break; 3248 case float_class_zero: 3249 b_exp = INT_MIN; 3250 break; 3251 default: 3252 g_assert_not_reached(); 3253 break; 3254 } 3255 3256 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 3257 bool a_less = a_exp < b_exp; 3258 if (a_exp == b_exp) { 3259 a_less = a.frac < b.frac; 3260 } 3261 return a_less ^ ismin ? b : a; 3262 } 3263 3264 if (a.sign == b.sign) { 3265 bool a_less = a_exp < b_exp; 3266 if (a_exp == b_exp) { 3267 a_less = a.frac < b.frac; 3268 } 3269 return a.sign ^ a_less ^ ismin ? b : a; 3270 } else { 3271 return a.sign ^ ismin ? 
b : a; 3272 } 3273 } 3274 } 3275 3276 #define MINMAX(sz, name, ismin, isiee, ismag) \ 3277 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 3278 float_status *s) \ 3279 { \ 3280 FloatParts64 pa, pb, pr; \ 3281 float ## sz ## _unpack_canonical(&pa, a, s); \ 3282 float ## sz ## _unpack_canonical(&pb, b, s); \ 3283 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3284 return float ## sz ## _round_pack_canonical(&pr, s); \ 3285 } 3286 3287 MINMAX(16, min, true, false, false) 3288 MINMAX(16, minnum, true, true, false) 3289 MINMAX(16, minnummag, true, true, true) 3290 MINMAX(16, max, false, false, false) 3291 MINMAX(16, maxnum, false, true, false) 3292 MINMAX(16, maxnummag, false, true, true) 3293 3294 MINMAX(32, min, true, false, false) 3295 MINMAX(32, minnum, true, true, false) 3296 MINMAX(32, minnummag, true, true, true) 3297 MINMAX(32, max, false, false, false) 3298 MINMAX(32, maxnum, false, true, false) 3299 MINMAX(32, maxnummag, false, true, true) 3300 3301 MINMAX(64, min, true, false, false) 3302 MINMAX(64, minnum, true, true, false) 3303 MINMAX(64, minnummag, true, true, true) 3304 MINMAX(64, max, false, false, false) 3305 MINMAX(64, maxnum, false, true, false) 3306 MINMAX(64, maxnummag, false, true, true) 3307 3308 #undef MINMAX 3309 3310 #define BF16_MINMAX(name, ismin, isiee, ismag) \ 3311 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \ 3312 { \ 3313 FloatParts64 pa, pb, pr; \ 3314 bfloat16_unpack_canonical(&pa, a, s); \ 3315 bfloat16_unpack_canonical(&pb, b, s); \ 3316 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3317 return bfloat16_round_pack_canonical(&pr, s); \ 3318 } 3319 3320 BF16_MINMAX(min, true, false, false) 3321 BF16_MINMAX(minnum, true, true, false) 3322 BF16_MINMAX(minnummag, true, true, true) 3323 BF16_MINMAX(max, false, false, false) 3324 BF16_MINMAX(maxnum, false, true, false) 3325 BF16_MINMAX(maxnummag, false, true, true) 3326 3327 #undef BF16_MINMAX 3328 3329 /* Floating point compare 
*/ 3330 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet, 3331 float_status *s) 3332 { 3333 if (is_nan(a.cls) || is_nan(b.cls)) { 3334 if (!is_quiet || 3335 a.cls == float_class_snan || 3336 b.cls == float_class_snan) { 3337 float_raise(float_flag_invalid, s); 3338 } 3339 return float_relation_unordered; 3340 } 3341 3342 if (a.cls == float_class_zero) { 3343 if (b.cls == float_class_zero) { 3344 return float_relation_equal; 3345 } 3346 return b.sign ? float_relation_greater : float_relation_less; 3347 } else if (b.cls == float_class_zero) { 3348 return a.sign ? float_relation_less : float_relation_greater; 3349 } 3350 3351 /* The only really important thing about infinity is its sign. If 3352 * both are infinities the sign marks the smallest of the two. 3353 */ 3354 if (a.cls == float_class_inf) { 3355 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 3356 return float_relation_equal; 3357 } 3358 return a.sign ? float_relation_less : float_relation_greater; 3359 } else if (b.cls == float_class_inf) { 3360 return b.sign ? float_relation_greater : float_relation_less; 3361 } 3362 3363 if (a.sign != b.sign) { 3364 return a.sign ? float_relation_less : float_relation_greater; 3365 } 3366 3367 if (a.exp == b.exp) { 3368 if (a.frac == b.frac) { 3369 return float_relation_equal; 3370 } 3371 if (a.sign) { 3372 return a.frac > b.frac ? 3373 float_relation_less : float_relation_greater; 3374 } else { 3375 return a.frac > b.frac ? 3376 float_relation_greater : float_relation_less; 3377 } 3378 } else { 3379 if (a.sign) { 3380 return a.exp > b.exp ? float_relation_less : float_relation_greater; 3381 } else { 3382 return a.exp > b.exp ? 
float_relation_greater : float_relation_less; 3383 } 3384 } 3385 } 3386 3387 #define COMPARE(name, attr, sz) \ 3388 static int attr \ 3389 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ 3390 { \ 3391 FloatParts64 pa, pb; \ 3392 float ## sz ## _unpack_canonical(&pa, a, s); \ 3393 float ## sz ## _unpack_canonical(&pb, b, s); \ 3394 return compare_floats(pa, pb, is_quiet, s); \ 3395 } 3396 3397 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) 3398 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) 3399 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) 3400 3401 #undef COMPARE 3402 3403 FloatRelation float16_compare(float16 a, float16 b, float_status *s) 3404 { 3405 return soft_f16_compare(a, b, false, s); 3406 } 3407 3408 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s) 3409 { 3410 return soft_f16_compare(a, b, true, s); 3411 } 3412 3413 static FloatRelation QEMU_FLATTEN 3414 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) 3415 { 3416 union_float32 ua, ub; 3417 3418 ua.s = xa; 3419 ub.s = xb; 3420 3421 if (QEMU_NO_HARDFLOAT) { 3422 goto soft; 3423 } 3424 3425 float32_input_flush2(&ua.s, &ub.s, s); 3426 if (isgreaterequal(ua.h, ub.h)) { 3427 if (isgreater(ua.h, ub.h)) { 3428 return float_relation_greater; 3429 } 3430 return float_relation_equal; 3431 } 3432 if (likely(isless(ua.h, ub.h))) { 3433 return float_relation_less; 3434 } 3435 /* The only condition remaining is unordered. 3436 * Fall through to set flags. 
3437 */ 3438 soft: 3439 return soft_f32_compare(ua.s, ub.s, is_quiet, s); 3440 } 3441 3442 FloatRelation float32_compare(float32 a, float32 b, float_status *s) 3443 { 3444 return f32_compare(a, b, false, s); 3445 } 3446 3447 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s) 3448 { 3449 return f32_compare(a, b, true, s); 3450 } 3451 3452 static FloatRelation QEMU_FLATTEN 3453 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) 3454 { 3455 union_float64 ua, ub; 3456 3457 ua.s = xa; 3458 ub.s = xb; 3459 3460 if (QEMU_NO_HARDFLOAT) { 3461 goto soft; 3462 } 3463 3464 float64_input_flush2(&ua.s, &ub.s, s); 3465 if (isgreaterequal(ua.h, ub.h)) { 3466 if (isgreater(ua.h, ub.h)) { 3467 return float_relation_greater; 3468 } 3469 return float_relation_equal; 3470 } 3471 if (likely(isless(ua.h, ub.h))) { 3472 return float_relation_less; 3473 } 3474 /* The only condition remaining is unordered. 3475 * Fall through to set flags. 3476 */ 3477 soft: 3478 return soft_f64_compare(ua.s, ub.s, is_quiet, s); 3479 } 3480 3481 FloatRelation float64_compare(float64 a, float64 b, float_status *s) 3482 { 3483 return f64_compare(a, b, false, s); 3484 } 3485 3486 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s) 3487 { 3488 return f64_compare(a, b, true, s); 3489 } 3490 3491 static FloatRelation QEMU_FLATTEN 3492 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s) 3493 { 3494 FloatParts64 pa, pb; 3495 3496 bfloat16_unpack_canonical(&pa, a, s); 3497 bfloat16_unpack_canonical(&pb, b, s); 3498 return compare_floats(pa, pb, is_quiet, s); 3499 } 3500 3501 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s) 3502 { 3503 return soft_bf16_compare(a, b, false, s); 3504 } 3505 3506 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s) 3507 { 3508 return soft_bf16_compare(a, b, true, s); 3509 } 3510 3511 /* Multiply A by 2 raised to the power N. 
*/ 3512 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3513 { 3514 if (unlikely(is_nan(a.cls))) { 3515 parts_return_nan(&a, s); 3516 } 3517 if (a.cls == float_class_normal) { 3518 /* The largest float type (even though not supported by FloatParts64) 3519 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3520 * still allows rounding to infinity, without allowing overflow 3521 * within the int32_t that backs FloatParts64.exp. 3522 */ 3523 n = MIN(MAX(n, -0x10000), 0x10000); 3524 a.exp += n; 3525 } 3526 return a; 3527 } 3528 3529 float16 float16_scalbn(float16 a, int n, float_status *status) 3530 { 3531 FloatParts64 pa, pr; 3532 3533 float16_unpack_canonical(&pa, a, status); 3534 pr = scalbn_decomposed(pa, n, status); 3535 return float16_round_pack_canonical(&pr, status); 3536 } 3537 3538 float32 float32_scalbn(float32 a, int n, float_status *status) 3539 { 3540 FloatParts64 pa, pr; 3541 3542 float32_unpack_canonical(&pa, a, status); 3543 pr = scalbn_decomposed(pa, n, status); 3544 return float32_round_pack_canonical(&pr, status); 3545 } 3546 3547 float64 float64_scalbn(float64 a, int n, float_status *status) 3548 { 3549 FloatParts64 pa, pr; 3550 3551 float64_unpack_canonical(&pa, a, status); 3552 pr = scalbn_decomposed(pa, n, status); 3553 return float64_round_pack_canonical(&pr, status); 3554 } 3555 3556 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3557 { 3558 FloatParts64 pa, pr; 3559 3560 bfloat16_unpack_canonical(&pa, a, status); 3561 pr = scalbn_decomposed(pa, n, status); 3562 return bfloat16_round_pack_canonical(&pr, status); 3563 } 3564 3565 /* 3566 * Square Root 3567 * 3568 * The old softfloat code did an approximation step before zeroing in 3569 * on the final result. However for simpleness we just compute the 3570 * square root by iterating down from the implicit bit to enough extra 3571 * bits to ensure we get a correctly rounded result. 
3572 * 3573 * This does mean however the calculation is slower than before, 3574 * especially for 64 bit floats. 3575 */ 3576 3577 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p) 3578 { 3579 uint64_t a_frac, r_frac, s_frac; 3580 int bit, last_bit; 3581 3582 if (is_nan(a.cls)) { 3583 parts_return_nan(&a, s); 3584 return a; 3585 } 3586 if (a.cls == float_class_zero) { 3587 return a; /* sqrt(+-0) = +-0 */ 3588 } 3589 if (a.sign) { 3590 float_raise(float_flag_invalid, s); 3591 parts_default_nan(&a, s); 3592 return a; 3593 } 3594 if (a.cls == float_class_inf) { 3595 return a; /* sqrt(+inf) = +inf */ 3596 } 3597 3598 assert(a.cls == float_class_normal); 3599 3600 /* We need two overflow bits at the top. Adding room for that is a 3601 * right shift. If the exponent is odd, we can discard the low bit 3602 * by multiplying the fraction by 2; that's a left shift. Combine 3603 * those and we shift right by 1 if the exponent is odd, otherwise 2. 3604 */ 3605 a_frac = a.frac >> (2 - (a.exp & 1)); 3606 a.exp >>= 1; 3607 3608 /* Bit-by-bit computation of sqrt. */ 3609 r_frac = 0; 3610 s_frac = 0; 3611 3612 /* Iterate from implicit bit down to the 3 extra bits to compute a 3613 * properly rounded result. Remember we've inserted two more bits 3614 * at the top, so these positions are two less. 3615 */ 3616 bit = DECOMPOSED_BINARY_POINT - 2; 3617 last_bit = MAX(p->frac_shift - 4, 0); 3618 do { 3619 uint64_t q = 1ULL << bit; 3620 uint64_t t_frac = s_frac + q; 3621 if (t_frac <= a_frac) { 3622 s_frac = t_frac + q; 3623 a_frac -= t_frac; 3624 r_frac += q; 3625 } 3626 a_frac <<= 1; 3627 } while (--bit >= last_bit); 3628 3629 /* Undo the right shift done above. If there is any remaining 3630 * fraction, the result is inexact. Set the sticky bit. 
3631 */ 3632 a.frac = (r_frac << 2) + (a_frac != 0); 3633 3634 return a; 3635 } 3636 3637 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3638 { 3639 FloatParts64 pa, pr; 3640 3641 float16_unpack_canonical(&pa, a, status); 3642 pr = sqrt_float(pa, status, &float16_params); 3643 return float16_round_pack_canonical(&pr, status); 3644 } 3645 3646 static float32 QEMU_SOFTFLOAT_ATTR 3647 soft_f32_sqrt(float32 a, float_status *status) 3648 { 3649 FloatParts64 pa, pr; 3650 3651 float32_unpack_canonical(&pa, a, status); 3652 pr = sqrt_float(pa, status, &float32_params); 3653 return float32_round_pack_canonical(&pr, status); 3654 } 3655 3656 static float64 QEMU_SOFTFLOAT_ATTR 3657 soft_f64_sqrt(float64 a, float_status *status) 3658 { 3659 FloatParts64 pa, pr; 3660 3661 float64_unpack_canonical(&pa, a, status); 3662 pr = sqrt_float(pa, status, &float64_params); 3663 return float64_round_pack_canonical(&pr, status); 3664 } 3665 3666 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3667 { 3668 union_float32 ua, ur; 3669 3670 ua.s = xa; 3671 if (unlikely(!can_use_fpu(s))) { 3672 goto soft; 3673 } 3674 3675 float32_input_flush1(&ua.s, s); 3676 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3677 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3678 fpclassify(ua.h) == FP_ZERO) || 3679 signbit(ua.h))) { 3680 goto soft; 3681 } 3682 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3683 float32_is_neg(ua.s))) { 3684 goto soft; 3685 } 3686 ur.h = sqrtf(ua.h); 3687 return ur.s; 3688 3689 soft: 3690 return soft_f32_sqrt(ua.s, s); 3691 } 3692 3693 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3694 { 3695 union_float64 ua, ur; 3696 3697 ua.s = xa; 3698 if (unlikely(!can_use_fpu(s))) { 3699 goto soft; 3700 } 3701 3702 float64_input_flush1(&ua.s, s); 3703 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3704 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3705 fpclassify(ua.h) == FP_ZERO) || 3706 signbit(ua.h))) { 3707 goto soft; 3708 } 3709 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3710 float64_is_neg(ua.s))) { 3711 goto soft; 3712 } 3713 ur.h = sqrt(ua.h); 3714 return ur.s; 3715 3716 soft: 3717 return soft_f64_sqrt(ua.s, s); 3718 } 3719 3720 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3721 { 3722 FloatParts64 pa, pr; 3723 3724 bfloat16_unpack_canonical(&pa, a, status); 3725 pr = sqrt_float(pa, status, &bfloat16_params); 3726 return bfloat16_round_pack_canonical(&pr, status); 3727 } 3728 3729 /*---------------------------------------------------------------------------- 3730 | The pattern for a default generated NaN. 3731 *----------------------------------------------------------------------------*/ 3732 3733 float16 float16_default_nan(float_status *status) 3734 { 3735 FloatParts64 p; 3736 3737 parts_default_nan(&p, status); 3738 p.frac >>= float16_params.frac_shift; 3739 return float16_pack_raw(&p); 3740 } 3741 3742 float32 float32_default_nan(float_status *status) 3743 { 3744 FloatParts64 p; 3745 3746 parts_default_nan(&p, status); 3747 p.frac >>= float32_params.frac_shift; 3748 return float32_pack_raw(&p); 3749 } 3750 3751 float64 float64_default_nan(float_status *status) 3752 { 3753 FloatParts64 p; 3754 3755 parts_default_nan(&p, status); 3756 p.frac >>= float64_params.frac_shift; 3757 return float64_pack_raw(&p); 3758 } 3759 3760 float128 float128_default_nan(float_status *status) 3761 { 3762 FloatParts128 p; 3763 3764 parts_default_nan(&p, status); 3765 frac_shr(&p, float128_params.frac_shift); 3766 return float128_pack_raw(&p); 3767 } 3768 3769 bfloat16 bfloat16_default_nan(float_status *status) 3770 { 3771 FloatParts64 p; 3772 3773 parts_default_nan(&p, status); 3774 p.frac >>= bfloat16_params.frac_shift; 3775 return bfloat16_pack_raw(&p); 3776 } 3777 3778 /*---------------------------------------------------------------------------- 3779 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3780 *----------------------------------------------------------------------------*/ 3781 3782 float16 float16_silence_nan(float16 a, float_status *status) 3783 { 3784 FloatParts64 p; 3785 3786 float16_unpack_raw(&p, a); 3787 p.frac <<= float16_params.frac_shift; 3788 parts_silence_nan(&p, status); 3789 p.frac >>= float16_params.frac_shift; 3790 return float16_pack_raw(&p); 3791 } 3792 3793 float32 float32_silence_nan(float32 a, float_status *status) 3794 { 3795 FloatParts64 p; 3796 3797 float32_unpack_raw(&p, a); 3798 p.frac <<= float32_params.frac_shift; 3799 parts_silence_nan(&p, status); 3800 p.frac >>= float32_params.frac_shift; 3801 return float32_pack_raw(&p); 3802 } 3803 3804 float64 float64_silence_nan(float64 a, float_status *status) 3805 { 3806 FloatParts64 p; 3807 3808 float64_unpack_raw(&p, a); 3809 p.frac <<= float64_params.frac_shift; 3810 parts_silence_nan(&p, status); 3811 p.frac >>= float64_params.frac_shift; 3812 return float64_pack_raw(&p); 3813 } 3814 3815 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3816 { 3817 FloatParts64 p; 3818 3819 bfloat16_unpack_raw(&p, a); 3820 p.frac <<= bfloat16_params.frac_shift; 3821 parts_silence_nan(&p, status); 3822 p.frac >>= bfloat16_params.frac_shift; 3823 return bfloat16_pack_raw(&p); 3824 } 3825 3826 float128 float128_silence_nan(float128 a, float_status *status) 3827 { 3828 FloatParts128 p; 3829 3830 float128_unpack_raw(&p, a); 3831 frac_shl(&p, float128_params.frac_shift); 3832 parts_silence_nan(&p, status); 3833 frac_shr(&p, float128_params.frac_shift); 3834 return float128_pack_raw(&p); 3835 } 3836 3837 /*---------------------------------------------------------------------------- 3838 | If `a' is denormal and we are in flush-to-zero mode then set the 3839 | input-denormal exception and return zero. Otherwise just return the value. 
3840 *----------------------------------------------------------------------------*/ 3841 3842 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3843 { 3844 if (p.exp == 0 && p.frac != 0) { 3845 float_raise(float_flag_input_denormal, status); 3846 return true; 3847 } 3848 3849 return false; 3850 } 3851 3852 float16 float16_squash_input_denormal(float16 a, float_status *status) 3853 { 3854 if (status->flush_inputs_to_zero) { 3855 FloatParts64 p; 3856 3857 float16_unpack_raw(&p, a); 3858 if (parts_squash_denormal(p, status)) { 3859 return float16_set_sign(float16_zero, p.sign); 3860 } 3861 } 3862 return a; 3863 } 3864 3865 float32 float32_squash_input_denormal(float32 a, float_status *status) 3866 { 3867 if (status->flush_inputs_to_zero) { 3868 FloatParts64 p; 3869 3870 float32_unpack_raw(&p, a); 3871 if (parts_squash_denormal(p, status)) { 3872 return float32_set_sign(float32_zero, p.sign); 3873 } 3874 } 3875 return a; 3876 } 3877 3878 float64 float64_squash_input_denormal(float64 a, float_status *status) 3879 { 3880 if (status->flush_inputs_to_zero) { 3881 FloatParts64 p; 3882 3883 float64_unpack_raw(&p, a); 3884 if (parts_squash_denormal(p, status)) { 3885 return float64_set_sign(float64_zero, p.sign); 3886 } 3887 } 3888 return a; 3889 } 3890 3891 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3892 { 3893 if (status->flush_inputs_to_zero) { 3894 FloatParts64 p; 3895 3896 bfloat16_unpack_raw(&p, a); 3897 if (parts_squash_denormal(p, status)) { 3898 return bfloat16_set_sign(bfloat16_zero, p.sign); 3899 } 3900 } 3901 return a; 3902 } 3903 3904 /*---------------------------------------------------------------------------- 3905 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3906 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3907 | input. If `zSign' is 1, the input is negated before being converted to an 3908 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input 3909 | is simply rounded to an integer, with the inexact exception raised if the 3910 | input cannot be represented exactly as an integer. However, if the fixed- 3911 | point input is too large, the invalid exception is raised and the largest 3912 | positive or negative integer is returned. 3913 *----------------------------------------------------------------------------*/ 3914 3915 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3916 float_status *status) 3917 { 3918 int8_t roundingMode; 3919 bool roundNearestEven; 3920 int8_t roundIncrement, roundBits; 3921 int32_t z; 3922 3923 roundingMode = status->float_rounding_mode; 3924 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3925 switch (roundingMode) { 3926 case float_round_nearest_even: 3927 case float_round_ties_away: 3928 roundIncrement = 0x40; 3929 break; 3930 case float_round_to_zero: 3931 roundIncrement = 0; 3932 break; 3933 case float_round_up: 3934 roundIncrement = zSign ? 0 : 0x7f; 3935 break; 3936 case float_round_down: 3937 roundIncrement = zSign ? 0x7f : 0; 3938 break; 3939 case float_round_to_odd: 3940 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 3941 break; 3942 default: 3943 abort(); 3944 } 3945 roundBits = absZ & 0x7F; 3946 absZ = ( absZ + roundIncrement )>>7; 3947 if (!(roundBits ^ 0x40) && roundNearestEven) { 3948 absZ &= ~1; 3949 } 3950 z = absZ; 3951 if ( zSign ) z = - z; 3952 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 3953 float_raise(float_flag_invalid, status); 3954 return zSign ? 
INT32_MIN : INT32_MAX; 3955 } 3956 if (roundBits) { 3957 float_raise(float_flag_inexact, status); 3958 } 3959 return z; 3960 3961 } 3962 3963 /*---------------------------------------------------------------------------- 3964 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 3965 | `absZ1', with binary point between bits 63 and 64 (between the input words), 3966 | and returns the properly rounded 64-bit integer corresponding to the input. 3967 | If `zSign' is 1, the input is negated before being converted to an integer. 3968 | Ordinarily, the fixed-point input is simply rounded to an integer, with 3969 | the inexact exception raised if the input cannot be represented exactly as 3970 | an integer. However, if the fixed-point input is too large, the invalid 3971 | exception is raised and the largest positive or negative integer is 3972 | returned. 3973 *----------------------------------------------------------------------------*/ 3974 3975 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 3976 float_status *status) 3977 { 3978 int8_t roundingMode; 3979 bool roundNearestEven, increment; 3980 int64_t z; 3981 3982 roundingMode = status->float_rounding_mode; 3983 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3984 switch (roundingMode) { 3985 case float_round_nearest_even: 3986 case float_round_ties_away: 3987 increment = ((int64_t) absZ1 < 0); 3988 break; 3989 case float_round_to_zero: 3990 increment = 0; 3991 break; 3992 case float_round_up: 3993 increment = !zSign && absZ1; 3994 break; 3995 case float_round_down: 3996 increment = zSign && absZ1; 3997 break; 3998 case float_round_to_odd: 3999 increment = !(absZ0 & 1) && absZ1; 4000 break; 4001 default: 4002 abort(); 4003 } 4004 if ( increment ) { 4005 ++absZ0; 4006 if ( absZ0 == 0 ) goto overflow; 4007 if (!(absZ1 << 1) && roundNearestEven) { 4008 absZ0 &= ~1; 4009 } 4010 } 4011 z = absZ0; 4012 if ( zSign ) z = - z; 4013 if ( z && ( ( z < 0 ) ^ zSign 
) ) { 4014 overflow: 4015 float_raise(float_flag_invalid, status); 4016 return zSign ? INT64_MIN : INT64_MAX; 4017 } 4018 if (absZ1) { 4019 float_raise(float_flag_inexact, status); 4020 } 4021 return z; 4022 4023 } 4024 4025 /*---------------------------------------------------------------------------- 4026 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4027 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4028 | and returns the properly rounded 64-bit unsigned integer corresponding to the 4029 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 4030 | with the inexact exception raised if the input cannot be represented exactly 4031 | as an integer. However, if the fixed-point input is too large, the invalid 4032 | exception is raised and the largest unsigned integer is returned. 4033 *----------------------------------------------------------------------------*/ 4034 4035 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 4036 uint64_t absZ1, float_status *status) 4037 { 4038 int8_t roundingMode; 4039 bool roundNearestEven, increment; 4040 4041 roundingMode = status->float_rounding_mode; 4042 roundNearestEven = (roundingMode == float_round_nearest_even); 4043 switch (roundingMode) { 4044 case float_round_nearest_even: 4045 case float_round_ties_away: 4046 increment = ((int64_t)absZ1 < 0); 4047 break; 4048 case float_round_to_zero: 4049 increment = 0; 4050 break; 4051 case float_round_up: 4052 increment = !zSign && absZ1; 4053 break; 4054 case float_round_down: 4055 increment = zSign && absZ1; 4056 break; 4057 case float_round_to_odd: 4058 increment = !(absZ0 & 1) && absZ1; 4059 break; 4060 default: 4061 abort(); 4062 } 4063 if (increment) { 4064 ++absZ0; 4065 if (absZ0 == 0) { 4066 float_raise(float_flag_invalid, status); 4067 return UINT64_MAX; 4068 } 4069 if (!(absZ1 << 1) && roundNearestEven) { 4070 absZ0 &= ~1; 4071 } 4072 } 4073 4074 if (zSign && absZ0) { 4075 
float_raise(float_flag_invalid, status); 4076 return 0; 4077 } 4078 4079 if (absZ1) { 4080 float_raise(float_flag_inexact, status); 4081 } 4082 return absZ0; 4083 } 4084 4085 /*---------------------------------------------------------------------------- 4086 | Normalizes the subnormal single-precision floating-point value represented 4087 | by the denormalized significand `aSig'. The normalized exponent and 4088 | significand are stored at the locations pointed to by `zExpPtr' and 4089 | `zSigPtr', respectively. 4090 *----------------------------------------------------------------------------*/ 4091 4092 static void 4093 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4094 { 4095 int8_t shiftCount; 4096 4097 shiftCount = clz32(aSig) - 8; 4098 *zSigPtr = aSig<<shiftCount; 4099 *zExpPtr = 1 - shiftCount; 4100 4101 } 4102 4103 /*---------------------------------------------------------------------------- 4104 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4105 | and significand `zSig', and returns the proper single-precision floating- 4106 | point value corresponding to the abstract input. Ordinarily, the abstract 4107 | value is simply rounded and packed into the single-precision format, with 4108 | the inexact exception raised if the abstract input cannot be represented 4109 | exactly. However, if the abstract value is too large, the overflow and 4110 | inexact exceptions are raised and an infinity or maximal finite value is 4111 | returned. If the abstract value is too small, the input value is rounded to 4112 | a subnormal number, and the underflow and inexact exceptions are raised if 4113 | the abstract input cannot be represented exactly as a subnormal single- 4114 | precision floating-point number. 4115 | The input significand `zSig' has its binary point between bits 30 4116 | and 29, which is 7 bits to the left of the usual location. This shifted 4117 | significand must be normalized or smaller. 
If `zSig' is not normalized, 4118 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4119 | and it must not require rounding. In the usual case that `zSig' is 4120 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4121 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4122 | Binary Floating-Point Arithmetic. 4123 *----------------------------------------------------------------------------*/ 4124 4125 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4126 float_status *status) 4127 { 4128 int8_t roundingMode; 4129 bool roundNearestEven; 4130 int8_t roundIncrement, roundBits; 4131 bool isTiny; 4132 4133 roundingMode = status->float_rounding_mode; 4134 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4135 switch (roundingMode) { 4136 case float_round_nearest_even: 4137 case float_round_ties_away: 4138 roundIncrement = 0x40; 4139 break; 4140 case float_round_to_zero: 4141 roundIncrement = 0; 4142 break; 4143 case float_round_up: 4144 roundIncrement = zSign ? 0 : 0x7f; 4145 break; 4146 case float_round_down: 4147 roundIncrement = zSign ? 0x7f : 0; 4148 break; 4149 case float_round_to_odd: 4150 roundIncrement = zSig & 0x80 ? 
0 : 0x7f; 4151 break; 4152 default: 4153 abort(); 4154 break; 4155 } 4156 roundBits = zSig & 0x7F; 4157 if ( 0xFD <= (uint16_t) zExp ) { 4158 if ( ( 0xFD < zExp ) 4159 || ( ( zExp == 0xFD ) 4160 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 4161 ) { 4162 bool overflow_to_inf = roundingMode != float_round_to_odd && 4163 roundIncrement != 0; 4164 float_raise(float_flag_overflow | float_flag_inexact, status); 4165 return packFloat32(zSign, 0xFF, -!overflow_to_inf); 4166 } 4167 if ( zExp < 0 ) { 4168 if (status->flush_to_zero) { 4169 float_raise(float_flag_output_denormal, status); 4170 return packFloat32(zSign, 0, 0); 4171 } 4172 isTiny = status->tininess_before_rounding 4173 || (zExp < -1) 4174 || (zSig + roundIncrement < 0x80000000); 4175 shift32RightJamming( zSig, - zExp, &zSig ); 4176 zExp = 0; 4177 roundBits = zSig & 0x7F; 4178 if (isTiny && roundBits) { 4179 float_raise(float_flag_underflow, status); 4180 } 4181 if (roundingMode == float_round_to_odd) { 4182 /* 4183 * For round-to-odd case, the roundIncrement depends on 4184 * zSig which just changed. 4185 */ 4186 roundIncrement = zSig & 0x80 ? 0 : 0x7f; 4187 } 4188 } 4189 } 4190 if (roundBits) { 4191 float_raise(float_flag_inexact, status); 4192 } 4193 zSig = ( zSig + roundIncrement )>>7; 4194 if (!(roundBits ^ 0x40) && roundNearestEven) { 4195 zSig &= ~1; 4196 } 4197 if ( zSig == 0 ) zExp = 0; 4198 return packFloat32( zSign, zExp, zSig ); 4199 4200 } 4201 4202 /*---------------------------------------------------------------------------- 4203 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4204 | and significand `zSig', and returns the proper single-precision floating- 4205 | point value corresponding to the abstract input. This routine is just like 4206 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 4207 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4208 | floating-point exponent. 
4209 *----------------------------------------------------------------------------*/ 4210 4211 static float32 4212 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4213 float_status *status) 4214 { 4215 int8_t shiftCount; 4216 4217 shiftCount = clz32(zSig) - 1; 4218 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4219 status); 4220 4221 } 4222 4223 /*---------------------------------------------------------------------------- 4224 | Normalizes the subnormal double-precision floating-point value represented 4225 | by the denormalized significand `aSig'. The normalized exponent and 4226 | significand are stored at the locations pointed to by `zExpPtr' and 4227 | `zSigPtr', respectively. 4228 *----------------------------------------------------------------------------*/ 4229 4230 static void 4231 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4232 { 4233 int8_t shiftCount; 4234 4235 shiftCount = clz64(aSig) - 11; 4236 *zSigPtr = aSig<<shiftCount; 4237 *zExpPtr = 1 - shiftCount; 4238 4239 } 4240 4241 /*---------------------------------------------------------------------------- 4242 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4243 | double-precision floating-point value, returning the result. After being 4244 | shifted into the proper positions, the three fields are simply added 4245 | together to form the result. This means that any integer portion of `zSig' 4246 | will be added into the exponent. Since a properly normalized significand 4247 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4248 | than the desired result exponent whenever `zSig' is a complete, normalized 4249 | significand. 
4250 *----------------------------------------------------------------------------*/ 4251 4252 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4253 { 4254 4255 return make_float64( 4256 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4257 4258 } 4259 4260 /*---------------------------------------------------------------------------- 4261 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4262 | and significand `zSig', and returns the proper double-precision floating- 4263 | point value corresponding to the abstract input. Ordinarily, the abstract 4264 | value is simply rounded and packed into the double-precision format, with 4265 | the inexact exception raised if the abstract input cannot be represented 4266 | exactly. However, if the abstract value is too large, the overflow and 4267 | inexact exceptions are raised and an infinity or maximal finite value is 4268 | returned. If the abstract value is too small, the input value is rounded to 4269 | a subnormal number, and the underflow and inexact exceptions are raised if 4270 | the abstract input cannot be represented exactly as a subnormal double- 4271 | precision floating-point number. 4272 | The input significand `zSig' has its binary point between bits 62 4273 | and 61, which is 10 bits to the left of the usual location. This shifted 4274 | significand must be normalized or smaller. If `zSig' is not normalized, 4275 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4276 | and it must not require rounding. In the usual case that `zSig' is 4277 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4278 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4279 | Binary Floating-Point Arithmetic. 
4280 *----------------------------------------------------------------------------*/ 4281 4282 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4283 float_status *status) 4284 { 4285 int8_t roundingMode; 4286 bool roundNearestEven; 4287 int roundIncrement, roundBits; 4288 bool isTiny; 4289 4290 roundingMode = status->float_rounding_mode; 4291 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4292 switch (roundingMode) { 4293 case float_round_nearest_even: 4294 case float_round_ties_away: 4295 roundIncrement = 0x200; 4296 break; 4297 case float_round_to_zero: 4298 roundIncrement = 0; 4299 break; 4300 case float_round_up: 4301 roundIncrement = zSign ? 0 : 0x3ff; 4302 break; 4303 case float_round_down: 4304 roundIncrement = zSign ? 0x3ff : 0; 4305 break; 4306 case float_round_to_odd: 4307 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4308 break; 4309 default: 4310 abort(); 4311 } 4312 roundBits = zSig & 0x3FF; 4313 if ( 0x7FD <= (uint16_t) zExp ) { 4314 if ( ( 0x7FD < zExp ) 4315 || ( ( zExp == 0x7FD ) 4316 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 4317 ) { 4318 bool overflow_to_inf = roundingMode != float_round_to_odd && 4319 roundIncrement != 0; 4320 float_raise(float_flag_overflow | float_flag_inexact, status); 4321 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 4322 } 4323 if ( zExp < 0 ) { 4324 if (status->flush_to_zero) { 4325 float_raise(float_flag_output_denormal, status); 4326 return packFloat64(zSign, 0, 0); 4327 } 4328 isTiny = status->tininess_before_rounding 4329 || (zExp < -1) 4330 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 4331 shift64RightJamming( zSig, - zExp, &zSig ); 4332 zExp = 0; 4333 roundBits = zSig & 0x3FF; 4334 if (isTiny && roundBits) { 4335 float_raise(float_flag_underflow, status); 4336 } 4337 if (roundingMode == float_round_to_odd) { 4338 /* 4339 * For round-to-odd case, the roundIncrement depends on 4340 * zSig which just changed. 
4341 */ 4342 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4343 } 4344 } 4345 } 4346 if (roundBits) { 4347 float_raise(float_flag_inexact, status); 4348 } 4349 zSig = ( zSig + roundIncrement )>>10; 4350 if (!(roundBits ^ 0x200) && roundNearestEven) { 4351 zSig &= ~1; 4352 } 4353 if ( zSig == 0 ) zExp = 0; 4354 return packFloat64( zSign, zExp, zSig ); 4355 4356 } 4357 4358 /*---------------------------------------------------------------------------- 4359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4360 | and significand `zSig', and returns the proper double-precision floating- 4361 | point value corresponding to the abstract input. This routine is just like 4362 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 4363 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4364 | floating-point exponent. 4365 *----------------------------------------------------------------------------*/ 4366 4367 static float64 4368 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4369 float_status *status) 4370 { 4371 int8_t shiftCount; 4372 4373 shiftCount = clz64(zSig) - 1; 4374 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 4375 status); 4376 4377 } 4378 4379 /*---------------------------------------------------------------------------- 4380 | Normalizes the subnormal extended double-precision floating-point value 4381 | represented by the denormalized significand `aSig'. The normalized exponent 4382 | and significand are stored at the locations pointed to by `zExpPtr' and 4383 | `zSigPtr', respectively. 
4384 *----------------------------------------------------------------------------*/ 4385 4386 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4387 uint64_t *zSigPtr) 4388 { 4389 int8_t shiftCount; 4390 4391 shiftCount = clz64(aSig); 4392 *zSigPtr = aSig<<shiftCount; 4393 *zExpPtr = 1 - shiftCount; 4394 } 4395 4396 /*---------------------------------------------------------------------------- 4397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4398 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4399 | and returns the proper extended double-precision floating-point value 4400 | corresponding to the abstract input. Ordinarily, the abstract value is 4401 | rounded and packed into the extended double-precision format, with the 4402 | inexact exception raised if the abstract input cannot be represented 4403 | exactly. However, if the abstract value is too large, the overflow and 4404 | inexact exceptions are raised and an infinity or maximal finite value is 4405 | returned. If the abstract value is too small, the input value is rounded to 4406 | a subnormal number, and the underflow and inexact exceptions are raised if 4407 | the abstract input cannot be represented exactly as a subnormal extended 4408 | double-precision floating-point number. 4409 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4410 | number of bits as single or double precision, respectively. Otherwise, the 4411 | result is rounded to the full precision of the extended double-precision 4412 | format. 4413 | The input significand must be normalized or smaller. If the input 4414 | significand is not normalized, `zExp' must be 0; in that case, the result 4415 | returned is a subnormal number, and it must not require rounding. The 4416 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4417 | Floating-Point Arithmetic. 
4418 *----------------------------------------------------------------------------*/ 4419 4420 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 4421 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 4422 float_status *status) 4423 { 4424 int8_t roundingMode; 4425 bool roundNearestEven, increment, isTiny; 4426 int64_t roundIncrement, roundMask, roundBits; 4427 4428 roundingMode = status->float_rounding_mode; 4429 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4430 if ( roundingPrecision == 80 ) goto precision80; 4431 if ( roundingPrecision == 64 ) { 4432 roundIncrement = UINT64_C(0x0000000000000400); 4433 roundMask = UINT64_C(0x00000000000007FF); 4434 } 4435 else if ( roundingPrecision == 32 ) { 4436 roundIncrement = UINT64_C(0x0000008000000000); 4437 roundMask = UINT64_C(0x000000FFFFFFFFFF); 4438 } 4439 else { 4440 goto precision80; 4441 } 4442 zSig0 |= ( zSig1 != 0 ); 4443 switch (roundingMode) { 4444 case float_round_nearest_even: 4445 case float_round_ties_away: 4446 break; 4447 case float_round_to_zero: 4448 roundIncrement = 0; 4449 break; 4450 case float_round_up: 4451 roundIncrement = zSign ? 0 : roundMask; 4452 break; 4453 case float_round_down: 4454 roundIncrement = zSign ? 
roundMask : 0; 4455 break; 4456 default: 4457 abort(); 4458 } 4459 roundBits = zSig0 & roundMask; 4460 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4461 if ( ( 0x7FFE < zExp ) 4462 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 4463 ) { 4464 goto overflow; 4465 } 4466 if ( zExp <= 0 ) { 4467 if (status->flush_to_zero) { 4468 float_raise(float_flag_output_denormal, status); 4469 return packFloatx80(zSign, 0, 0); 4470 } 4471 isTiny = status->tininess_before_rounding 4472 || (zExp < 0 ) 4473 || (zSig0 <= zSig0 + roundIncrement); 4474 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 4475 zExp = 0; 4476 roundBits = zSig0 & roundMask; 4477 if (isTiny && roundBits) { 4478 float_raise(float_flag_underflow, status); 4479 } 4480 if (roundBits) { 4481 float_raise(float_flag_inexact, status); 4482 } 4483 zSig0 += roundIncrement; 4484 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4485 roundIncrement = roundMask + 1; 4486 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4487 roundMask |= roundIncrement; 4488 } 4489 zSig0 &= ~ roundMask; 4490 return packFloatx80( zSign, zExp, zSig0 ); 4491 } 4492 } 4493 if (roundBits) { 4494 float_raise(float_flag_inexact, status); 4495 } 4496 zSig0 += roundIncrement; 4497 if ( zSig0 < roundIncrement ) { 4498 ++zExp; 4499 zSig0 = UINT64_C(0x8000000000000000); 4500 } 4501 roundIncrement = roundMask + 1; 4502 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4503 roundMask |= roundIncrement; 4504 } 4505 zSig0 &= ~ roundMask; 4506 if ( zSig0 == 0 ) zExp = 0; 4507 return packFloatx80( zSign, zExp, zSig0 ); 4508 precision80: 4509 switch (roundingMode) { 4510 case float_round_nearest_even: 4511 case float_round_ties_away: 4512 increment = ((int64_t)zSig1 < 0); 4513 break; 4514 case float_round_to_zero: 4515 increment = 0; 4516 break; 4517 case float_round_up: 4518 increment = !zSign && zSig1; 4519 break; 4520 case float_round_down: 4521 increment = zSign && zSig1; 4522 break; 4523 default: 4524 abort(); 4525 } 4526 if 
( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4527 if ( ( 0x7FFE < zExp ) 4528 || ( ( zExp == 0x7FFE ) 4529 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 4530 && increment 4531 ) 4532 ) { 4533 roundMask = 0; 4534 overflow: 4535 float_raise(float_flag_overflow | float_flag_inexact, status); 4536 if ( ( roundingMode == float_round_to_zero ) 4537 || ( zSign && ( roundingMode == float_round_up ) ) 4538 || ( ! zSign && ( roundingMode == float_round_down ) ) 4539 ) { 4540 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 4541 } 4542 return packFloatx80(zSign, 4543 floatx80_infinity_high, 4544 floatx80_infinity_low); 4545 } 4546 if ( zExp <= 0 ) { 4547 isTiny = status->tininess_before_rounding 4548 || (zExp < 0) 4549 || !increment 4550 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 4551 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 4552 zExp = 0; 4553 if (isTiny && zSig1) { 4554 float_raise(float_flag_underflow, status); 4555 } 4556 if (zSig1) { 4557 float_raise(float_flag_inexact, status); 4558 } 4559 switch (roundingMode) { 4560 case float_round_nearest_even: 4561 case float_round_ties_away: 4562 increment = ((int64_t)zSig1 < 0); 4563 break; 4564 case float_round_to_zero: 4565 increment = 0; 4566 break; 4567 case float_round_up: 4568 increment = !zSign && zSig1; 4569 break; 4570 case float_round_down: 4571 increment = zSign && zSig1; 4572 break; 4573 default: 4574 abort(); 4575 } 4576 if ( increment ) { 4577 ++zSig0; 4578 if (!(zSig1 << 1) && roundNearestEven) { 4579 zSig0 &= ~1; 4580 } 4581 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4582 } 4583 return packFloatx80( zSign, zExp, zSig0 ); 4584 } 4585 } 4586 if (zSig1) { 4587 float_raise(float_flag_inexact, status); 4588 } 4589 if ( increment ) { 4590 ++zSig0; 4591 if ( zSig0 == 0 ) { 4592 ++zExp; 4593 zSig0 = UINT64_C(0x8000000000000000); 4594 } 4595 else { 4596 if (!(zSig1 << 1) && roundNearestEven) { 4597 zSig0 &= ~1; 4598 } 4599 } 4600 } 4601 else { 4602 if ( zSig0 == 0 ) zExp = 0; 4603 } 4604 return 
packFloatx80( zSign, zExp, zSig0 ); 4605 4606 } 4607 4608 /*---------------------------------------------------------------------------- 4609 | Takes an abstract floating-point value having sign `zSign', exponent 4610 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4611 | and returns the proper extended double-precision floating-point value 4612 | corresponding to the abstract input. This routine is just like 4613 | `roundAndPackFloatx80' except that the input significand does not have to be 4614 | normalized. 4615 *----------------------------------------------------------------------------*/ 4616 4617 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4618 bool zSign, int32_t zExp, 4619 uint64_t zSig0, uint64_t zSig1, 4620 float_status *status) 4621 { 4622 int8_t shiftCount; 4623 4624 if ( zSig0 == 0 ) { 4625 zSig0 = zSig1; 4626 zSig1 = 0; 4627 zExp -= 64; 4628 } 4629 shiftCount = clz64(zSig0); 4630 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4631 zExp -= shiftCount; 4632 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4633 zSig0, zSig1, status); 4634 4635 } 4636 4637 /*---------------------------------------------------------------------------- 4638 | Returns the least-significant 64 fraction bits of the quadruple-precision 4639 | floating-point value `a'. 4640 *----------------------------------------------------------------------------*/ 4641 4642 static inline uint64_t extractFloat128Frac1( float128 a ) 4643 { 4644 4645 return a.low; 4646 4647 } 4648 4649 /*---------------------------------------------------------------------------- 4650 | Returns the most-significant 48 fraction bits of the quadruple-precision 4651 | floating-point value `a'. 
4652 *----------------------------------------------------------------------------*/ 4653 4654 static inline uint64_t extractFloat128Frac0( float128 a ) 4655 { 4656 4657 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4658 4659 } 4660 4661 /*---------------------------------------------------------------------------- 4662 | Returns the exponent bits of the quadruple-precision floating-point value 4663 | `a'. 4664 *----------------------------------------------------------------------------*/ 4665 4666 static inline int32_t extractFloat128Exp( float128 a ) 4667 { 4668 4669 return ( a.high>>48 ) & 0x7FFF; 4670 4671 } 4672 4673 /*---------------------------------------------------------------------------- 4674 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4675 *----------------------------------------------------------------------------*/ 4676 4677 static inline bool extractFloat128Sign(float128 a) 4678 { 4679 return a.high >> 63; 4680 } 4681 4682 /*---------------------------------------------------------------------------- 4683 | Normalizes the subnormal quadruple-precision floating-point value 4684 | represented by the denormalized significand formed by the concatenation of 4685 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4686 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4687 | significand are stored at the location pointed to by `zSig0Ptr', and the 4688 | least significant 64 bits of the normalized significand are stored at the 4689 | location pointed to by `zSig1Ptr'. 
4690 *----------------------------------------------------------------------------*/ 4691 4692 static void 4693 normalizeFloat128Subnormal( 4694 uint64_t aSig0, 4695 uint64_t aSig1, 4696 int32_t *zExpPtr, 4697 uint64_t *zSig0Ptr, 4698 uint64_t *zSig1Ptr 4699 ) 4700 { 4701 int8_t shiftCount; 4702 4703 if ( aSig0 == 0 ) { 4704 shiftCount = clz64(aSig1) - 15; 4705 if ( shiftCount < 0 ) { 4706 *zSig0Ptr = aSig1>>( - shiftCount ); 4707 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4708 } 4709 else { 4710 *zSig0Ptr = aSig1<<shiftCount; 4711 *zSig1Ptr = 0; 4712 } 4713 *zExpPtr = - shiftCount - 63; 4714 } 4715 else { 4716 shiftCount = clz64(aSig0) - 15; 4717 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4718 *zExpPtr = 1 - shiftCount; 4719 } 4720 4721 } 4722 4723 /*---------------------------------------------------------------------------- 4724 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4725 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4726 | floating-point value, returning the result. After being shifted into the 4727 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4728 | added together to form the most significant 32 bits of the result. This 4729 | means that any integer portion of `zSig0' will be added into the exponent. 4730 | Since a properly normalized significand will have an integer portion equal 4731 | to 1, the `zExp' input should be 1 less than the desired result exponent 4732 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4733 | significand. 
4734 *----------------------------------------------------------------------------*/ 4735 4736 static inline float128 4737 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4738 { 4739 float128 z; 4740 4741 z.low = zSig1; 4742 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4743 return z; 4744 } 4745 4746 /*---------------------------------------------------------------------------- 4747 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4748 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4749 | and `zSig2', and returns the proper quadruple-precision floating-point value 4750 | corresponding to the abstract input. Ordinarily, the abstract value is 4751 | simply rounded and packed into the quadruple-precision format, with the 4752 | inexact exception raised if the abstract input cannot be represented 4753 | exactly. However, if the abstract value is too large, the overflow and 4754 | inexact exceptions are raised and an infinity or maximal finite value is 4755 | returned. If the abstract value is too small, the input value is rounded to 4756 | a subnormal number, and the underflow and inexact exceptions are raised if 4757 | the abstract input cannot be represented exactly as a subnormal quadruple- 4758 | precision floating-point number. 4759 | The input significand must be normalized or smaller. If the input 4760 | significand is not normalized, `zExp' must be 0; in that case, the result 4761 | returned is a subnormal number, and it must not require rounding. In the 4762 | usual case that the input significand is normalized, `zExp' must be 1 less 4763 | than the ``true'' floating-point exponent. The handling of underflow and 4764 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4765 *----------------------------------------------------------------------------*/ 4766 4767 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4768 uint64_t zSig0, uint64_t zSig1, 4769 uint64_t zSig2, float_status *status) 4770 { 4771 int8_t roundingMode; 4772 bool roundNearestEven, increment, isTiny; 4773 4774 roundingMode = status->float_rounding_mode; 4775 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4776 switch (roundingMode) { 4777 case float_round_nearest_even: 4778 case float_round_ties_away: 4779 increment = ((int64_t)zSig2 < 0); 4780 break; 4781 case float_round_to_zero: 4782 increment = 0; 4783 break; 4784 case float_round_up: 4785 increment = !zSign && zSig2; 4786 break; 4787 case float_round_down: 4788 increment = zSign && zSig2; 4789 break; 4790 case float_round_to_odd: 4791 increment = !(zSig1 & 0x1) && zSig2; 4792 break; 4793 default: 4794 abort(); 4795 } 4796 if ( 0x7FFD <= (uint32_t) zExp ) { 4797 if ( ( 0x7FFD < zExp ) 4798 || ( ( zExp == 0x7FFD ) 4799 && eq128( 4800 UINT64_C(0x0001FFFFFFFFFFFF), 4801 UINT64_C(0xFFFFFFFFFFFFFFFF), 4802 zSig0, 4803 zSig1 4804 ) 4805 && increment 4806 ) 4807 ) { 4808 float_raise(float_flag_overflow | float_flag_inexact, status); 4809 if ( ( roundingMode == float_round_to_zero ) 4810 || ( zSign && ( roundingMode == float_round_up ) ) 4811 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4812 || (roundingMode == float_round_to_odd) 4813 ) { 4814 return 4815 packFloat128( 4816 zSign, 4817 0x7FFE, 4818 UINT64_C(0x0000FFFFFFFFFFFF), 4819 UINT64_C(0xFFFFFFFFFFFFFFFF) 4820 ); 4821 } 4822 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4823 } 4824 if ( zExp < 0 ) { 4825 if (status->flush_to_zero) { 4826 float_raise(float_flag_output_denormal, status); 4827 return packFloat128(zSign, 0, 0, 0); 4828 } 4829 isTiny = status->tininess_before_rounding 4830 || (zExp < -1) 4831 || !increment 4832 || lt128(zSig0, zSig1, 4833 UINT64_C(0x0001FFFFFFFFFFFF), 4834 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4835 shift128ExtraRightJamming( 4836 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4837 zExp = 0; 4838 if (isTiny && zSig2) { 4839 float_raise(float_flag_underflow, status); 4840 } 4841 switch (roundingMode) { 4842 case float_round_nearest_even: 4843 case float_round_ties_away: 4844 increment = ((int64_t)zSig2 < 0); 4845 break; 4846 case float_round_to_zero: 4847 increment = 0; 4848 break; 4849 case float_round_up: 4850 increment = !zSign && zSig2; 4851 break; 4852 case float_round_down: 4853 increment = zSign && zSig2; 4854 break; 4855 case float_round_to_odd: 4856 increment = !(zSig1 & 0x1) && zSig2; 4857 break; 4858 default: 4859 abort(); 4860 } 4861 } 4862 } 4863 if (zSig2) { 4864 float_raise(float_flag_inexact, status); 4865 } 4866 if ( increment ) { 4867 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4868 if ((zSig2 + zSig2 == 0) && roundNearestEven) { 4869 zSig1 &= ~1; 4870 } 4871 } 4872 else { 4873 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4874 } 4875 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4876 4877 } 4878 4879 /*---------------------------------------------------------------------------- 4880 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4881 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4882 | returns the proper quadruple-precision floating-point 
value corresponding 4883 | to the abstract input. This routine is just like `roundAndPackFloat128' 4884 | except that the input significand has fewer bits and does not have to be 4885 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4886 | point exponent. 4887 *----------------------------------------------------------------------------*/ 4888 4889 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4890 uint64_t zSig0, uint64_t zSig1, 4891 float_status *status) 4892 { 4893 int8_t shiftCount; 4894 uint64_t zSig2; 4895 4896 if ( zSig0 == 0 ) { 4897 zSig0 = zSig1; 4898 zSig1 = 0; 4899 zExp -= 64; 4900 } 4901 shiftCount = clz64(zSig0) - 15; 4902 if ( 0 <= shiftCount ) { 4903 zSig2 = 0; 4904 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4905 } 4906 else { 4907 shift128ExtraRightJamming( 4908 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4909 } 4910 zExp -= shiftCount; 4911 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4912 4913 } 4914 4915 4916 /*---------------------------------------------------------------------------- 4917 | Returns the result of converting the 32-bit two's complement integer `a' 4918 | to the extended double-precision floating-point format. The conversion 4919 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4920 | Arithmetic. 4921 *----------------------------------------------------------------------------*/ 4922 4923 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4924 { 4925 bool zSign; 4926 uint32_t absA; 4927 int8_t shiftCount; 4928 uint64_t zSig; 4929 4930 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4931 zSign = ( a < 0 ); 4932 absA = zSign ? 
- a : a; 4933 shiftCount = clz32(absA) + 32; 4934 zSig = absA; 4935 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4936 4937 } 4938 4939 /*---------------------------------------------------------------------------- 4940 | Returns the result of converting the 32-bit two's complement integer `a' to 4941 | the quadruple-precision floating-point format. The conversion is performed 4942 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4943 *----------------------------------------------------------------------------*/ 4944 4945 float128 int32_to_float128(int32_t a, float_status *status) 4946 { 4947 bool zSign; 4948 uint32_t absA; 4949 int8_t shiftCount; 4950 uint64_t zSig0; 4951 4952 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4953 zSign = ( a < 0 ); 4954 absA = zSign ? - a : a; 4955 shiftCount = clz32(absA) + 17; 4956 zSig0 = absA; 4957 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4958 4959 } 4960 4961 /*---------------------------------------------------------------------------- 4962 | Returns the result of converting the 64-bit two's complement integer `a' 4963 | to the extended double-precision floating-point format. The conversion 4964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4965 | Arithmetic. 4966 *----------------------------------------------------------------------------*/ 4967 4968 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4969 { 4970 bool zSign; 4971 uint64_t absA; 4972 int8_t shiftCount; 4973 4974 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4975 zSign = ( a < 0 ); 4976 absA = zSign ? 
- a : a; 4977 shiftCount = clz64(absA); 4978 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4979 4980 } 4981 4982 /*---------------------------------------------------------------------------- 4983 | Returns the result of converting the 64-bit two's complement integer `a' to 4984 | the quadruple-precision floating-point format. The conversion is performed 4985 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4986 *----------------------------------------------------------------------------*/ 4987 4988 float128 int64_to_float128(int64_t a, float_status *status) 4989 { 4990 bool zSign; 4991 uint64_t absA; 4992 int8_t shiftCount; 4993 int32_t zExp; 4994 uint64_t zSig0, zSig1; 4995 4996 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4997 zSign = ( a < 0 ); 4998 absA = zSign ? - a : a; 4999 shiftCount = clz64(absA) + 49; 5000 zExp = 0x406E - shiftCount; 5001 if ( 64 <= shiftCount ) { 5002 zSig1 = 0; 5003 zSig0 = absA; 5004 shiftCount -= 64; 5005 } 5006 else { 5007 zSig1 = absA; 5008 zSig0 = 0; 5009 } 5010 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5011 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5012 5013 } 5014 5015 /*---------------------------------------------------------------------------- 5016 | Returns the result of converting the 64-bit unsigned integer `a' 5017 | to the quadruple-precision floating-point format. The conversion is performed 5018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5019 *----------------------------------------------------------------------------*/ 5020 5021 float128 uint64_to_float128(uint64_t a, float_status *status) 5022 { 5023 if (a == 0) { 5024 return float128_zero; 5025 } 5026 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 5027 } 5028 5029 /*---------------------------------------------------------------------------- 5030 | Returns the result of converting the single-precision floating-point value 5031 | `a' to the extended double-precision floating-point format. The conversion 5032 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5033 | Arithmetic. 5034 *----------------------------------------------------------------------------*/ 5035 5036 floatx80 float32_to_floatx80(float32 a, float_status *status) 5037 { 5038 bool aSign; 5039 int aExp; 5040 uint32_t aSig; 5041 5042 a = float32_squash_input_denormal(a, status); 5043 aSig = extractFloat32Frac( a ); 5044 aExp = extractFloat32Exp( a ); 5045 aSign = extractFloat32Sign( a ); 5046 if ( aExp == 0xFF ) { 5047 if (aSig) { 5048 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 5049 status); 5050 return floatx80_silence_nan(res, status); 5051 } 5052 return packFloatx80(aSign, 5053 floatx80_infinity_high, 5054 floatx80_infinity_low); 5055 } 5056 if ( aExp == 0 ) { 5057 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5058 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5059 } 5060 aSig |= 0x00800000; 5061 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 5062 5063 } 5064 5065 /*---------------------------------------------------------------------------- 5066 | Returns the result of converting the single-precision floating-point value 5067 | `a' to the double-precision floating-point format. The conversion is 5068 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5069 | Arithmetic. 
5070 *----------------------------------------------------------------------------*/ 5071 5072 float128 float32_to_float128(float32 a, float_status *status) 5073 { 5074 bool aSign; 5075 int aExp; 5076 uint32_t aSig; 5077 5078 a = float32_squash_input_denormal(a, status); 5079 aSig = extractFloat32Frac( a ); 5080 aExp = extractFloat32Exp( a ); 5081 aSign = extractFloat32Sign( a ); 5082 if ( aExp == 0xFF ) { 5083 if (aSig) { 5084 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 5085 } 5086 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5087 } 5088 if ( aExp == 0 ) { 5089 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5090 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5091 --aExp; 5092 } 5093 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 5094 5095 } 5096 5097 /*---------------------------------------------------------------------------- 5098 | Returns the remainder of the single-precision floating-point value `a' 5099 | with respect to the corresponding value `b'. The operation is performed 5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5101 *----------------------------------------------------------------------------*/ 5102 5103 float32 float32_rem(float32 a, float32 b, float_status *status) 5104 { 5105 bool aSign, zSign; 5106 int aExp, bExp, expDiff; 5107 uint32_t aSig, bSig; 5108 uint32_t q; 5109 uint64_t aSig64, bSig64, q64; 5110 uint32_t alternateASig; 5111 int32_t sigMean; 5112 a = float32_squash_input_denormal(a, status); 5113 b = float32_squash_input_denormal(b, status); 5114 5115 aSig = extractFloat32Frac( a ); 5116 aExp = extractFloat32Exp( a ); 5117 aSign = extractFloat32Sign( a ); 5118 bSig = extractFloat32Frac( b ); 5119 bExp = extractFloat32Exp( b ); 5120 if ( aExp == 0xFF ) { 5121 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 5122 return propagateFloat32NaN(a, b, status); 5123 } 5124 float_raise(float_flag_invalid, status); 5125 return float32_default_nan(status); 5126 } 5127 if ( bExp == 0xFF ) { 5128 if (bSig) { 5129 return propagateFloat32NaN(a, b, status); 5130 } 5131 return a; 5132 } 5133 if ( bExp == 0 ) { 5134 if ( bSig == 0 ) { 5135 float_raise(float_flag_invalid, status); 5136 return float32_default_nan(status); 5137 } 5138 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 5139 } 5140 if ( aExp == 0 ) { 5141 if ( aSig == 0 ) return a; 5142 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5143 } 5144 expDiff = aExp - bExp; 5145 aSig |= 0x00800000; 5146 bSig |= 0x00800000; 5147 if ( expDiff < 32 ) { 5148 aSig <<= 8; 5149 bSig <<= 8; 5150 if ( expDiff < 0 ) { 5151 if ( expDiff < -1 ) return a; 5152 aSig >>= 1; 5153 } 5154 q = ( bSig <= aSig ); 5155 if ( q ) aSig -= bSig; 5156 if ( 0 < expDiff ) { 5157 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 5158 q >>= 32 - expDiff; 5159 bSig >>= 2; 5160 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5161 } 5162 else { 5163 aSig >>= 2; 5164 bSig >>= 2; 5165 } 5166 } 5167 else { 5168 if ( bSig <= aSig ) aSig -= bSig; 5169 aSig64 = ( (uint64_t) aSig )<<40; 5170 bSig64 = ( (uint64_t) bSig )<<40; 5171 expDiff -= 64; 5172 while ( 0 < expDiff ) { 
5173 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 5174 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 5175 aSig64 = - ( ( bSig * q64 )<<38 ); 5176 expDiff -= 62; 5177 } 5178 expDiff += 64; 5179 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 5180 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 5181 q = q64>>( 64 - expDiff ); 5182 bSig <<= 6; 5183 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 5184 } 5185 do { 5186 alternateASig = aSig; 5187 ++q; 5188 aSig -= bSig; 5189 } while ( 0 <= (int32_t) aSig ); 5190 sigMean = aSig + alternateASig; 5191 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5192 aSig = alternateASig; 5193 } 5194 zSign = ( (int32_t) aSig < 0 ); 5195 if ( zSign ) aSig = - aSig; 5196 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 5197 } 5198 5199 5200 5201 /*---------------------------------------------------------------------------- 5202 | Returns the binary exponential of the single-precision floating-point value 5203 | `a'. The operation is performed according to the IEC/IEEE Standard for 5204 | Binary Floating-Point Arithmetic. 5205 | 5206 | Uses the following identities: 5207 | 5208 | 1. ------------------------------------------------------------------------- 5209 | x x*ln(2) 5210 | 2 = e 5211 | 5212 | 2. ------------------------------------------------------------------------- 5213 | 2 3 4 5 n 5214 | x x x x x x x 5215 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 5216 | 1! 2! 3! 4! 5! n! 
5217 *----------------------------------------------------------------------------*/ 5218 5219 static const float64 float32_exp2_coefficients[15] = 5220 { 5221 const_float64( 0x3ff0000000000000ll ), /* 1 */ 5222 const_float64( 0x3fe0000000000000ll ), /* 2 */ 5223 const_float64( 0x3fc5555555555555ll ), /* 3 */ 5224 const_float64( 0x3fa5555555555555ll ), /* 4 */ 5225 const_float64( 0x3f81111111111111ll ), /* 5 */ 5226 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 5227 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 5228 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 5229 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 5230 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 5231 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 5232 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 5233 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 5234 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 5235 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 5236 }; 5237 5238 float32 float32_exp2(float32 a, float_status *status) 5239 { 5240 bool aSign; 5241 int aExp; 5242 uint32_t aSig; 5243 float64 r, x, xn; 5244 int i; 5245 a = float32_squash_input_denormal(a, status); 5246 5247 aSig = extractFloat32Frac( a ); 5248 aExp = extractFloat32Exp( a ); 5249 aSign = extractFloat32Sign( a ); 5250 5251 if ( aExp == 0xFF) { 5252 if (aSig) { 5253 return propagateFloat32NaN(a, float32_zero, status); 5254 } 5255 return (aSign) ? 
float32_zero : a; 5256 } 5257 if (aExp == 0) { 5258 if (aSig == 0) return float32_one; 5259 } 5260 5261 float_raise(float_flag_inexact, status); 5262 5263 /* ******************************* */ 5264 /* using float64 for approximation */ 5265 /* ******************************* */ 5266 x = float32_to_float64(a, status); 5267 x = float64_mul(x, float64_ln2, status); 5268 5269 xn = x; 5270 r = float64_one; 5271 for (i = 0 ; i < 15 ; i++) { 5272 float64 f; 5273 5274 f = float64_mul(xn, float32_exp2_coefficients[i], status); 5275 r = float64_add(r, f, status); 5276 5277 xn = float64_mul(xn, x, status); 5278 } 5279 5280 return float64_to_float32(r, status); 5281 } 5282 5283 /*---------------------------------------------------------------------------- 5284 | Returns the binary log of the single-precision floating-point value `a'. 5285 | The operation is performed according to the IEC/IEEE Standard for Binary 5286 | Floating-Point Arithmetic. 5287 *----------------------------------------------------------------------------*/ 5288 float32 float32_log2(float32 a, float_status *status) 5289 { 5290 bool aSign, zSign; 5291 int aExp; 5292 uint32_t aSig, zSig, i; 5293 5294 a = float32_squash_input_denormal(a, status); 5295 aSig = extractFloat32Frac( a ); 5296 aExp = extractFloat32Exp( a ); 5297 aSign = extractFloat32Sign( a ); 5298 5299 if ( aExp == 0 ) { 5300 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 5301 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5302 } 5303 if ( aSign ) { 5304 float_raise(float_flag_invalid, status); 5305 return float32_default_nan(status); 5306 } 5307 if ( aExp == 0xFF ) { 5308 if (aSig) { 5309 return propagateFloat32NaN(a, float32_zero, status); 5310 } 5311 return a; 5312 } 5313 5314 aExp -= 0x7F; 5315 aSig |= 0x00800000; 5316 zSign = aExp < 0; 5317 zSig = aExp << 23; 5318 5319 for (i = 1 << 22; i > 0; i >>= 1) { 5320 aSig = ( (uint64_t)aSig * aSig ) >> 23; 5321 if ( aSig & 0x01000000 ) { 5322 aSig >>= 1; 5323 zSig |= i; 5324 } 5325 } 5326 
5327 if ( zSign ) 5328 zSig = -zSig; 5329 5330 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 5331 } 5332 5333 /*---------------------------------------------------------------------------- 5334 | Returns the result of converting the double-precision floating-point value 5335 | `a' to the extended double-precision floating-point format. The conversion 5336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5337 | Arithmetic. 5338 *----------------------------------------------------------------------------*/ 5339 5340 floatx80 float64_to_floatx80(float64 a, float_status *status) 5341 { 5342 bool aSign; 5343 int aExp; 5344 uint64_t aSig; 5345 5346 a = float64_squash_input_denormal(a, status); 5347 aSig = extractFloat64Frac( a ); 5348 aExp = extractFloat64Exp( a ); 5349 aSign = extractFloat64Sign( a ); 5350 if ( aExp == 0x7FF ) { 5351 if (aSig) { 5352 floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status), 5353 status); 5354 return floatx80_silence_nan(res, status); 5355 } 5356 return packFloatx80(aSign, 5357 floatx80_infinity_high, 5358 floatx80_infinity_low); 5359 } 5360 if ( aExp == 0 ) { 5361 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5362 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5363 } 5364 return 5365 packFloatx80( 5366 aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11); 5367 5368 } 5369 5370 /*---------------------------------------------------------------------------- 5371 | Returns the result of converting the double-precision floating-point value 5372 | `a' to the quadruple-precision floating-point format. The conversion is 5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5374 | Arithmetic. 
5375 *----------------------------------------------------------------------------*/ 5376 5377 float128 float64_to_float128(float64 a, float_status *status) 5378 { 5379 bool aSign; 5380 int aExp; 5381 uint64_t aSig, zSig0, zSig1; 5382 5383 a = float64_squash_input_denormal(a, status); 5384 aSig = extractFloat64Frac( a ); 5385 aExp = extractFloat64Exp( a ); 5386 aSign = extractFloat64Sign( a ); 5387 if ( aExp == 0x7FF ) { 5388 if (aSig) { 5389 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5390 } 5391 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5392 } 5393 if ( aExp == 0 ) { 5394 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5395 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5396 --aExp; 5397 } 5398 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5399 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5400 5401 } 5402 5403 5404 /*---------------------------------------------------------------------------- 5405 | Returns the remainder of the double-precision floating-point value `a' 5406 | with respect to the corresponding value `b'. The operation is performed 5407 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5408 *----------------------------------------------------------------------------*/ 5409 5410 float64 float64_rem(float64 a, float64 b, float_status *status) 5411 { 5412 bool aSign, zSign; 5413 int aExp, bExp, expDiff; 5414 uint64_t aSig, bSig; 5415 uint64_t q, alternateASig; 5416 int64_t sigMean; 5417 5418 a = float64_squash_input_denormal(a, status); 5419 b = float64_squash_input_denormal(b, status); 5420 aSig = extractFloat64Frac( a ); 5421 aExp = extractFloat64Exp( a ); 5422 aSign = extractFloat64Sign( a ); 5423 bSig = extractFloat64Frac( b ); 5424 bExp = extractFloat64Exp( b ); 5425 if ( aExp == 0x7FF ) { 5426 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5427 return propagateFloat64NaN(a, b, status); 5428 } 5429 float_raise(float_flag_invalid, status); 5430 return float64_default_nan(status); 5431 } 5432 if ( bExp == 0x7FF ) { 5433 if (bSig) { 5434 return propagateFloat64NaN(a, b, status); 5435 } 5436 return a; 5437 } 5438 if ( bExp == 0 ) { 5439 if ( bSig == 0 ) { 5440 float_raise(float_flag_invalid, status); 5441 return float64_default_nan(status); 5442 } 5443 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5444 } 5445 if ( aExp == 0 ) { 5446 if ( aSig == 0 ) return a; 5447 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5448 } 5449 expDiff = aExp - bExp; 5450 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5451 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5452 if ( expDiff < 0 ) { 5453 if ( expDiff < -1 ) return a; 5454 aSig >>= 1; 5455 } 5456 q = ( bSig <= aSig ); 5457 if ( q ) aSig -= bSig; 5458 expDiff -= 64; 5459 while ( 0 < expDiff ) { 5460 q = estimateDiv128To64( aSig, 0, bSig ); 5461 q = ( 2 < q ) ? q - 2 : 0; 5462 aSig = - ( ( bSig>>2 ) * q ); 5463 expDiff -= 62; 5464 } 5465 expDiff += 64; 5466 if ( 0 < expDiff ) { 5467 q = estimateDiv128To64( aSig, 0, bSig ); 5468 q = ( 2 < q ) ? 
q - 2 : 0; 5469 q >>= 64 - expDiff; 5470 bSig >>= 2; 5471 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5472 } 5473 else { 5474 aSig >>= 2; 5475 bSig >>= 2; 5476 } 5477 do { 5478 alternateASig = aSig; 5479 ++q; 5480 aSig -= bSig; 5481 } while ( 0 <= (int64_t) aSig ); 5482 sigMean = aSig + alternateASig; 5483 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5484 aSig = alternateASig; 5485 } 5486 zSign = ( (int64_t) aSig < 0 ); 5487 if ( zSign ) aSig = - aSig; 5488 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5489 5490 } 5491 5492 /*---------------------------------------------------------------------------- 5493 | Returns the binary log of the double-precision floating-point value `a'. 5494 | The operation is performed according to the IEC/IEEE Standard for Binary 5495 | Floating-Point Arithmetic. 5496 *----------------------------------------------------------------------------*/ 5497 float64 float64_log2(float64 a, float_status *status) 5498 { 5499 bool aSign, zSign; 5500 int aExp; 5501 uint64_t aSig, aSig0, aSig1, zSig, i; 5502 a = float64_squash_input_denormal(a, status); 5503 5504 aSig = extractFloat64Frac( a ); 5505 aExp = extractFloat64Exp( a ); 5506 aSign = extractFloat64Sign( a ); 5507 5508 if ( aExp == 0 ) { 5509 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5510 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5511 } 5512 if ( aSign ) { 5513 float_raise(float_flag_invalid, status); 5514 return float64_default_nan(status); 5515 } 5516 if ( aExp == 0x7FF ) { 5517 if (aSig) { 5518 return propagateFloat64NaN(a, float64_zero, status); 5519 } 5520 return a; 5521 } 5522 5523 aExp -= 0x3FF; 5524 aSig |= UINT64_C(0x0010000000000000); 5525 zSign = aExp < 0; 5526 zSig = (uint64_t)aExp << 52; 5527 for (i = 1LL << 51; i > 0; i >>= 1) { 5528 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5529 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5530 if ( aSig & UINT64_C(0x0020000000000000) ) { 5531 aSig >>= 1; 5532 zSig |= i; 5533 } 
5534 } 5535 5536 if ( zSign ) 5537 zSig = -zSig; 5538 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5539 } 5540 5541 /*---------------------------------------------------------------------------- 5542 | Returns the result of converting the extended double-precision floating- 5543 | point value `a' to the 32-bit two's complement integer format. The 5544 | conversion is performed according to the IEC/IEEE Standard for Binary 5545 | Floating-Point Arithmetic---which means in particular that the conversion 5546 | is rounded according to the current rounding mode. If `a' is a NaN, the 5547 | largest positive integer is returned. Otherwise, if the conversion 5548 | overflows, the largest integer with the same sign as `a' is returned. 5549 *----------------------------------------------------------------------------*/ 5550 5551 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5552 { 5553 bool aSign; 5554 int32_t aExp, shiftCount; 5555 uint64_t aSig; 5556 5557 if (floatx80_invalid_encoding(a)) { 5558 float_raise(float_flag_invalid, status); 5559 return 1 << 31; 5560 } 5561 aSig = extractFloatx80Frac( a ); 5562 aExp = extractFloatx80Exp( a ); 5563 aSign = extractFloatx80Sign( a ); 5564 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5565 shiftCount = 0x4037 - aExp; 5566 if ( shiftCount <= 0 ) shiftCount = 1; 5567 shift64RightJamming( aSig, shiftCount, &aSig ); 5568 return roundAndPackInt32(aSign, aSig, status); 5569 5570 } 5571 5572 /*---------------------------------------------------------------------------- 5573 | Returns the result of converting the extended double-precision floating- 5574 | point value `a' to the 32-bit two's complement integer format. The 5575 | conversion is performed according to the IEC/IEEE Standard for Binary 5576 | Floating-Point Arithmetic, except that the conversion is always rounded 5577 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5578 | Otherwise, if the conversion overflows, the largest integer with the same 5579 | sign as `a' is returned. 5580 *----------------------------------------------------------------------------*/ 5581 5582 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5583 { 5584 bool aSign; 5585 int32_t aExp, shiftCount; 5586 uint64_t aSig, savedASig; 5587 int32_t z; 5588 5589 if (floatx80_invalid_encoding(a)) { 5590 float_raise(float_flag_invalid, status); 5591 return 1 << 31; 5592 } 5593 aSig = extractFloatx80Frac( a ); 5594 aExp = extractFloatx80Exp( a ); 5595 aSign = extractFloatx80Sign( a ); 5596 if ( 0x401E < aExp ) { 5597 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5598 goto invalid; 5599 } 5600 else if ( aExp < 0x3FFF ) { 5601 if (aExp || aSig) { 5602 float_raise(float_flag_inexact, status); 5603 } 5604 return 0; 5605 } 5606 shiftCount = 0x403E - aExp; 5607 savedASig = aSig; 5608 aSig >>= shiftCount; 5609 z = aSig; 5610 if ( aSign ) z = - z; 5611 if ( ( z < 0 ) ^ aSign ) { 5612 invalid: 5613 float_raise(float_flag_invalid, status); 5614 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5615 } 5616 if ( ( aSig<<shiftCount ) != savedASig ) { 5617 float_raise(float_flag_inexact, status); 5618 } 5619 return z; 5620 5621 } 5622 5623 /*---------------------------------------------------------------------------- 5624 | Returns the result of converting the extended double-precision floating- 5625 | point value `a' to the 64-bit two's complement integer format. The 5626 | conversion is performed according to the IEC/IEEE Standard for Binary 5627 | Floating-Point Arithmetic---which means in particular that the conversion 5628 | is rounded according to the current rounding mode. If `a' is a NaN, 5629 | the largest positive integer is returned. Otherwise, if the conversion 5630 | overflows, the largest integer with the same sign as `a' is returned. 
5631 *----------------------------------------------------------------------------*/ 5632 5633 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5634 { 5635 bool aSign; 5636 int32_t aExp, shiftCount; 5637 uint64_t aSig, aSigExtra; 5638 5639 if (floatx80_invalid_encoding(a)) { 5640 float_raise(float_flag_invalid, status); 5641 return 1ULL << 63; 5642 } 5643 aSig = extractFloatx80Frac( a ); 5644 aExp = extractFloatx80Exp( a ); 5645 aSign = extractFloatx80Sign( a ); 5646 shiftCount = 0x403E - aExp; 5647 if ( shiftCount <= 0 ) { 5648 if ( shiftCount ) { 5649 float_raise(float_flag_invalid, status); 5650 if (!aSign || floatx80_is_any_nan(a)) { 5651 return INT64_MAX; 5652 } 5653 return INT64_MIN; 5654 } 5655 aSigExtra = 0; 5656 } 5657 else { 5658 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5659 } 5660 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5661 5662 } 5663 5664 /*---------------------------------------------------------------------------- 5665 | Returns the result of converting the extended double-precision floating- 5666 | point value `a' to the 64-bit two's complement integer format. The 5667 | conversion is performed according to the IEC/IEEE Standard for Binary 5668 | Floating-Point Arithmetic, except that the conversion is always rounded 5669 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5670 | Otherwise, if the conversion overflows, the largest integer with the same 5671 | sign as `a' is returned. 
5672 *----------------------------------------------------------------------------*/ 5673 5674 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5675 { 5676 bool aSign; 5677 int32_t aExp, shiftCount; 5678 uint64_t aSig; 5679 int64_t z; 5680 5681 if (floatx80_invalid_encoding(a)) { 5682 float_raise(float_flag_invalid, status); 5683 return 1ULL << 63; 5684 } 5685 aSig = extractFloatx80Frac( a ); 5686 aExp = extractFloatx80Exp( a ); 5687 aSign = extractFloatx80Sign( a ); 5688 shiftCount = aExp - 0x403E; 5689 if ( 0 <= shiftCount ) { 5690 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5691 if ( ( a.high != 0xC03E ) || aSig ) { 5692 float_raise(float_flag_invalid, status); 5693 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5694 return INT64_MAX; 5695 } 5696 } 5697 return INT64_MIN; 5698 } 5699 else if ( aExp < 0x3FFF ) { 5700 if (aExp | aSig) { 5701 float_raise(float_flag_inexact, status); 5702 } 5703 return 0; 5704 } 5705 z = aSig>>( - shiftCount ); 5706 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5707 float_raise(float_flag_inexact, status); 5708 } 5709 if ( aSign ) z = - z; 5710 return z; 5711 5712 } 5713 5714 /*---------------------------------------------------------------------------- 5715 | Returns the result of converting the extended double-precision floating- 5716 | point value `a' to the single-precision floating-point format. The 5717 | conversion is performed according to the IEC/IEEE Standard for Binary 5718 | Floating-Point Arithmetic. 
5719 *----------------------------------------------------------------------------*/ 5720 5721 float32 floatx80_to_float32(floatx80 a, float_status *status) 5722 { 5723 bool aSign; 5724 int32_t aExp; 5725 uint64_t aSig; 5726 5727 if (floatx80_invalid_encoding(a)) { 5728 float_raise(float_flag_invalid, status); 5729 return float32_default_nan(status); 5730 } 5731 aSig = extractFloatx80Frac( a ); 5732 aExp = extractFloatx80Exp( a ); 5733 aSign = extractFloatx80Sign( a ); 5734 if ( aExp == 0x7FFF ) { 5735 if ( (uint64_t) ( aSig<<1 ) ) { 5736 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5737 status); 5738 return float32_silence_nan(res, status); 5739 } 5740 return packFloat32( aSign, 0xFF, 0 ); 5741 } 5742 shift64RightJamming( aSig, 33, &aSig ); 5743 if ( aExp || aSig ) aExp -= 0x3F81; 5744 return roundAndPackFloat32(aSign, aExp, aSig, status); 5745 5746 } 5747 5748 /*---------------------------------------------------------------------------- 5749 | Returns the result of converting the extended double-precision floating- 5750 | point value `a' to the double-precision floating-point format. The 5751 | conversion is performed according to the IEC/IEEE Standard for Binary 5752 | Floating-Point Arithmetic. 
5753 *----------------------------------------------------------------------------*/ 5754 5755 float64 floatx80_to_float64(floatx80 a, float_status *status) 5756 { 5757 bool aSign; 5758 int32_t aExp; 5759 uint64_t aSig, zSig; 5760 5761 if (floatx80_invalid_encoding(a)) { 5762 float_raise(float_flag_invalid, status); 5763 return float64_default_nan(status); 5764 } 5765 aSig = extractFloatx80Frac( a ); 5766 aExp = extractFloatx80Exp( a ); 5767 aSign = extractFloatx80Sign( a ); 5768 if ( aExp == 0x7FFF ) { 5769 if ( (uint64_t) ( aSig<<1 ) ) { 5770 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5771 status); 5772 return float64_silence_nan(res, status); 5773 } 5774 return packFloat64( aSign, 0x7FF, 0 ); 5775 } 5776 shift64RightJamming( aSig, 1, &zSig ); 5777 if ( aExp || aSig ) aExp -= 0x3C01; 5778 return roundAndPackFloat64(aSign, aExp, zSig, status); 5779 5780 } 5781 5782 /*---------------------------------------------------------------------------- 5783 | Returns the result of converting the extended double-precision floating- 5784 | point value `a' to the quadruple-precision floating-point format. The 5785 | conversion is performed according to the IEC/IEEE Standard for Binary 5786 | Floating-Point Arithmetic. 
5787 *----------------------------------------------------------------------------*/ 5788 5789 float128 floatx80_to_float128(floatx80 a, float_status *status) 5790 { 5791 bool aSign; 5792 int aExp; 5793 uint64_t aSig, zSig0, zSig1; 5794 5795 if (floatx80_invalid_encoding(a)) { 5796 float_raise(float_flag_invalid, status); 5797 return float128_default_nan(status); 5798 } 5799 aSig = extractFloatx80Frac( a ); 5800 aExp = extractFloatx80Exp( a ); 5801 aSign = extractFloatx80Sign( a ); 5802 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5803 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5804 status); 5805 return float128_silence_nan(res, status); 5806 } 5807 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5808 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5809 5810 } 5811 5812 /*---------------------------------------------------------------------------- 5813 | Rounds the extended double-precision floating-point value `a' 5814 | to the precision provided by floatx80_rounding_precision and returns the 5815 | result as an extended double-precision floating-point value. 5816 | The operation is performed according to the IEC/IEEE Standard for Binary 5817 | Floating-Point Arithmetic. 5818 *----------------------------------------------------------------------------*/ 5819 5820 floatx80 floatx80_round(floatx80 a, float_status *status) 5821 { 5822 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5823 extractFloatx80Sign(a), 5824 extractFloatx80Exp(a), 5825 extractFloatx80Frac(a), 0, status); 5826 } 5827 5828 /*---------------------------------------------------------------------------- 5829 | Rounds the extended double-precision floating-point value `a' to an integer, 5830 | and returns the result as an extended quadruple-precision floating-point 5831 | value. The operation is performed according to the IEC/IEEE Standard for 5832 | Binary Floating-Point Arithmetic. 
5833 *----------------------------------------------------------------------------*/ 5834 5835 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5836 { 5837 bool aSign; 5838 int32_t aExp; 5839 uint64_t lastBitMask, roundBitsMask; 5840 floatx80 z; 5841 5842 if (floatx80_invalid_encoding(a)) { 5843 float_raise(float_flag_invalid, status); 5844 return floatx80_default_nan(status); 5845 } 5846 aExp = extractFloatx80Exp( a ); 5847 if ( 0x403E <= aExp ) { 5848 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5849 return propagateFloatx80NaN(a, a, status); 5850 } 5851 return a; 5852 } 5853 if ( aExp < 0x3FFF ) { 5854 if ( ( aExp == 0 ) 5855 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5856 return a; 5857 } 5858 float_raise(float_flag_inexact, status); 5859 aSign = extractFloatx80Sign( a ); 5860 switch (status->float_rounding_mode) { 5861 case float_round_nearest_even: 5862 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5863 ) { 5864 return 5865 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5866 } 5867 break; 5868 case float_round_ties_away: 5869 if (aExp == 0x3FFE) { 5870 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5871 } 5872 break; 5873 case float_round_down: 5874 return 5875 aSign ? 5876 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5877 : packFloatx80( 0, 0, 0 ); 5878 case float_round_up: 5879 return 5880 aSign ? 
packFloatx80( 1, 0, 0 ) 5881 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5882 5883 case float_round_to_zero: 5884 break; 5885 default: 5886 g_assert_not_reached(); 5887 } 5888 return packFloatx80( aSign, 0, 0 ); 5889 } 5890 lastBitMask = 1; 5891 lastBitMask <<= 0x403E - aExp; 5892 roundBitsMask = lastBitMask - 1; 5893 z = a; 5894 switch (status->float_rounding_mode) { 5895 case float_round_nearest_even: 5896 z.low += lastBitMask>>1; 5897 if ((z.low & roundBitsMask) == 0) { 5898 z.low &= ~lastBitMask; 5899 } 5900 break; 5901 case float_round_ties_away: 5902 z.low += lastBitMask >> 1; 5903 break; 5904 case float_round_to_zero: 5905 break; 5906 case float_round_up: 5907 if (!extractFloatx80Sign(z)) { 5908 z.low += roundBitsMask; 5909 } 5910 break; 5911 case float_round_down: 5912 if (extractFloatx80Sign(z)) { 5913 z.low += roundBitsMask; 5914 } 5915 break; 5916 default: 5917 abort(); 5918 } 5919 z.low &= ~ roundBitsMask; 5920 if ( z.low == 0 ) { 5921 ++z.high; 5922 z.low = UINT64_C(0x8000000000000000); 5923 } 5924 if (z.low != a.low) { 5925 float_raise(float_flag_inexact, status); 5926 } 5927 return z; 5928 5929 } 5930 5931 /*---------------------------------------------------------------------------- 5932 | Returns the result of adding the absolute values of the extended double- 5933 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5934 | negated before being returned. `zSign' is ignored if the result is a NaN. 5935 | The addition is performed according to the IEC/IEEE Standard for Binary 5936 | Floating-Point Arithmetic. 
5937 *----------------------------------------------------------------------------*/ 5938 5939 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 5940 float_status *status) 5941 { 5942 int32_t aExp, bExp, zExp; 5943 uint64_t aSig, bSig, zSig0, zSig1; 5944 int32_t expDiff; 5945 5946 aSig = extractFloatx80Frac( a ); 5947 aExp = extractFloatx80Exp( a ); 5948 bSig = extractFloatx80Frac( b ); 5949 bExp = extractFloatx80Exp( b ); 5950 expDiff = aExp - bExp; 5951 if ( 0 < expDiff ) { 5952 if ( aExp == 0x7FFF ) { 5953 if ((uint64_t)(aSig << 1)) { 5954 return propagateFloatx80NaN(a, b, status); 5955 } 5956 return a; 5957 } 5958 if ( bExp == 0 ) --expDiff; 5959 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5960 zExp = aExp; 5961 } 5962 else if ( expDiff < 0 ) { 5963 if ( bExp == 0x7FFF ) { 5964 if ((uint64_t)(bSig << 1)) { 5965 return propagateFloatx80NaN(a, b, status); 5966 } 5967 return packFloatx80(zSign, 5968 floatx80_infinity_high, 5969 floatx80_infinity_low); 5970 } 5971 if ( aExp == 0 ) ++expDiff; 5972 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5973 zExp = bExp; 5974 } 5975 else { 5976 if ( aExp == 0x7FFF ) { 5977 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5978 return propagateFloatx80NaN(a, b, status); 5979 } 5980 return a; 5981 } 5982 zSig1 = 0; 5983 zSig0 = aSig + bSig; 5984 if ( aExp == 0 ) { 5985 if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) { 5986 /* At least one of the values is a pseudo-denormal, 5987 * and there is a carry out of the result. 
*/ 5988 zExp = 1; 5989 goto shiftRight1; 5990 } 5991 if (zSig0 == 0) { 5992 return packFloatx80(zSign, 0, 0); 5993 } 5994 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5995 goto roundAndPack; 5996 } 5997 zExp = aExp; 5998 goto shiftRight1; 5999 } 6000 zSig0 = aSig + bSig; 6001 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 6002 shiftRight1: 6003 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 6004 zSig0 |= UINT64_C(0x8000000000000000); 6005 ++zExp; 6006 roundAndPack: 6007 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6008 zSign, zExp, zSig0, zSig1, status); 6009 } 6010 6011 /*---------------------------------------------------------------------------- 6012 | Returns the result of subtracting the absolute values of the extended 6013 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 6014 | difference is negated before being returned. `zSign' is ignored if the 6015 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6016 | Standard for Binary Floating-Point Arithmetic. 
6017 *----------------------------------------------------------------------------*/ 6018 6019 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 6020 float_status *status) 6021 { 6022 int32_t aExp, bExp, zExp; 6023 uint64_t aSig, bSig, zSig0, zSig1; 6024 int32_t expDiff; 6025 6026 aSig = extractFloatx80Frac( a ); 6027 aExp = extractFloatx80Exp( a ); 6028 bSig = extractFloatx80Frac( b ); 6029 bExp = extractFloatx80Exp( b ); 6030 expDiff = aExp - bExp; 6031 if ( 0 < expDiff ) goto aExpBigger; 6032 if ( expDiff < 0 ) goto bExpBigger; 6033 if ( aExp == 0x7FFF ) { 6034 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6035 return propagateFloatx80NaN(a, b, status); 6036 } 6037 float_raise(float_flag_invalid, status); 6038 return floatx80_default_nan(status); 6039 } 6040 if ( aExp == 0 ) { 6041 aExp = 1; 6042 bExp = 1; 6043 } 6044 zSig1 = 0; 6045 if ( bSig < aSig ) goto aBigger; 6046 if ( aSig < bSig ) goto bBigger; 6047 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 6048 bExpBigger: 6049 if ( bExp == 0x7FFF ) { 6050 if ((uint64_t)(bSig << 1)) { 6051 return propagateFloatx80NaN(a, b, status); 6052 } 6053 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 6054 floatx80_infinity_low); 6055 } 6056 if ( aExp == 0 ) ++expDiff; 6057 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6058 bBigger: 6059 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 6060 zExp = bExp; 6061 zSign ^= 1; 6062 goto normalizeRoundAndPack; 6063 aExpBigger: 6064 if ( aExp == 0x7FFF ) { 6065 if ((uint64_t)(aSig << 1)) { 6066 return propagateFloatx80NaN(a, b, status); 6067 } 6068 return a; 6069 } 6070 if ( bExp == 0 ) --expDiff; 6071 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 6072 aBigger: 6073 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 6074 zExp = aExp; 6075 normalizeRoundAndPack: 6076 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6077 zSign, zExp, zSig0, zSig1, status); 6078 } 6079 6080 
/*---------------------------------------------------------------------------- 6081 | Returns the result of adding the extended double-precision floating-point 6082 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6083 | Standard for Binary Floating-Point Arithmetic. 6084 *----------------------------------------------------------------------------*/ 6085 6086 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6087 { 6088 bool aSign, bSign; 6089 6090 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6091 float_raise(float_flag_invalid, status); 6092 return floatx80_default_nan(status); 6093 } 6094 aSign = extractFloatx80Sign( a ); 6095 bSign = extractFloatx80Sign( b ); 6096 if ( aSign == bSign ) { 6097 return addFloatx80Sigs(a, b, aSign, status); 6098 } 6099 else { 6100 return subFloatx80Sigs(a, b, aSign, status); 6101 } 6102 6103 } 6104 6105 /*---------------------------------------------------------------------------- 6106 | Returns the result of subtracting the extended double-precision floating- 6107 | point values `a' and `b'. The operation is performed according to the 6108 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    /* Either operand having an invalid encoding: raise invalid, default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /*
     * The helper choice is the reverse of floatx80_add: equal signs go to
     * subFloatx80Sigs, differing signs to addFloatx80Sigs.
     */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    /* Either operand having an invalid encoding: raise invalid, default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /*
         * `a' has the maximum exponent.  (sig << 1) discards bit 63; any
         * remaining bit in `a', or in a max-exponent `b', is handled as a NaN.
         */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Infinity times zero is invalid; the label sits in the next block. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            /* Shared exit for both zero-times-infinity orderings. */
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* Zero exponent field: a zero operand gives a signed zero, else normalize. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /*
     * If the top bit of the high product word is clear, shift the 128-bit
     * product left by one and compensate in the exponent.
     */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Either operand having an invalid encoding: raise invalid, default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' has the maximum exponent; bits below bit 63 mark a NaN. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Infinity divided by infinity is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* A finite value divided by infinity gives a signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                /* Zero / zero, and infinity / infinity via the goto above. */
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            /* Nonzero / zero: raise divide-by-zero, return signed infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* When bSig <= aSig, halve the dividend (aSig:rem1) and bump the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then step down while the remainder < 0. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /*
     * Low quotient word: the estimate is only corrected when (zSig1 << 1),
     * truncated to 64 bits, is at most 8.
     */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold a nonzero final remainder into the lowest bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    /* The quotient output is zeroed up front, so every early return leaves 0. */
    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* Keep the exponent as extracted; aExp may be changed by normalization. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN in either operand propagates; an infinite `a' is invalid. */
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* `b' is infinite: the result is `a' itself. */
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            /* Zero divisor, or infinite `a' via the goto above. */
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /*
         * `a' has the smaller exponent.  It is returned unchanged for 'mod',
         * or when the exponents differ by more than one; only the
         * round-to-nearest case with expDiff == -1 continues below.
         */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit: subtract bSig once if it fits. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Reduce 62 bits per pass; the estimate is lowered by 2 (clamped at 0). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial pass over the remaining expDiff bits. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* Step q up while the scaled divisor still fits in the remainder. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        /* A shift by 64 or more is not performed; the value is cleared. */
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        /* No partial pass: term0:term1 is set to bSig:0 for the check below. */
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /*
         * Round-to-nearest remainder: switch to (term - remainder) with the
         * opposite sign when it is smaller, or when equal and q is odd; the
         * quotient is then incremented.
         */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6413 *----------------------------------------------------------------------------*/ 6414 6415 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6416 { 6417 uint64_t quotient; 6418 return floatx80_modrem(a, b, false, "ient, status); 6419 } 6420 6421 /*---------------------------------------------------------------------------- 6422 | Returns the remainder of the extended double-precision floating-point value 6423 | `a' with respect to the corresponding value `b', with the quotient truncated 6424 | toward zero. 6425 *----------------------------------------------------------------------------*/ 6426 6427 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6428 { 6429 uint64_t quotient; 6430 return floatx80_modrem(a, b, true, "ient, status); 6431 } 6432 6433 /*---------------------------------------------------------------------------- 6434 | Returns the square root of the extended double-precision floating-point 6435 | value `a'. The operation is performed according to the IEC/IEEE Standard 6436 | for Binary Floating-Point Arithmetic. 6437 *----------------------------------------------------------------------------*/ 6438 6439 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6440 { 6441 bool aSign; 6442 int32_t aExp, zExp; 6443 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6444 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6445 6446 if (floatx80_invalid_encoding(a)) { 6447 float_raise(float_flag_invalid, status); 6448 return floatx80_default_nan(status); 6449 } 6450 aSig0 = extractFloatx80Frac( a ); 6451 aExp = extractFloatx80Exp( a ); 6452 aSign = extractFloatx80Sign( a ); 6453 if ( aExp == 0x7FFF ) { 6454 if ((uint64_t)(aSig0 << 1)) { 6455 return propagateFloatx80NaN(a, a, status); 6456 } 6457 if ( ! 
aSign ) return a; 6458 goto invalid; 6459 } 6460 if ( aSign ) { 6461 if ( ( aExp | aSig0 ) == 0 ) return a; 6462 invalid: 6463 float_raise(float_flag_invalid, status); 6464 return floatx80_default_nan(status); 6465 } 6466 if ( aExp == 0 ) { 6467 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6468 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6469 } 6470 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6471 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6472 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6473 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6474 doubleZSig0 = zSig0<<1; 6475 mul64To128( zSig0, zSig0, &term0, &term1 ); 6476 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6477 while ( (int64_t) rem0 < 0 ) { 6478 --zSig0; 6479 doubleZSig0 -= 2; 6480 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6481 } 6482 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6483 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6484 if ( zSig1 == 0 ) zSig1 = 1; 6485 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6486 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6487 mul64To128( zSig1, zSig1, &term2, &term3 ); 6488 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6489 while ( (int64_t) rem1 < 0 ) { 6490 --zSig1; 6491 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6492 term3 |= 1; 6493 term2 |= doubleZSig0; 6494 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6495 } 6496 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6497 } 6498 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6499 zSig0 |= doubleZSig0; 6500 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6501 0, zExp, zSig0, zSig1, status); 6502 } 6503 6504 /*---------------------------------------------------------------------------- 6505 | Returns the result of converting the quadruple-precision floating-point 6506 | value `a' to the 32-bit two's complement integer format. 
The conversion 6507 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6508 | Arithmetic---which means in particular that the conversion is rounded 6509 | according to the current rounding mode. If `a' is a NaN, the largest 6510 | positive integer is returned. Otherwise, if the conversion overflows, the 6511 | largest integer with the same sign as `a' is returned. 6512 *----------------------------------------------------------------------------*/ 6513 6514 int32_t float128_to_int32(float128 a, float_status *status) 6515 { 6516 bool aSign; 6517 int32_t aExp, shiftCount; 6518 uint64_t aSig0, aSig1; 6519 6520 aSig1 = extractFloat128Frac1( a ); 6521 aSig0 = extractFloat128Frac0( a ); 6522 aExp = extractFloat128Exp( a ); 6523 aSign = extractFloat128Sign( a ); 6524 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6525 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6526 aSig0 |= ( aSig1 != 0 ); 6527 shiftCount = 0x4028 - aExp; 6528 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6529 return roundAndPackInt32(aSign, aSig0, status); 6530 6531 } 6532 6533 /*---------------------------------------------------------------------------- 6534 | Returns the result of converting the quadruple-precision floating-point 6535 | value `a' to the 32-bit two's complement integer format. The conversion 6536 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6537 | Arithmetic, except that the conversion is always rounded toward zero. If 6538 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6539 | conversion overflows, the largest integer with the same sign as `a' is 6540 | returned. 
6541 *----------------------------------------------------------------------------*/ 6542 6543 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6544 { 6545 bool aSign; 6546 int32_t aExp, shiftCount; 6547 uint64_t aSig0, aSig1, savedASig; 6548 int32_t z; 6549 6550 aSig1 = extractFloat128Frac1( a ); 6551 aSig0 = extractFloat128Frac0( a ); 6552 aExp = extractFloat128Exp( a ); 6553 aSign = extractFloat128Sign( a ); 6554 aSig0 |= ( aSig1 != 0 ); 6555 if ( 0x401E < aExp ) { 6556 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6557 goto invalid; 6558 } 6559 else if ( aExp < 0x3FFF ) { 6560 if (aExp || aSig0) { 6561 float_raise(float_flag_inexact, status); 6562 } 6563 return 0; 6564 } 6565 aSig0 |= UINT64_C(0x0001000000000000); 6566 shiftCount = 0x402F - aExp; 6567 savedASig = aSig0; 6568 aSig0 >>= shiftCount; 6569 z = aSig0; 6570 if ( aSign ) z = - z; 6571 if ( ( z < 0 ) ^ aSign ) { 6572 invalid: 6573 float_raise(float_flag_invalid, status); 6574 return aSign ? INT32_MIN : INT32_MAX; 6575 } 6576 if ( ( aSig0<<shiftCount ) != savedASig ) { 6577 float_raise(float_flag_inexact, status); 6578 } 6579 return z; 6580 6581 } 6582 6583 /*---------------------------------------------------------------------------- 6584 | Returns the result of converting the quadruple-precision floating-point 6585 | value `a' to the 64-bit two's complement integer format. The conversion 6586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6587 | Arithmetic---which means in particular that the conversion is rounded 6588 | according to the current rounding mode. If `a' is a NaN, the largest 6589 | positive integer is returned. Otherwise, if the conversion overflows, the 6590 | largest integer with the same sign as `a' is returned. 
6591 *----------------------------------------------------------------------------*/ 6592 6593 int64_t float128_to_int64(float128 a, float_status *status) 6594 { 6595 bool aSign; 6596 int32_t aExp, shiftCount; 6597 uint64_t aSig0, aSig1; 6598 6599 aSig1 = extractFloat128Frac1( a ); 6600 aSig0 = extractFloat128Frac0( a ); 6601 aExp = extractFloat128Exp( a ); 6602 aSign = extractFloat128Sign( a ); 6603 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6604 shiftCount = 0x402F - aExp; 6605 if ( shiftCount <= 0 ) { 6606 if ( 0x403E < aExp ) { 6607 float_raise(float_flag_invalid, status); 6608 if ( ! aSign 6609 || ( ( aExp == 0x7FFF ) 6610 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6611 ) 6612 ) { 6613 return INT64_MAX; 6614 } 6615 return INT64_MIN; 6616 } 6617 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6618 } 6619 else { 6620 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6621 } 6622 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6623 6624 } 6625 6626 /*---------------------------------------------------------------------------- 6627 | Returns the result of converting the quadruple-precision floating-point 6628 | value `a' to the 64-bit two's complement integer format. The conversion 6629 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6630 | Arithmetic, except that the conversion is always rounded toward zero. 6631 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6632 | the conversion overflows, the largest integer with the same sign as `a' is 6633 | returned. 
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* A nonzero exponent field gets bit 48 of the high word set. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /*
     * Note the sign convention: here a positive shiftCount means a left
     * shift (float128_to_int64 computes 0x402F - aExp instead).
     */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Strip the bit set above so aSig0 holds only fraction bits. */
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /*
             * The one accepted input in this range: high word exactly
             * 0xC03E000000000000 with a small enough low word.  It returns
             * INT64_MIN without invalid, and inexact if the low word is
             * nonzero.
             */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                /* Overflow or NaN: INT64_MAX for positive inputs and NaNs. */
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        /* Left shift across both words; (-shiftCount) & 63 is 64 - shiftCount. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Low-word bits left over after the shift make the result inexact. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Result truncates to zero; inexact unless the input was zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        /* Right shift of the high word only; the low word is all discarded. */
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is 6692 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6693 | Arithmetic---which means in particular that the conversion is rounded 6694 | according to the current rounding mode. If `a' is a NaN, the largest 6695 | positive integer is returned. If the conversion overflows, the 6696 | largest unsigned integer is returned. If 'a' is negative, the value is 6697 | rounded and zero is returned; negative values that do not round to zero 6698 | will raise the inexact exception. 6699 *----------------------------------------------------------------------------*/ 6700 6701 uint64_t float128_to_uint64(float128 a, float_status *status) 6702 { 6703 bool aSign; 6704 int aExp; 6705 int shiftCount; 6706 uint64_t aSig0, aSig1; 6707 6708 aSig0 = extractFloat128Frac0(a); 6709 aSig1 = extractFloat128Frac1(a); 6710 aExp = extractFloat128Exp(a); 6711 aSign = extractFloat128Sign(a); 6712 if (aSign && (aExp > 0x3FFE)) { 6713 float_raise(float_flag_invalid, status); 6714 if (float128_is_any_nan(a)) { 6715 return UINT64_MAX; 6716 } else { 6717 return 0; 6718 } 6719 } 6720 if (aExp) { 6721 aSig0 |= UINT64_C(0x0001000000000000); 6722 } 6723 shiftCount = 0x402F - aExp; 6724 if (shiftCount <= 0) { 6725 if (0x403E < aExp) { 6726 float_raise(float_flag_invalid, status); 6727 return UINT64_MAX; 6728 } 6729 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6730 } else { 6731 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6732 } 6733 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6734 } 6735 6736 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6737 { 6738 uint64_t v; 6739 signed char current_rounding_mode = status->float_rounding_mode; 6740 6741 set_float_rounding_mode(float_round_to_zero, status); 6742 v = float128_to_uint64(a, status); 6743 set_float_rounding_mode(current_rounding_mode, status); 6744 6745 return v; 6746 } 6747 6748 
/*---------------------------------------------------------------------------- 6749 | Returns the result of converting the quadruple-precision floating-point 6750 | value `a' to the 32-bit unsigned integer format. The conversion 6751 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6752 | Arithmetic except that the conversion is always rounded toward zero. 6753 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6754 | if the conversion overflows, the largest unsigned integer is returned. 6755 | If 'a' is negative, the value is rounded and zero is returned; negative 6756 | values that do not round to zero will raise the inexact exception. 6757 *----------------------------------------------------------------------------*/ 6758 6759 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6760 { 6761 uint64_t v; 6762 uint32_t res; 6763 int old_exc_flags = get_float_exception_flags(status); 6764 6765 v = float128_to_uint64_round_to_zero(a, status); 6766 if (v > 0xffffffff) { 6767 res = 0xffffffff; 6768 } else { 6769 return v; 6770 } 6771 set_float_exception_flags(old_exc_flags, status); 6772 float_raise(float_flag_invalid, status); 6773 return res; 6774 } 6775 6776 /*---------------------------------------------------------------------------- 6777 | Returns the result of converting the quadruple-precision floating-point value 6778 | `a' to the 32-bit unsigned integer format. The conversion is 6779 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6780 | Arithmetic---which means in particular that the conversion is rounded 6781 | according to the current rounding mode. If `a' is a NaN, the largest 6782 | positive integer is returned. If the conversion overflows, the 6783 | largest unsigned integer is returned. If 'a' is negative, the value is 6784 | rounded and zero is returned; negative values that do not round to zero 6785 | will raise the inexact exception. 
6786 *----------------------------------------------------------------------------*/ 6787 6788 uint32_t float128_to_uint32(float128 a, float_status *status) 6789 { 6790 uint64_t v; 6791 uint32_t res; 6792 int old_exc_flags = get_float_exception_flags(status); 6793 6794 v = float128_to_uint64(a, status); 6795 if (v > 0xffffffff) { 6796 res = 0xffffffff; 6797 } else { 6798 return v; 6799 } 6800 set_float_exception_flags(old_exc_flags, status); 6801 float_raise(float_flag_invalid, status); 6802 return res; 6803 } 6804 6805 /*---------------------------------------------------------------------------- 6806 | Returns the result of converting the quadruple-precision floating-point 6807 | value `a' to the single-precision floating-point format. The conversion 6808 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6809 | Arithmetic. 6810 *----------------------------------------------------------------------------*/ 6811 6812 float32 float128_to_float32(float128 a, float_status *status) 6813 { 6814 bool aSign; 6815 int32_t aExp; 6816 uint64_t aSig0, aSig1; 6817 uint32_t zSig; 6818 6819 aSig1 = extractFloat128Frac1( a ); 6820 aSig0 = extractFloat128Frac0( a ); 6821 aExp = extractFloat128Exp( a ); 6822 aSign = extractFloat128Sign( a ); 6823 if ( aExp == 0x7FFF ) { 6824 if ( aSig0 | aSig1 ) { 6825 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6826 } 6827 return packFloat32( aSign, 0xFF, 0 ); 6828 } 6829 aSig0 |= ( aSig1 != 0 ); 6830 shift64RightJamming( aSig0, 18, &aSig0 ); 6831 zSig = aSig0; 6832 if ( aExp || zSig ) { 6833 zSig |= 0x40000000; 6834 aExp -= 0x3F81; 6835 } 6836 return roundAndPackFloat32(aSign, aExp, zSig, status); 6837 6838 } 6839 6840 /*---------------------------------------------------------------------------- 6841 | Returns the result of converting the quadruple-precision floating-point 6842 | value `a' to the double-precision floating-point format. 
The conversion 6843 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6844 | Arithmetic. 6845 *----------------------------------------------------------------------------*/ 6846 6847 float64 float128_to_float64(float128 a, float_status *status) 6848 { 6849 bool aSign; 6850 int32_t aExp; 6851 uint64_t aSig0, aSig1; 6852 6853 aSig1 = extractFloat128Frac1( a ); 6854 aSig0 = extractFloat128Frac0( a ); 6855 aExp = extractFloat128Exp( a ); 6856 aSign = extractFloat128Sign( a ); 6857 if ( aExp == 0x7FFF ) { 6858 if ( aSig0 | aSig1 ) { 6859 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6860 } 6861 return packFloat64( aSign, 0x7FF, 0 ); 6862 } 6863 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6864 aSig0 |= ( aSig1 != 0 ); 6865 if ( aExp || aSig0 ) { 6866 aSig0 |= UINT64_C(0x4000000000000000); 6867 aExp -= 0x3C01; 6868 } 6869 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6870 6871 } 6872 6873 /*---------------------------------------------------------------------------- 6874 | Returns the result of converting the quadruple-precision floating-point 6875 | value `a' to the extended double-precision floating-point format. The 6876 | conversion is performed according to the IEC/IEEE Standard for Binary 6877 | Floating-Point Arithmetic. 
6878 *----------------------------------------------------------------------------*/ 6879 6880 floatx80 float128_to_floatx80(float128 a, float_status *status) 6881 { 6882 bool aSign; 6883 int32_t aExp; 6884 uint64_t aSig0, aSig1; 6885 6886 aSig1 = extractFloat128Frac1( a ); 6887 aSig0 = extractFloat128Frac0( a ); 6888 aExp = extractFloat128Exp( a ); 6889 aSign = extractFloat128Sign( a ); 6890 if ( aExp == 0x7FFF ) { 6891 if ( aSig0 | aSig1 ) { 6892 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6893 status); 6894 return floatx80_silence_nan(res, status); 6895 } 6896 return packFloatx80(aSign, floatx80_infinity_high, 6897 floatx80_infinity_low); 6898 } 6899 if ( aExp == 0 ) { 6900 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6901 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6902 } 6903 else { 6904 aSig0 |= UINT64_C(0x0001000000000000); 6905 } 6906 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6907 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6908 6909 } 6910 6911 /*---------------------------------------------------------------------------- 6912 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6913 | returns the result as a quadruple-precision floating-point value. The 6914 | operation is performed according to the IEC/IEEE Standard for Binary 6915 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* The rounding position lies in the low word, or at its boundary. */
        if ( 0x406F <= aExp ) {
            /* Returned unchanged, except that a NaN is propagated. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * The shift is split in two so that aExp == 0x402F (a total shift
         * of 64) produces lastBitMask == 0 without an oversized shift.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit; on an exact tie clear the last bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the top bit of z.low is the half bit. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only non-negative values are pushed up before masking. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are pushed away from zero before masking. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Clear every bit below the rounding position. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Zero (of either sign) is returned unchanged. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            /* Any other input on this path is inexact. */
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            /*
             * Cases either return directly or break out to the signed zero
             * below.  There is no default case in this switch.
             */
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The rounding position lies in the high word; the low word ends up 0. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (no discarded bits left, low word zero): make even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A nonzero low word counts as one extra discarded bit. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change from the input means the rounding was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The result sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN (nonzero fraction) or an infinity. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Infinity divided by infinity is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* A finite value divided by infinity gives a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                /* Zero / zero, and infinity / infinity via the goto above. */
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* Nonzero / zero: raise divide-by-zero, return signed infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set bit 48 of each high word and shift both significands left by 15. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* When b <= a, halve the dividend and bump the exponent. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High quotient word: estimate, then step down while the remainder < 0. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low quotient word: only corrected when its low 14 bits are at most 4. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold a nonzero final remainder into the lowest bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7161 *----------------------------------------------------------------------------*/ 7162 7163 float128 float128_rem(float128 a, float128 b, float_status *status) 7164 { 7165 bool aSign, zSign; 7166 int32_t aExp, bExp, expDiff; 7167 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7168 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7169 int64_t sigMean0; 7170 7171 aSig1 = extractFloat128Frac1( a ); 7172 aSig0 = extractFloat128Frac0( a ); 7173 aExp = extractFloat128Exp( a ); 7174 aSign = extractFloat128Sign( a ); 7175 bSig1 = extractFloat128Frac1( b ); 7176 bSig0 = extractFloat128Frac0( b ); 7177 bExp = extractFloat128Exp( b ); 7178 if ( aExp == 0x7FFF ) { 7179 if ( ( aSig0 | aSig1 ) 7180 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7181 return propagateFloat128NaN(a, b, status); 7182 } 7183 goto invalid; 7184 } 7185 if ( bExp == 0x7FFF ) { 7186 if (bSig0 | bSig1) { 7187 return propagateFloat128NaN(a, b, status); 7188 } 7189 return a; 7190 } 7191 if ( bExp == 0 ) { 7192 if ( ( bSig0 | bSig1 ) == 0 ) { 7193 invalid: 7194 float_raise(float_flag_invalid, status); 7195 return float128_default_nan(status); 7196 } 7197 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7198 } 7199 if ( aExp == 0 ) { 7200 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7201 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7202 } 7203 expDiff = aExp - bExp; 7204 if ( expDiff < -1 ) return a; 7205 shortShift128Left( 7206 aSig0 | UINT64_C(0x0001000000000000), 7207 aSig1, 7208 15 - ( expDiff < 0 ), 7209 &aSig0, 7210 &aSig1 7211 ); 7212 shortShift128Left( 7213 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7214 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7215 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7216 expDiff -= 64; 7217 while ( 0 < expDiff ) { 7218 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7219 q = ( 4 < q ) ? 
q - 4 : 0; 7220 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7221 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7222 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7223 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7224 expDiff -= 61; 7225 } 7226 if ( -64 < expDiff ) { 7227 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7228 q = ( 4 < q ) ? q - 4 : 0; 7229 q >>= - expDiff; 7230 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7231 expDiff += 52; 7232 if ( expDiff < 0 ) { 7233 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7234 } 7235 else { 7236 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7237 } 7238 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7239 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7240 } 7241 else { 7242 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7243 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7244 } 7245 do { 7246 alternateASig0 = aSig0; 7247 alternateASig1 = aSig1; 7248 ++q; 7249 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7250 } while ( 0 <= (int64_t) aSig0 ); 7251 add128( 7252 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7253 if ( ( sigMean0 < 0 ) 7254 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7255 aSig0 = alternateASig0; 7256 aSig1 = alternateASig1; 7257 } 7258 zSign = ( (int64_t) aSig0 < 0 ); 7259 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7260 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7261 status); 7262 } 7263 7264 /*---------------------------------------------------------------------------- 7265 | Returns the square root of the quadruple-precision floating-point value `a'. 7266 | The operation is performed according to the IEC/IEEE Standard for Binary 7267 | Floating-Point Arithmetic. 
7268 *----------------------------------------------------------------------------*/ 7269 7270 float128 float128_sqrt(float128 a, float_status *status) 7271 { 7272 bool aSign; 7273 int32_t aExp, zExp; 7274 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7275 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7276 7277 aSig1 = extractFloat128Frac1( a ); 7278 aSig0 = extractFloat128Frac0( a ); 7279 aExp = extractFloat128Exp( a ); 7280 aSign = extractFloat128Sign( a ); 7281 if ( aExp == 0x7FFF ) { 7282 if (aSig0 | aSig1) { 7283 return propagateFloat128NaN(a, a, status); 7284 } 7285 if ( ! aSign ) return a; 7286 goto invalid; 7287 } 7288 if ( aSign ) { 7289 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7290 invalid: 7291 float_raise(float_flag_invalid, status); 7292 return float128_default_nan(status); 7293 } 7294 if ( aExp == 0 ) { 7295 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7296 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7297 } 7298 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7299 aSig0 |= UINT64_C(0x0001000000000000); 7300 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7301 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7302 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7303 doubleZSig0 = zSig0<<1; 7304 mul64To128( zSig0, zSig0, &term0, &term1 ); 7305 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7306 while ( (int64_t) rem0 < 0 ) { 7307 --zSig0; 7308 doubleZSig0 -= 2; 7309 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7310 } 7311 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7312 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7313 if ( zSig1 == 0 ) zSig1 = 1; 7314 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7315 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7316 mul64To128( zSig1, zSig1, &term2, &term3 ); 7317 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7318 while ( (int64_t) rem1 < 0 ) { 7319 --zSig1; 7320 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7321 term3 |= 1; 7322 term2 |= doubleZSig0; 7323 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7324 } 7325 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7326 } 7327 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7328 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7329 7330 } 7331 7332 static inline FloatRelation 7333 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 7334 float_status *status) 7335 { 7336 bool aSign, bSign; 7337 7338 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7339 float_raise(float_flag_invalid, status); 7340 return float_relation_unordered; 7341 } 7342 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7343 ( extractFloatx80Frac( a )<<1 ) ) || 7344 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7345 ( extractFloatx80Frac( b )<<1 ) )) { 7346 if (!is_quiet || 7347 floatx80_is_signaling_nan(a, status) || 7348 floatx80_is_signaling_nan(b, status)) { 7349 float_raise(float_flag_invalid, status); 7350 } 7351 return float_relation_unordered; 7352 } 7353 aSign = extractFloatx80Sign( a ); 7354 bSign = extractFloatx80Sign( b ); 7355 if ( aSign != bSign ) { 7356 7357 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7358 ( ( a.low | b.low ) == 0 ) ) { 7359 /* zero case */ 7360 return float_relation_equal; 7361 } else { 7362 return 1 - (2 * aSign); 7363 } 7364 } else { 7365 /* Normalize pseudo-denormals before comparison. 
*/ 7366 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 7367 ++a.high; 7368 } 7369 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 7370 ++b.high; 7371 } 7372 if (a.low == b.low && a.high == b.high) { 7373 return float_relation_equal; 7374 } else { 7375 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7376 } 7377 } 7378 } 7379 7380 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7381 { 7382 return floatx80_compare_internal(a, b, 0, status); 7383 } 7384 7385 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 7386 float_status *status) 7387 { 7388 return floatx80_compare_internal(a, b, 1, status); 7389 } 7390 7391 static inline FloatRelation 7392 float128_compare_internal(float128 a, float128 b, bool is_quiet, 7393 float_status *status) 7394 { 7395 bool aSign, bSign; 7396 7397 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7398 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7399 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7400 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7401 if (!is_quiet || 7402 float128_is_signaling_nan(a, status) || 7403 float128_is_signaling_nan(b, status)) { 7404 float_raise(float_flag_invalid, status); 7405 } 7406 return float_relation_unordered; 7407 } 7408 aSign = extractFloat128Sign( a ); 7409 bSign = extractFloat128Sign( b ); 7410 if ( aSign != bSign ) { 7411 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7412 /* zero case */ 7413 return float_relation_equal; 7414 } else { 7415 return 1 - (2 * aSign); 7416 } 7417 } else { 7418 if (a.low == b.low && a.high == b.high) { 7419 return float_relation_equal; 7420 } else { 7421 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7422 } 7423 } 7424 } 7425 7426 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 7427 { 7428 return float128_compare_internal(a, b, 0, status); 7429 } 7430 7431 FloatRelation 
float128_compare_quiet(float128 a, float128 b, 7432 float_status *status) 7433 { 7434 return float128_compare_internal(a, b, 1, status); 7435 } 7436 7437 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7438 { 7439 bool aSign; 7440 int32_t aExp; 7441 uint64_t aSig; 7442 7443 if (floatx80_invalid_encoding(a)) { 7444 float_raise(float_flag_invalid, status); 7445 return floatx80_default_nan(status); 7446 } 7447 aSig = extractFloatx80Frac( a ); 7448 aExp = extractFloatx80Exp( a ); 7449 aSign = extractFloatx80Sign( a ); 7450 7451 if ( aExp == 0x7FFF ) { 7452 if ( aSig<<1 ) { 7453 return propagateFloatx80NaN(a, a, status); 7454 } 7455 return a; 7456 } 7457 7458 if (aExp == 0) { 7459 if (aSig == 0) { 7460 return a; 7461 } 7462 aExp++; 7463 } 7464 7465 if (n > 0x10000) { 7466 n = 0x10000; 7467 } else if (n < -0x10000) { 7468 n = -0x10000; 7469 } 7470 7471 aExp += n; 7472 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7473 aSign, aExp, aSig, 0, status); 7474 } 7475 7476 float128 float128_scalbn(float128 a, int n, float_status *status) 7477 { 7478 bool aSign; 7479 int32_t aExp; 7480 uint64_t aSig0, aSig1; 7481 7482 aSig1 = extractFloat128Frac1( a ); 7483 aSig0 = extractFloat128Frac0( a ); 7484 aExp = extractFloat128Exp( a ); 7485 aSign = extractFloat128Sign( a ); 7486 if ( aExp == 0x7FFF ) { 7487 if ( aSig0 | aSig1 ) { 7488 return propagateFloat128NaN(a, a, status); 7489 } 7490 return a; 7491 } 7492 if (aExp != 0) { 7493 aSig0 |= UINT64_C(0x0001000000000000); 7494 } else if (aSig0 == 0 && aSig1 == 0) { 7495 return a; 7496 } else { 7497 aExp++; 7498 } 7499 7500 if (n > 0x10000) { 7501 n = 0x10000; 7502 } else if (n < -0x10000) { 7503 n = -0x10000; 7504 } 7505 7506 aExp += n - 1; 7507 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7508 , status); 7509 7510 } 7511 7512 static void __attribute__((constructor)) softfloat_init(void) 7513 { 7514 union_float64 ua, ub, uc, ur; 7515 7516 if (QEMU_NO_HARDFLOAT) { 7517 return; 
7518 } 7519 /* 7520 * Test that the host's FMA is not obviously broken. For example, 7521 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7522 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7523 */ 7524 ua.s = 0x0020000000000001ULL; 7525 ub.s = 0x3ca0000000000000ULL; 7526 uc.s = 0x0020000000000000ULL; 7527 ur.h = fma(ua.h, ub.h, uc.h); 7528 if (ur.s != 0x0020000000000001ULL) { 7529 force_soft_fma = true; 7530 } 7531 } 7532