1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Generate per-format "flush denormal input to zero" helpers.  The
 * __NOCHECK variant unconditionally replaces a denormal operand with a
 * correctly-signed zero and raises the input-denormal exception; callers
 * are expected to have tested s->flush_inputs_to_zero themselves.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                                  \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* Flush two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* Flush three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   1
#else
# define QEMU_HARDFLOAT_USE_ISINF   0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Hardfloat is usable only when the inexact flag is already set (so a host
 * inexact result cannot be observed as a new flag) and rounding is
 * round-to-nearest-even, which is what the host FPU is assumed to do.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/*
 * View the same value both as raw soft-float bits (s) and as the host FP
 * type (h).  Type punning through a union is the supported idiom here.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates over operand pairs, used as pre/post checks in *_gen2. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Soft (flag-accurate) and hard (host FPU) implementations of an op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-operand hardfloat wrapper.  Computes hard(a, b) on the host FPU
 * when it is safe to do so, otherwise falls back to the soft implementation.
 * "pre" decides whether the operands are safe for the fast path; "post"
 * decides whether a tiny result needs the soft path to get flags right.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    /* Flags/rounding mode must permit hardfloat at all. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    /* Flush denormal inputs first so "pre" sees the effective operands. */
    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Finite-operand overflow: set the flag the host didn't report. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result at/below the smallest normal: underflow handling is hairy,
           redo in soft-fp. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* Double-precision twin of float32_gen2; see the comment there. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Turn a FloatClass value into a bit for a small classification mask. */
#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan/snan being the last two enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT    63
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
/* NOTE: the (-F - 1) & 63 expressions place the fraction's msb just below
 * DECOMPOSED_BINARY_POINT when shifted left by frac_shift. */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = E,                                             \
    .exp_bias       = ((1 << E) - 1) >> 1,                           \
    .exp_max        = (1 << E) - 1,                                  \
    .frac_size      = F,                                             \
    .frac_shift     = (-F - 1) & 63,                                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};
587 static const FloatFmt bfloat16_params = { 588 FLOAT_PARAMS(8, 7) 589 }; 590 591 static const FloatFmt float32_params = { 592 FLOAT_PARAMS(8, 23) 593 }; 594 595 static const FloatFmt float64_params = { 596 FLOAT_PARAMS(11, 52) 597 }; 598 599 static const FloatFmt float128_params = { 600 FLOAT_PARAMS(15, 112) 601 }; 602 603 /* Unpack a float to parts, but do not canonicalize. */ 604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw) 605 { 606 const int f_size = fmt->frac_size; 607 const int e_size = fmt->exp_size; 608 609 *r = (FloatParts64) { 610 .cls = float_class_unclassified, 611 .sign = extract64(raw, f_size + e_size, 1), 612 .exp = extract64(raw, f_size, e_size), 613 .frac = extract64(raw, 0, f_size) 614 }; 615 } 616 617 static inline void float16_unpack_raw(FloatParts64 *p, float16 f) 618 { 619 unpack_raw64(p, &float16_params, f); 620 } 621 622 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f) 623 { 624 unpack_raw64(p, &bfloat16_params, f); 625 } 626 627 static inline void float32_unpack_raw(FloatParts64 *p, float32 f) 628 { 629 unpack_raw64(p, &float32_params, f); 630 } 631 632 static inline void float64_unpack_raw(FloatParts64 *p, float64 f) 633 { 634 unpack_raw64(p, &float64_params, f); 635 } 636 637 static void float128_unpack_raw(FloatParts128 *p, float128 f) 638 { 639 const int f_size = float128_params.frac_size - 64; 640 const int e_size = float128_params.exp_size; 641 642 *p = (FloatParts128) { 643 .cls = float_class_unclassified, 644 .sign = extract64(f.high, f_size + e_size, 1), 645 .exp = extract64(f.high, f_size, e_size), 646 .frac_hi = extract64(f.high, 0, f_size), 647 .frac_lo = f.low, 648 }; 649 } 650 651 /* Pack a float from parts, but do not canonicalize. 
*/ 652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt) 653 { 654 const int f_size = fmt->frac_size; 655 const int e_size = fmt->exp_size; 656 uint64_t ret; 657 658 ret = (uint64_t)p->sign << (f_size + e_size); 659 ret = deposit64(ret, f_size, e_size, p->exp); 660 ret = deposit64(ret, 0, f_size, p->frac); 661 return ret; 662 } 663 664 static inline float16 float16_pack_raw(const FloatParts64 *p) 665 { 666 return make_float16(pack_raw64(p, &float16_params)); 667 } 668 669 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p) 670 { 671 return pack_raw64(p, &bfloat16_params); 672 } 673 674 static inline float32 float32_pack_raw(const FloatParts64 *p) 675 { 676 return make_float32(pack_raw64(p, &float32_params)); 677 } 678 679 static inline float64 float64_pack_raw(const FloatParts64 *p) 680 { 681 return make_float64(pack_raw64(p, &float64_params)); 682 } 683 684 static float128 float128_pack_raw(const FloatParts128 *p) 685 { 686 const int f_size = float128_params.frac_size - 64; 687 const int e_size = float128_params.exp_size; 688 uint64_t hi; 689 690 hi = (uint64_t)p->sign << (f_size + e_size); 691 hi = deposit64(hi, f_size, e_size, p->exp); 692 hi = deposit64(hi, 0, f_size, p->frac_hi); 693 return make_float128(hi, p->frac_lo); 694 } 695 696 /*---------------------------------------------------------------------------- 697 | Functions and definitions to determine: (1) whether tininess for underflow 698 | is detected before or after rounding by default, (2) what (if anything) 699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 701 | are propagated from function inputs to output. These details are target- 702 | specific. 
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/*
 * Size dispatch: pick the 64- or 128-bit implementation of NAME based on
 * the static type of the parts pointer P.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

static void parts64_add_normal(FloatParts64 *a, FloatParts64 *b);
static void parts128_add_normal(FloatParts128 *a, FloatParts128 *b);

#define parts_add_normal(A, B) \
    PARTS_GENERIC_64_128(add_normal, A)(A, B)

static bool parts64_sub_normal(FloatParts64 *a, FloatParts64 *b);
static bool parts128_sub_normal(FloatParts128 *a, FloatParts128 *b);

#define parts_sub_normal(A, B) \
    PARTS_GENERIC_64_128(sub_normal, A)(A, B)

static FloatParts64 *parts64_addsub(FloatParts64 *a, FloatParts64 *b,
                                    float_status *s, bool subtract);
static FloatParts128 *parts128_addsub(FloatParts128 *a, FloatParts128 *b,
                                      float_status *s, bool subtract);

#define parts_addsub(A, B, S, Z) \
    PARTS_GENERIC_64_128(addsub, A)(A, B, S, Z)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* Fraction addition; returns true on carry out of the most significant bit. */
static bool frac64_add(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return uadd64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_add(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = uadd64_carry(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = uadd64_carry(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_add(R, A, B)  FRAC_GENERIC_64_128(add, R)(R, A, B)

/* Add a 64-bit immediate to the fraction; returns true on carry out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

/* Set every fraction bit. */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

/* Compare fractions; returns -1, 0, or 1 like memcmp. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;

    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

/* True if every fraction bit is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

/* Two's-complement negate the fraction in place. */
static void frac64_neg(FloatParts64 *a)
{
    a->frac = -a->frac;
}

static void frac128_neg(FloatParts128 *a)
{
    bool c = 0;
    a->frac_lo = usub64_borrow(0, a->frac_lo, &c);
    a->frac_hi = usub64_borrow(0, a->frac_hi, &c);
}

#define frac_neg(A)  FRAC_GENERIC_64_128(neg, A)(A)

/*
 * Shift the fraction left until its msb is set; returns the shift count,
 * or the full fraction width if the fraction was zero.
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        if (shl) {
            int shr = 64 - shl;
            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
            a->frac_lo = (a->frac_lo << shl);
        }
        return shl;
    } else if (a->frac_lo) {
        int shl = clz64(a->frac_lo);
        a->frac_hi = (a->frac_lo << shl);
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128(normalize, A)(A)

static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

/* Right shift with "jamming": shifted-out bits are OR'ed into the lsb. */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128(shrjam, A)(A, C)

/* Fraction subtraction; returns true on borrow. */
static bool frac64_sub(FloatParts64 *r, FloatParts64 *a, FloatParts64 *b)
{
    return usub64_overflow(a->frac, b->frac, &r->frac);
}

static bool frac128_sub(FloatParts128 *r, FloatParts128 *a, FloatParts128 *b)
{
    bool c = 0;
    r->frac_lo = usub64_borrow(a->frac_lo, b->frac_lo, &c);
    r->frac_hi = usub64_borrow(a->frac_hi, b->frac_hi, &c);
    return c;
}

#define frac_sub(R, A, B)  FRAC_GENERIC_64_128(sub, R)(R, A, B)

/* Instantiate the size-generic templates for N = 64 and N = 128. */
#define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)
#define FloatPartsN    glue(FloatParts,N)

#define N 64

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#define N 128

#include "softfloat-parts-addsub.c.inc"
#include "softfloat-parts.c.inc"

#undef  N
#undef  partsN
#undef  FloatPartsN

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

/* Unpack raw float16 bits into canonical parts per `params` format. */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

/* Round canonical parts per `params` and repack into float16 bits. */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

static void float128_unpack_canonical(FloatParts128 *p, float128 f,
                                      float_status *s)
{
    float128_unpack_raw(p, f);
    parts_canonicalize(p, s, &float128_params);
}

static float128 float128_round_pack_canonical(FloatParts128 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &float128_params);
    return float128_pack_raw(p);
}

/*
 * Addition and subtraction
 */

/* Common float16 add/sub: unpack, compute in parts form, repack. */
static float16 QEMU_FLATTEN
float16_addsub(float16 a, float16 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float16_round_pack_canonical(pr, status);
}

float16 float16_add(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, false);
}

float16 float16_sub(float16 a, float16 b, float_status *status)
{
    return float16_addsub(a, b, status, true);
}

/* Softfloat fallback for the hardfloat-accelerated float32 add/sub. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float32_round_pack_canonical(pr, status);
}

static float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, false);
}

static float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, status, true);
}

/* Softfloat fallback for the hardfloat-accelerated float64 add/sub. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, float_status *status, bool subtract)
{
    FloatParts64 pa, pb, *pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = parts_addsub(&pa, &pb, status, subtract);

    return float64_round_pack_canonical(pr, status);
}
1122 1123 static float64 soft_f64_add(float64 a, float64 b, float_status *status) 1124 { 1125 return soft_f64_addsub(a, b, status, false); 1126 } 1127 1128 static float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1129 { 1130 return soft_f64_addsub(a, b, status, true); 1131 } 1132 1133 static float hard_f32_add(float a, float b) 1134 { 1135 return a + b; 1136 } 1137 1138 static float hard_f32_sub(float a, float b) 1139 { 1140 return a - b; 1141 } 1142 1143 static double hard_f64_add(double a, double b) 1144 { 1145 return a + b; 1146 } 1147 1148 static double hard_f64_sub(double a, double b) 1149 { 1150 return a - b; 1151 } 1152 1153 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1154 { 1155 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1156 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1157 } 1158 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1159 } 1160 1161 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1162 { 1163 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1164 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1165 } else { 1166 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1167 } 1168 } 1169 1170 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1171 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1172 { 1173 return float32_gen2(a, b, s, hard, soft, 1174 f32_is_zon2, f32_addsubmul_post); 1175 } 1176 1177 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1178 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1179 { 1180 return float64_gen2(a, b, s, hard, soft, 1181 f64_is_zon2, f64_addsubmul_post); 1182 } 1183 1184 float32 QEMU_FLATTEN 1185 float32_add(float32 a, float32 b, float_status *s) 1186 { 1187 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1188 } 1189 1190 float32 QEMU_FLATTEN 1191 float32_sub(float32 a, float32 b, float_status *s) 1192 { 1193 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1194 } 1195 1196 
float64 QEMU_FLATTEN 1197 float64_add(float64 a, float64 b, float_status *s) 1198 { 1199 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1200 } 1201 1202 float64 QEMU_FLATTEN 1203 float64_sub(float64 a, float64 b, float_status *s) 1204 { 1205 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1206 } 1207 1208 static bfloat16 QEMU_FLATTEN 1209 bfloat16_addsub(bfloat16 a, bfloat16 b, float_status *status, bool subtract) 1210 { 1211 FloatParts64 pa, pb, *pr; 1212 1213 bfloat16_unpack_canonical(&pa, a, status); 1214 bfloat16_unpack_canonical(&pb, b, status); 1215 pr = parts_addsub(&pa, &pb, status, subtract); 1216 1217 return bfloat16_round_pack_canonical(pr, status); 1218 } 1219 1220 bfloat16 bfloat16_add(bfloat16 a, bfloat16 b, float_status *status) 1221 { 1222 return bfloat16_addsub(a, b, status, false); 1223 } 1224 1225 bfloat16 bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status) 1226 { 1227 return bfloat16_addsub(a, b, status, true); 1228 } 1229 1230 static float128 QEMU_FLATTEN 1231 float128_addsub(float128 a, float128 b, float_status *status, bool subtract) 1232 { 1233 FloatParts128 pa, pb, *pr; 1234 1235 float128_unpack_canonical(&pa, a, status); 1236 float128_unpack_canonical(&pb, b, status); 1237 pr = parts_addsub(&pa, &pb, status, subtract); 1238 1239 return float128_round_pack_canonical(pr, status); 1240 } 1241 1242 float128 float128_add(float128 a, float128 b, float_status *status) 1243 { 1244 return float128_addsub(a, b, status, false); 1245 } 1246 1247 float128 float128_sub(float128 a, float128 b, float_status *status) 1248 { 1249 return float128_addsub(a, b, status, true); 1250 } 1251 1252 /* 1253 * Returns the result of multiplying the floating-point values `a' and 1254 * `b'. The operation is performed according to the IEC/IEEE Standard 1255 * for Binary Floating-Point Arithmetic. 
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        mul64To128(a.frac, b.frac, &hi, &lo);
        /*
         * Renormalize: the 128-bit product of two normalized fractions
         * has its msb in one of two positions.
         */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low half into the sticky lsb for inexact. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for the hardfloat-accelerated float32 multiply. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for the hardfloat-accelerated float64 multiply. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU fast paths for multiply. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 */

static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class masks let the special-case tests below be simple bit tests. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* Inf * 0 (with a non-NaN addend) is an invalid operation. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a*b without computing it. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf - Inf: invalid. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0 with opposite signs: sign follows rounding mode. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal && b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend dominates: align the product down to it. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product dominates: align the addend, add at 192 bits. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract smaller magnitude from larger; flip sign if
                 * the addend wins. */
                if (exp_diff == 0 &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero result, sign per rounding mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent. However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it. Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits. */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold any remaining low bits into the sticky lsb for inexact. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for the hardfloat-accelerated float32 muladd. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for the hardfloat-accelerated float64 muladd. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}

/* When set (e.g. by tests), always take the softfloat muladd path. */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* Hardfloat requires default rounding/exception handling. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The host fma cannot express the halve-result variant. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path only for zero-or-normal operands. */
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Compute the correctly-signed zero product explicitly. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Possibly subnormal/underflowing: redo in softfloat with
             * the original (un-negated) operands. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    /* Hardfloat requires default rounding/exception handling. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    /* The host fma cannot express the halve-result variant. */
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    /* Fast path only for zero-or-normal operands. */
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        /* Compute the correctly-signed zero product explicitly. */
        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): threshold is FLT_MIN rather than DBL_MIN.
             * Since FLT_MIN > DBL_MIN this is conservative — it only
             * sends extra small-but-normal results to the soft path —
             * but confirm whether DBL_MIN was intended. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

/*
 * Returns the result of multiplying the bfloat16 values `a'
 * and `b' then adding 'c', with no intermediate rounding step after the
 * multiplication.
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward. If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for the hardfloat-accelerated float32 divide. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for the hardfloat-accelerated float64 divide. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU fast paths for divide. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/* Pre-check: fast divide only for (zero-or-normal) / normal. */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-check: a zero quotient needs the soft path for signed-zero care. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format. Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format. Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* ieee selects IEEE half; otherwise ARM alternative half-precision. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Softfloat fallback for the widening float32 -> float64 conversion. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Subnormals, Inf and NaN go through the soft path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
2111 */ 2112 2113 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode, 2114 int scale, float_status *s) 2115 { 2116 switch (a.cls) { 2117 case float_class_qnan: 2118 case float_class_snan: 2119 parts_return_nan(&a, s); 2120 break; 2121 2122 case float_class_zero: 2123 case float_class_inf: 2124 /* already "integral" */ 2125 break; 2126 2127 case float_class_normal: 2128 scale = MIN(MAX(scale, -0x10000), 0x10000); 2129 a.exp += scale; 2130 2131 if (a.exp >= DECOMPOSED_BINARY_POINT) { 2132 /* already integral */ 2133 break; 2134 } 2135 if (a.exp < 0) { 2136 bool one; 2137 /* all fractional */ 2138 float_raise(float_flag_inexact, s); 2139 switch (rmode) { 2140 case float_round_nearest_even: 2141 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 2142 break; 2143 case float_round_ties_away: 2144 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 2145 break; 2146 case float_round_to_zero: 2147 one = false; 2148 break; 2149 case float_round_up: 2150 one = !a.sign; 2151 break; 2152 case float_round_down: 2153 one = a.sign; 2154 break; 2155 case float_round_to_odd: 2156 one = true; 2157 break; 2158 default: 2159 g_assert_not_reached(); 2160 } 2161 2162 if (one) { 2163 a.frac = DECOMPOSED_IMPLICIT_BIT; 2164 a.exp = 0; 2165 } else { 2166 a.cls = float_class_zero; 2167 } 2168 } else { 2169 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2170 uint64_t frac_lsbm1 = frac_lsb >> 1; 2171 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2172 uint64_t rnd_mask = rnd_even_mask >> 1; 2173 uint64_t inc; 2174 2175 switch (rmode) { 2176 case float_round_nearest_even: 2177 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2178 break; 2179 case float_round_ties_away: 2180 inc = frac_lsbm1; 2181 break; 2182 case float_round_to_zero: 2183 inc = 0; 2184 break; 2185 case float_round_up: 2186 inc = a.sign ? 0 : rnd_mask; 2187 break; 2188 case float_round_down: 2189 inc = a.sign ? 
rnd_mask : 0; 2190 break; 2191 case float_round_to_odd: 2192 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2193 break; 2194 default: 2195 g_assert_not_reached(); 2196 } 2197 2198 if (a.frac & rnd_mask) { 2199 float_raise(float_flag_inexact, s); 2200 if (uadd64_overflow(a.frac, inc, &a.frac)) { 2201 a.frac >>= 1; 2202 a.frac |= DECOMPOSED_IMPLICIT_BIT; 2203 a.exp++; 2204 } 2205 a.frac &= ~rnd_mask; 2206 } 2207 } 2208 break; 2209 default: 2210 g_assert_not_reached(); 2211 } 2212 return a; 2213 } 2214 2215 float16 float16_round_to_int(float16 a, float_status *s) 2216 { 2217 FloatParts64 pa, pr; 2218 2219 float16_unpack_canonical(&pa, a, s); 2220 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2221 return float16_round_pack_canonical(&pr, s); 2222 } 2223 2224 float32 float32_round_to_int(float32 a, float_status *s) 2225 { 2226 FloatParts64 pa, pr; 2227 2228 float32_unpack_canonical(&pa, a, s); 2229 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2230 return float32_round_pack_canonical(&pr, s); 2231 } 2232 2233 float64 float64_round_to_int(float64 a, float_status *s) 2234 { 2235 FloatParts64 pa, pr; 2236 2237 float64_unpack_canonical(&pa, a, s); 2238 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2239 return float64_round_pack_canonical(&pr, s); 2240 } 2241 2242 /* 2243 * Rounds the bfloat16 value `a' to an integer, and returns the 2244 * result as a bfloat16 value. 2245 */ 2246 2247 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2248 { 2249 FloatParts64 pa, pr; 2250 2251 bfloat16_unpack_canonical(&pa, a, s); 2252 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2253 return bfloat16_round_pack_canonical(&pr, s); 2254 } 2255 2256 /* 2257 * Returns the result of converting the floating-point value `a' to 2258 * the two's complement integer format. 
The conversion is performed 2259 * according to the IEC/IEEE Standard for Binary Floating-Point 2260 * Arithmetic---which means in particular that the conversion is 2261 * rounded according to the current rounding mode. If `a' is a NaN, 2262 * the largest positive integer is returned. Otherwise, if the 2263 * conversion overflows, the largest integer with the same sign as `a' 2264 * is returned. 2265 */ 2266 2267 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode, 2268 int scale, int64_t min, int64_t max, 2269 float_status *s) 2270 { 2271 uint64_t r; 2272 int orig_flags = get_float_exception_flags(s); 2273 FloatParts64 p = round_to_int(in, rmode, scale, s); 2274 2275 switch (p.cls) { 2276 case float_class_snan: 2277 case float_class_qnan: 2278 s->float_exception_flags = orig_flags | float_flag_invalid; 2279 return max; 2280 case float_class_inf: 2281 s->float_exception_flags = orig_flags | float_flag_invalid; 2282 return p.sign ? min : max; 2283 case float_class_zero: 2284 return 0; 2285 case float_class_normal: 2286 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2287 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2288 } else { 2289 r = UINT64_MAX; 2290 } 2291 if (p.sign) { 2292 if (r <= -(uint64_t) min) { 2293 return -r; 2294 } else { 2295 s->float_exception_flags = orig_flags | float_flag_invalid; 2296 return min; 2297 } 2298 } else { 2299 if (r <= max) { 2300 return r; 2301 } else { 2302 s->float_exception_flags = orig_flags | float_flag_invalid; 2303 return max; 2304 } 2305 } 2306 default: 2307 g_assert_not_reached(); 2308 } 2309 } 2310 2311 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2312 float_status *s) 2313 { 2314 FloatParts64 p; 2315 2316 float16_unpack_canonical(&p, a, s); 2317 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s); 2318 } 2319 2320 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2321 float_status *s) 2322 { 2323 FloatParts64 p; 2324 2325 
float16_unpack_canonical(&p, a, s); 2326 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2327 } 2328 2329 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2330 float_status *s) 2331 { 2332 FloatParts64 p; 2333 2334 float16_unpack_canonical(&p, a, s); 2335 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2336 } 2337 2338 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2339 float_status *s) 2340 { 2341 FloatParts64 p; 2342 2343 float16_unpack_canonical(&p, a, s); 2344 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2345 } 2346 2347 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2348 float_status *s) 2349 { 2350 FloatParts64 p; 2351 2352 float32_unpack_canonical(&p, a, s); 2353 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2354 } 2355 2356 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2357 float_status *s) 2358 { 2359 FloatParts64 p; 2360 2361 float32_unpack_canonical(&p, a, s); 2362 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2363 } 2364 2365 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2366 float_status *s) 2367 { 2368 FloatParts64 p; 2369 2370 float32_unpack_canonical(&p, a, s); 2371 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2372 } 2373 2374 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2375 float_status *s) 2376 { 2377 FloatParts64 p; 2378 2379 float64_unpack_canonical(&p, a, s); 2380 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2381 } 2382 2383 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2384 float_status *s) 2385 { 2386 FloatParts64 p; 2387 2388 float64_unpack_canonical(&p, a, s); 2389 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2390 } 2391 2392 int64_t 
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2393 float_status *s) 2394 { 2395 FloatParts64 p; 2396 2397 float64_unpack_canonical(&p, a, s); 2398 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2399 } 2400 2401 int8_t float16_to_int8(float16 a, float_status *s) 2402 { 2403 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2404 } 2405 2406 int16_t float16_to_int16(float16 a, float_status *s) 2407 { 2408 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2409 } 2410 2411 int32_t float16_to_int32(float16 a, float_status *s) 2412 { 2413 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2414 } 2415 2416 int64_t float16_to_int64(float16 a, float_status *s) 2417 { 2418 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2419 } 2420 2421 int16_t float32_to_int16(float32 a, float_status *s) 2422 { 2423 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2424 } 2425 2426 int32_t float32_to_int32(float32 a, float_status *s) 2427 { 2428 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2429 } 2430 2431 int64_t float32_to_int64(float32 a, float_status *s) 2432 { 2433 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2434 } 2435 2436 int16_t float64_to_int16(float64 a, float_status *s) 2437 { 2438 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2439 } 2440 2441 int32_t float64_to_int32(float64 a, float_status *s) 2442 { 2443 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2444 } 2445 2446 int64_t float64_to_int64(float64 a, float_status *s) 2447 { 2448 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2449 } 2450 2451 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2452 { 2453 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2454 } 2455 2456 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2457 { 2458 return float16_to_int32_scalbn(a, 
float_round_to_zero, 0, s); 2459 } 2460 2461 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2462 { 2463 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2464 } 2465 2466 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2467 { 2468 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2469 } 2470 2471 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2472 { 2473 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2474 } 2475 2476 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2477 { 2478 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2479 } 2480 2481 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2482 { 2483 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2484 } 2485 2486 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2487 { 2488 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2489 } 2490 2491 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2492 { 2493 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2494 } 2495 2496 /* 2497 * Returns the result of converting the floating-point value `a' to 2498 * the two's complement integer format. 
2499 */ 2500 2501 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2502 float_status *s) 2503 { 2504 FloatParts64 p; 2505 2506 bfloat16_unpack_canonical(&p, a, s); 2507 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2508 } 2509 2510 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2511 float_status *s) 2512 { 2513 FloatParts64 p; 2514 2515 bfloat16_unpack_canonical(&p, a, s); 2516 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2517 } 2518 2519 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2520 float_status *s) 2521 { 2522 FloatParts64 p; 2523 2524 bfloat16_unpack_canonical(&p, a, s); 2525 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2526 } 2527 2528 int16_t bfloat16_to_int16(bfloat16 a, float_status *s) 2529 { 2530 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2531 } 2532 2533 int32_t bfloat16_to_int32(bfloat16 a, float_status *s) 2534 { 2535 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2536 } 2537 2538 int64_t bfloat16_to_int64(bfloat16 a, float_status *s) 2539 { 2540 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2541 } 2542 2543 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s) 2544 { 2545 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2546 } 2547 2548 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s) 2549 { 2550 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2551 } 2552 2553 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s) 2554 { 2555 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2556 } 2557 2558 /* 2559 * Returns the result of converting the floating-point value `a' to 2560 * the unsigned integer format. 
The conversion is performed according 2561 * to the IEC/IEEE Standard for Binary Floating-Point 2562 * Arithmetic---which means in particular that the conversion is 2563 * rounded according to the current rounding mode. If `a' is a NaN, 2564 * the largest unsigned integer is returned. Otherwise, if the 2565 * conversion overflows, the largest unsigned integer is returned. If 2566 * the 'a' is negative, the result is rounded and zero is returned; 2567 * values that do not round to zero will raise the inexact exception 2568 * flag. 2569 */ 2570 2571 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode, 2572 int scale, uint64_t max, 2573 float_status *s) 2574 { 2575 int orig_flags = get_float_exception_flags(s); 2576 FloatParts64 p = round_to_int(in, rmode, scale, s); 2577 uint64_t r; 2578 2579 switch (p.cls) { 2580 case float_class_snan: 2581 case float_class_qnan: 2582 s->float_exception_flags = orig_flags | float_flag_invalid; 2583 return max; 2584 case float_class_inf: 2585 s->float_exception_flags = orig_flags | float_flag_invalid; 2586 return p.sign ? 0 : max; 2587 case float_class_zero: 2588 return 0; 2589 case float_class_normal: 2590 if (p.sign) { 2591 s->float_exception_flags = orig_flags | float_flag_invalid; 2592 return 0; 2593 } 2594 2595 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2596 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2597 } else { 2598 s->float_exception_flags = orig_flags | float_flag_invalid; 2599 return max; 2600 } 2601 2602 /* For uint64 this will never trip, but if p.exp is too large 2603 * to shift a decomposed fraction we shall have exited via the 2604 * 3rd leg above. 
2605 */ 2606 if (r > max) { 2607 s->float_exception_flags = orig_flags | float_flag_invalid; 2608 return max; 2609 } 2610 return r; 2611 default: 2612 g_assert_not_reached(); 2613 } 2614 } 2615 2616 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2617 float_status *s) 2618 { 2619 FloatParts64 p; 2620 2621 float16_unpack_canonical(&p, a, s); 2622 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s); 2623 } 2624 2625 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2626 float_status *s) 2627 { 2628 FloatParts64 p; 2629 2630 float16_unpack_canonical(&p, a, s); 2631 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2632 } 2633 2634 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2635 float_status *s) 2636 { 2637 FloatParts64 p; 2638 2639 float16_unpack_canonical(&p, a, s); 2640 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2641 } 2642 2643 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2644 float_status *s) 2645 { 2646 FloatParts64 p; 2647 2648 float16_unpack_canonical(&p, a, s); 2649 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2650 } 2651 2652 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2653 float_status *s) 2654 { 2655 FloatParts64 p; 2656 2657 float32_unpack_canonical(&p, a, s); 2658 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2659 } 2660 2661 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2662 float_status *s) 2663 { 2664 FloatParts64 p; 2665 2666 float32_unpack_canonical(&p, a, s); 2667 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2668 } 2669 2670 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2671 float_status *s) 2672 { 2673 FloatParts64 p; 2674 2675 float32_unpack_canonical(&p, a, s); 2676 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2677 } 
2678 2679 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2680 float_status *s) 2681 { 2682 FloatParts64 p; 2683 2684 float64_unpack_canonical(&p, a, s); 2685 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2686 } 2687 2688 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2689 float_status *s) 2690 { 2691 FloatParts64 p; 2692 2693 float64_unpack_canonical(&p, a, s); 2694 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2695 } 2696 2697 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2698 float_status *s) 2699 { 2700 FloatParts64 p; 2701 2702 float64_unpack_canonical(&p, a, s); 2703 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2704 } 2705 2706 uint8_t float16_to_uint8(float16 a, float_status *s) 2707 { 2708 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2709 } 2710 2711 uint16_t float16_to_uint16(float16 a, float_status *s) 2712 { 2713 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2714 } 2715 2716 uint32_t float16_to_uint32(float16 a, float_status *s) 2717 { 2718 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2719 } 2720 2721 uint64_t float16_to_uint64(float16 a, float_status *s) 2722 { 2723 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2724 } 2725 2726 uint16_t float32_to_uint16(float32 a, float_status *s) 2727 { 2728 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2729 } 2730 2731 uint32_t float32_to_uint32(float32 a, float_status *s) 2732 { 2733 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2734 } 2735 2736 uint64_t float32_to_uint64(float32 a, float_status *s) 2737 { 2738 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2739 } 2740 2741 uint16_t float64_to_uint16(float64 a, float_status *s) 2742 { 2743 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2744 } 2745 2746 uint32_t 
float64_to_uint32(float64 a, float_status *s) 2747 { 2748 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2749 } 2750 2751 uint64_t float64_to_uint64(float64 a, float_status *s) 2752 { 2753 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2754 } 2755 2756 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2757 { 2758 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2759 } 2760 2761 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2762 { 2763 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2764 } 2765 2766 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2767 { 2768 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2769 } 2770 2771 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2772 { 2773 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2774 } 2775 2776 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2777 { 2778 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2779 } 2780 2781 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2782 { 2783 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2784 } 2785 2786 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2787 { 2788 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2789 } 2790 2791 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2792 { 2793 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2794 } 2795 2796 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2797 { 2798 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2799 } 2800 2801 /* 2802 * Returns the result of converting the bfloat16 value `a' to 2803 * the unsigned integer format. 
2804 */ 2805 2806 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2807 int scale, float_status *s) 2808 { 2809 FloatParts64 p; 2810 2811 bfloat16_unpack_canonical(&p, a, s); 2812 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2813 } 2814 2815 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2816 int scale, float_status *s) 2817 { 2818 FloatParts64 p; 2819 2820 bfloat16_unpack_canonical(&p, a, s); 2821 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2822 } 2823 2824 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2825 int scale, float_status *s) 2826 { 2827 FloatParts64 p; 2828 2829 bfloat16_unpack_canonical(&p, a, s); 2830 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2831 } 2832 2833 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2834 { 2835 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2836 } 2837 2838 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2839 { 2840 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2841 } 2842 2843 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2844 { 2845 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2846 } 2847 2848 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2849 { 2850 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2851 } 2852 2853 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2854 { 2855 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2856 } 2857 2858 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 2859 { 2860 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2861 } 2862 2863 /* 2864 * Integer to float conversions 2865 * 2866 * Returns the result of converting the two's complement integer `a' 2867 * to the floating-point format. 
The conversion is performed according 2868 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2869 */ 2870 2871 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status) 2872 { 2873 FloatParts64 r = { .sign = false }; 2874 2875 if (a == 0) { 2876 r.cls = float_class_zero; 2877 } else { 2878 uint64_t f = a; 2879 int shift; 2880 2881 r.cls = float_class_normal; 2882 if (a < 0) { 2883 f = -f; 2884 r.sign = true; 2885 } 2886 shift = clz64(f); 2887 scale = MIN(MAX(scale, -0x10000), 0x10000); 2888 2889 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2890 r.frac = f << shift; 2891 } 2892 2893 return r; 2894 } 2895 2896 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2897 { 2898 FloatParts64 pa = int_to_float(a, scale, status); 2899 return float16_round_pack_canonical(&pa, status); 2900 } 2901 2902 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2903 { 2904 return int64_to_float16_scalbn(a, scale, status); 2905 } 2906 2907 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2908 { 2909 return int64_to_float16_scalbn(a, scale, status); 2910 } 2911 2912 float16 int64_to_float16(int64_t a, float_status *status) 2913 { 2914 return int64_to_float16_scalbn(a, 0, status); 2915 } 2916 2917 float16 int32_to_float16(int32_t a, float_status *status) 2918 { 2919 return int64_to_float16_scalbn(a, 0, status); 2920 } 2921 2922 float16 int16_to_float16(int16_t a, float_status *status) 2923 { 2924 return int64_to_float16_scalbn(a, 0, status); 2925 } 2926 2927 float16 int8_to_float16(int8_t a, float_status *status) 2928 { 2929 return int64_to_float16_scalbn(a, 0, status); 2930 } 2931 2932 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 2933 { 2934 FloatParts64 pa = int_to_float(a, scale, status); 2935 return float32_round_pack_canonical(&pa, status); 2936 } 2937 2938 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 2939 { 
2940 return int64_to_float32_scalbn(a, scale, status); 2941 } 2942 2943 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 2944 { 2945 return int64_to_float32_scalbn(a, scale, status); 2946 } 2947 2948 float32 int64_to_float32(int64_t a, float_status *status) 2949 { 2950 return int64_to_float32_scalbn(a, 0, status); 2951 } 2952 2953 float32 int32_to_float32(int32_t a, float_status *status) 2954 { 2955 return int64_to_float32_scalbn(a, 0, status); 2956 } 2957 2958 float32 int16_to_float32(int16_t a, float_status *status) 2959 { 2960 return int64_to_float32_scalbn(a, 0, status); 2961 } 2962 2963 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 2964 { 2965 FloatParts64 pa = int_to_float(a, scale, status); 2966 return float64_round_pack_canonical(&pa, status); 2967 } 2968 2969 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 2970 { 2971 return int64_to_float64_scalbn(a, scale, status); 2972 } 2973 2974 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 2975 { 2976 return int64_to_float64_scalbn(a, scale, status); 2977 } 2978 2979 float64 int64_to_float64(int64_t a, float_status *status) 2980 { 2981 return int64_to_float64_scalbn(a, 0, status); 2982 } 2983 2984 float64 int32_to_float64(int32_t a, float_status *status) 2985 { 2986 return int64_to_float64_scalbn(a, 0, status); 2987 } 2988 2989 float64 int16_to_float64(int16_t a, float_status *status) 2990 { 2991 return int64_to_float64_scalbn(a, 0, status); 2992 } 2993 2994 /* 2995 * Returns the result of converting the two's complement integer `a' 2996 * to the bfloat16 format. 
2997 */ 2998 2999 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) 3000 { 3001 FloatParts64 pa = int_to_float(a, scale, status); 3002 return bfloat16_round_pack_canonical(&pa, status); 3003 } 3004 3005 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status) 3006 { 3007 return int64_to_bfloat16_scalbn(a, scale, status); 3008 } 3009 3010 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status) 3011 { 3012 return int64_to_bfloat16_scalbn(a, scale, status); 3013 } 3014 3015 bfloat16 int64_to_bfloat16(int64_t a, float_status *status) 3016 { 3017 return int64_to_bfloat16_scalbn(a, 0, status); 3018 } 3019 3020 bfloat16 int32_to_bfloat16(int32_t a, float_status *status) 3021 { 3022 return int64_to_bfloat16_scalbn(a, 0, status); 3023 } 3024 3025 bfloat16 int16_to_bfloat16(int16_t a, float_status *status) 3026 { 3027 return int64_to_bfloat16_scalbn(a, 0, status); 3028 } 3029 3030 /* 3031 * Unsigned Integer to float conversions 3032 * 3033 * Returns the result of converting the unsigned integer `a' to the 3034 * floating-point format. The conversion is performed according to the 3035 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3036 */ 3037 3038 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status) 3039 { 3040 FloatParts64 r = { .sign = false }; 3041 int shift; 3042 3043 if (a == 0) { 3044 r.cls = float_class_zero; 3045 } else { 3046 scale = MIN(MAX(scale, -0x10000), 0x10000); 3047 shift = clz64(a); 3048 r.cls = float_class_normal; 3049 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3050 r.frac = a << shift; 3051 } 3052 3053 return r; 3054 } 3055 3056 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3057 { 3058 FloatParts64 pa = uint_to_float(a, scale, status); 3059 return float16_round_pack_canonical(&pa, status); 3060 } 3061 3062 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3063 { 3064 return uint64_to_float16_scalbn(a, scale, status); 3065 } 3066 3067 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3068 { 3069 return uint64_to_float16_scalbn(a, scale, status); 3070 } 3071 3072 float16 uint64_to_float16(uint64_t a, float_status *status) 3073 { 3074 return uint64_to_float16_scalbn(a, 0, status); 3075 } 3076 3077 float16 uint32_to_float16(uint32_t a, float_status *status) 3078 { 3079 return uint64_to_float16_scalbn(a, 0, status); 3080 } 3081 3082 float16 uint16_to_float16(uint16_t a, float_status *status) 3083 { 3084 return uint64_to_float16_scalbn(a, 0, status); 3085 } 3086 3087 float16 uint8_to_float16(uint8_t a, float_status *status) 3088 { 3089 return uint64_to_float16_scalbn(a, 0, status); 3090 } 3091 3092 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3093 { 3094 FloatParts64 pa = uint_to_float(a, scale, status); 3095 return float32_round_pack_canonical(&pa, status); 3096 } 3097 3098 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3099 { 3100 return uint64_to_float32_scalbn(a, scale, status); 3101 } 3102 3103 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3104 { 3105 return 
uint64_to_float32_scalbn(a, scale, status); 3106 } 3107 3108 float32 uint64_to_float32(uint64_t a, float_status *status) 3109 { 3110 return uint64_to_float32_scalbn(a, 0, status); 3111 } 3112 3113 float32 uint32_to_float32(uint32_t a, float_status *status) 3114 { 3115 return uint64_to_float32_scalbn(a, 0, status); 3116 } 3117 3118 float32 uint16_to_float32(uint16_t a, float_status *status) 3119 { 3120 return uint64_to_float32_scalbn(a, 0, status); 3121 } 3122 3123 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3124 { 3125 FloatParts64 pa = uint_to_float(a, scale, status); 3126 return float64_round_pack_canonical(&pa, status); 3127 } 3128 3129 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3130 { 3131 return uint64_to_float64_scalbn(a, scale, status); 3132 } 3133 3134 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3135 { 3136 return uint64_to_float64_scalbn(a, scale, status); 3137 } 3138 3139 float64 uint64_to_float64(uint64_t a, float_status *status) 3140 { 3141 return uint64_to_float64_scalbn(a, 0, status); 3142 } 3143 3144 float64 uint32_to_float64(uint32_t a, float_status *status) 3145 { 3146 return uint64_to_float64_scalbn(a, 0, status); 3147 } 3148 3149 float64 uint16_to_float64(uint16_t a, float_status *status) 3150 { 3151 return uint64_to_float64_scalbn(a, 0, status); 3152 } 3153 3154 /* 3155 * Returns the result of converting the unsigned integer `a' to the 3156 * bfloat16 format. 
3157 */ 3158 3159 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3160 { 3161 FloatParts64 pa = uint_to_float(a, scale, status); 3162 return bfloat16_round_pack_canonical(&pa, status); 3163 } 3164 3165 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3166 { 3167 return uint64_to_bfloat16_scalbn(a, scale, status); 3168 } 3169 3170 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3171 { 3172 return uint64_to_bfloat16_scalbn(a, scale, status); 3173 } 3174 3175 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3176 { 3177 return uint64_to_bfloat16_scalbn(a, 0, status); 3178 } 3179 3180 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3181 { 3182 return uint64_to_bfloat16_scalbn(a, 0, status); 3183 } 3184 3185 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3186 { 3187 return uint64_to_bfloat16_scalbn(a, 0, status); 3188 } 3189 3190 /* Float Min/Max */ 3191 /* min() and max() functions. These can't be implemented as 3192 * 'compare and pick one input' because that would mishandle 3193 * NaNs and +0 vs -0. 3194 * 3195 * minnum() and maxnum() functions. These are similar to the min() 3196 * and max() functions but if one of the arguments is a QNaN and 3197 * the other is numerical then the numerical argument is returned. 3198 * SNaNs will get quietened before being returned. 3199 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3200 * and maxNum() operations. min() and max() are the typical min/max 3201 * semantics provided by many CPUs which predate that specification. 3202 * 3203 * minnummag() and maxnummag() functions correspond to minNumMag() 3204 * and minNumMag() from the IEEE-754 2008. 
3205 */ 3206 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin, 3207 bool ieee, bool ismag, float_status *s) 3208 { 3209 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 3210 if (ieee) { 3211 /* Takes two floating-point values `a' and `b', one of 3212 * which is a NaN, and returns the appropriate NaN 3213 * result. If either `a' or `b' is a signaling NaN, 3214 * the invalid exception is raised. 3215 */ 3216 if (is_snan(a.cls) || is_snan(b.cls)) { 3217 return *parts_pick_nan(&a, &b, s); 3218 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 3219 return b; 3220 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 3221 return a; 3222 } 3223 } 3224 return *parts_pick_nan(&a, &b, s); 3225 } else { 3226 int a_exp, b_exp; 3227 3228 switch (a.cls) { 3229 case float_class_normal: 3230 a_exp = a.exp; 3231 break; 3232 case float_class_inf: 3233 a_exp = INT_MAX; 3234 break; 3235 case float_class_zero: 3236 a_exp = INT_MIN; 3237 break; 3238 default: 3239 g_assert_not_reached(); 3240 break; 3241 } 3242 switch (b.cls) { 3243 case float_class_normal: 3244 b_exp = b.exp; 3245 break; 3246 case float_class_inf: 3247 b_exp = INT_MAX; 3248 break; 3249 case float_class_zero: 3250 b_exp = INT_MIN; 3251 break; 3252 default: 3253 g_assert_not_reached(); 3254 break; 3255 } 3256 3257 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 3258 bool a_less = a_exp < b_exp; 3259 if (a_exp == b_exp) { 3260 a_less = a.frac < b.frac; 3261 } 3262 return a_less ^ ismin ? b : a; 3263 } 3264 3265 if (a.sign == b.sign) { 3266 bool a_less = a_exp < b_exp; 3267 if (a_exp == b_exp) { 3268 a_less = a.frac < b.frac; 3269 } 3270 return a.sign ^ a_less ^ ismin ? b : a; 3271 } else { 3272 return a.sign ^ ismin ? 
b : a; 3273 } 3274 } 3275 } 3276 3277 #define MINMAX(sz, name, ismin, isiee, ismag) \ 3278 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 3279 float_status *s) \ 3280 { \ 3281 FloatParts64 pa, pb, pr; \ 3282 float ## sz ## _unpack_canonical(&pa, a, s); \ 3283 float ## sz ## _unpack_canonical(&pb, b, s); \ 3284 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3285 return float ## sz ## _round_pack_canonical(&pr, s); \ 3286 } 3287 3288 MINMAX(16, min, true, false, false) 3289 MINMAX(16, minnum, true, true, false) 3290 MINMAX(16, minnummag, true, true, true) 3291 MINMAX(16, max, false, false, false) 3292 MINMAX(16, maxnum, false, true, false) 3293 MINMAX(16, maxnummag, false, true, true) 3294 3295 MINMAX(32, min, true, false, false) 3296 MINMAX(32, minnum, true, true, false) 3297 MINMAX(32, minnummag, true, true, true) 3298 MINMAX(32, max, false, false, false) 3299 MINMAX(32, maxnum, false, true, false) 3300 MINMAX(32, maxnummag, false, true, true) 3301 3302 MINMAX(64, min, true, false, false) 3303 MINMAX(64, minnum, true, true, false) 3304 MINMAX(64, minnummag, true, true, true) 3305 MINMAX(64, max, false, false, false) 3306 MINMAX(64, maxnum, false, true, false) 3307 MINMAX(64, maxnummag, false, true, true) 3308 3309 #undef MINMAX 3310 3311 #define BF16_MINMAX(name, ismin, isiee, ismag) \ 3312 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \ 3313 { \ 3314 FloatParts64 pa, pb, pr; \ 3315 bfloat16_unpack_canonical(&pa, a, s); \ 3316 bfloat16_unpack_canonical(&pb, b, s); \ 3317 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3318 return bfloat16_round_pack_canonical(&pr, s); \ 3319 } 3320 3321 BF16_MINMAX(min, true, false, false) 3322 BF16_MINMAX(minnum, true, true, false) 3323 BF16_MINMAX(minnummag, true, true, true) 3324 BF16_MINMAX(max, false, false, false) 3325 BF16_MINMAX(maxnum, false, true, false) 3326 BF16_MINMAX(maxnummag, false, true, true) 3327 3328 #undef BF16_MINMAX 3329 3330 /* Floating point compare 
 */
/* Compare decomposed values `a' and `b'.  If `is_quiet' is true, the
 * invalid exception is raised only for signaling NaNs; otherwise any
 * NaN operand raises it.  Unordered is returned for any NaN operand.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign. */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign, both finite non-zero: compare (exp, frac); a negative
     * sign inverts the ordering.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Instantiate a softfloat compare helper for float width `sz'. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: use the host FPU's quiet comparison macros and
 * fall back to softfloat for the unordered case (or when hardfloat is
 * disabled) so the status flags are set correctly.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* Same fast-path structure as f32_compare, for float64. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always go through softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
 */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    /* Zero and infinity pass through unchanged. */
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
3573 * 3574 * This does mean however the calculation is slower than before, 3575 * especially for 64 bit floats. 3576 */ 3577 3578 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p) 3579 { 3580 uint64_t a_frac, r_frac, s_frac; 3581 int bit, last_bit; 3582 3583 if (is_nan(a.cls)) { 3584 parts_return_nan(&a, s); 3585 return a; 3586 } 3587 if (a.cls == float_class_zero) { 3588 return a; /* sqrt(+-0) = +-0 */ 3589 } 3590 if (a.sign) { 3591 float_raise(float_flag_invalid, s); 3592 parts_default_nan(&a, s); 3593 return a; 3594 } 3595 if (a.cls == float_class_inf) { 3596 return a; /* sqrt(+inf) = +inf */ 3597 } 3598 3599 assert(a.cls == float_class_normal); 3600 3601 /* We need two overflow bits at the top. Adding room for that is a 3602 * right shift. If the exponent is odd, we can discard the low bit 3603 * by multiplying the fraction by 2; that's a left shift. Combine 3604 * those and we shift right by 1 if the exponent is odd, otherwise 2. 3605 */ 3606 a_frac = a.frac >> (2 - (a.exp & 1)); 3607 a.exp >>= 1; 3608 3609 /* Bit-by-bit computation of sqrt. */ 3610 r_frac = 0; 3611 s_frac = 0; 3612 3613 /* Iterate from implicit bit down to the 3 extra bits to compute a 3614 * properly rounded result. Remember we've inserted two more bits 3615 * at the top, so these positions are two less. 3616 */ 3617 bit = DECOMPOSED_BINARY_POINT - 2; 3618 last_bit = MAX(p->frac_shift - 4, 0); 3619 do { 3620 uint64_t q = 1ULL << bit; 3621 uint64_t t_frac = s_frac + q; 3622 if (t_frac <= a_frac) { 3623 s_frac = t_frac + q; 3624 a_frac -= t_frac; 3625 r_frac += q; 3626 } 3627 a_frac <<= 1; 3628 } while (--bit >= last_bit); 3629 3630 /* Undo the right shift done above. If there is any remaining 3631 * fraction, the result is inexact. Set the sticky bit. 
3632 */ 3633 a.frac = (r_frac << 2) + (a_frac != 0); 3634 3635 return a; 3636 } 3637 3638 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3639 { 3640 FloatParts64 pa, pr; 3641 3642 float16_unpack_canonical(&pa, a, status); 3643 pr = sqrt_float(pa, status, &float16_params); 3644 return float16_round_pack_canonical(&pr, status); 3645 } 3646 3647 static float32 QEMU_SOFTFLOAT_ATTR 3648 soft_f32_sqrt(float32 a, float_status *status) 3649 { 3650 FloatParts64 pa, pr; 3651 3652 float32_unpack_canonical(&pa, a, status); 3653 pr = sqrt_float(pa, status, &float32_params); 3654 return float32_round_pack_canonical(&pr, status); 3655 } 3656 3657 static float64 QEMU_SOFTFLOAT_ATTR 3658 soft_f64_sqrt(float64 a, float_status *status) 3659 { 3660 FloatParts64 pa, pr; 3661 3662 float64_unpack_canonical(&pa, a, status); 3663 pr = sqrt_float(pa, status, &float64_params); 3664 return float64_round_pack_canonical(&pr, status); 3665 } 3666 3667 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3668 { 3669 union_float32 ua, ur; 3670 3671 ua.s = xa; 3672 if (unlikely(!can_use_fpu(s))) { 3673 goto soft; 3674 } 3675 3676 float32_input_flush1(&ua.s, s); 3677 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3678 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3679 fpclassify(ua.h) == FP_ZERO) || 3680 signbit(ua.h))) { 3681 goto soft; 3682 } 3683 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3684 float32_is_neg(ua.s))) { 3685 goto soft; 3686 } 3687 ur.h = sqrtf(ua.h); 3688 return ur.s; 3689 3690 soft: 3691 return soft_f32_sqrt(ua.s, s); 3692 } 3693 3694 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3695 { 3696 union_float64 ua, ur; 3697 3698 ua.s = xa; 3699 if (unlikely(!can_use_fpu(s))) { 3700 goto soft; 3701 } 3702 3703 float64_input_flush1(&ua.s, s); 3704 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3705 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3706 fpclassify(ua.h) == FP_ZERO) || 3707 signbit(ua.h))) { 3708 goto soft; 3709 } 3710 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3711 float64_is_neg(ua.s))) { 3712 goto soft; 3713 } 3714 ur.h = sqrt(ua.h); 3715 return ur.s; 3716 3717 soft: 3718 return soft_f64_sqrt(ua.s, s); 3719 } 3720 3721 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3722 { 3723 FloatParts64 pa, pr; 3724 3725 bfloat16_unpack_canonical(&pa, a, status); 3726 pr = sqrt_float(pa, status, &bfloat16_params); 3727 return bfloat16_round_pack_canonical(&pr, status); 3728 } 3729 3730 /*---------------------------------------------------------------------------- 3731 | The pattern for a default generated NaN. 3732 *----------------------------------------------------------------------------*/ 3733 3734 float16 float16_default_nan(float_status *status) 3735 { 3736 FloatParts64 p; 3737 3738 parts_default_nan(&p, status); 3739 p.frac >>= float16_params.frac_shift; 3740 return float16_pack_raw(&p); 3741 } 3742 3743 float32 float32_default_nan(float_status *status) 3744 { 3745 FloatParts64 p; 3746 3747 parts_default_nan(&p, status); 3748 p.frac >>= float32_params.frac_shift; 3749 return float32_pack_raw(&p); 3750 } 3751 3752 float64 float64_default_nan(float_status *status) 3753 { 3754 FloatParts64 p; 3755 3756 parts_default_nan(&p, status); 3757 p.frac >>= float64_params.frac_shift; 3758 return float64_pack_raw(&p); 3759 } 3760 3761 float128 float128_default_nan(float_status *status) 3762 { 3763 FloatParts128 p; 3764 3765 parts_default_nan(&p, status); 3766 frac_shr(&p, float128_params.frac_shift); 3767 return float128_pack_raw(&p); 3768 } 3769 3770 bfloat16 bfloat16_default_nan(float_status *status) 3771 { 3772 FloatParts64 p; 3773 3774 parts_default_nan(&p, status); 3775 p.frac >>= bfloat16_params.frac_shift; 3776 return bfloat16_pack_raw(&p); 3777 } 3778 3779 /*---------------------------------------------------------------------------- 3780 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3781 *----------------------------------------------------------------------------*/ 3782 3783 float16 float16_silence_nan(float16 a, float_status *status) 3784 { 3785 FloatParts64 p; 3786 3787 float16_unpack_raw(&p, a); 3788 p.frac <<= float16_params.frac_shift; 3789 parts_silence_nan(&p, status); 3790 p.frac >>= float16_params.frac_shift; 3791 return float16_pack_raw(&p); 3792 } 3793 3794 float32 float32_silence_nan(float32 a, float_status *status) 3795 { 3796 FloatParts64 p; 3797 3798 float32_unpack_raw(&p, a); 3799 p.frac <<= float32_params.frac_shift; 3800 parts_silence_nan(&p, status); 3801 p.frac >>= float32_params.frac_shift; 3802 return float32_pack_raw(&p); 3803 } 3804 3805 float64 float64_silence_nan(float64 a, float_status *status) 3806 { 3807 FloatParts64 p; 3808 3809 float64_unpack_raw(&p, a); 3810 p.frac <<= float64_params.frac_shift; 3811 parts_silence_nan(&p, status); 3812 p.frac >>= float64_params.frac_shift; 3813 return float64_pack_raw(&p); 3814 } 3815 3816 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3817 { 3818 FloatParts64 p; 3819 3820 bfloat16_unpack_raw(&p, a); 3821 p.frac <<= bfloat16_params.frac_shift; 3822 parts_silence_nan(&p, status); 3823 p.frac >>= bfloat16_params.frac_shift; 3824 return bfloat16_pack_raw(&p); 3825 } 3826 3827 float128 float128_silence_nan(float128 a, float_status *status) 3828 { 3829 FloatParts128 p; 3830 3831 float128_unpack_raw(&p, a); 3832 frac_shl(&p, float128_params.frac_shift); 3833 parts_silence_nan(&p, status); 3834 frac_shr(&p, float128_params.frac_shift); 3835 return float128_pack_raw(&p); 3836 } 3837 3838 /*---------------------------------------------------------------------------- 3839 | If `a' is denormal and we are in flush-to-zero mode then set the 3840 | input-denormal exception and return zero. Otherwise just return the value. 
3841 *----------------------------------------------------------------------------*/ 3842 3843 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3844 { 3845 if (p.exp == 0 && p.frac != 0) { 3846 float_raise(float_flag_input_denormal, status); 3847 return true; 3848 } 3849 3850 return false; 3851 } 3852 3853 float16 float16_squash_input_denormal(float16 a, float_status *status) 3854 { 3855 if (status->flush_inputs_to_zero) { 3856 FloatParts64 p; 3857 3858 float16_unpack_raw(&p, a); 3859 if (parts_squash_denormal(p, status)) { 3860 return float16_set_sign(float16_zero, p.sign); 3861 } 3862 } 3863 return a; 3864 } 3865 3866 float32 float32_squash_input_denormal(float32 a, float_status *status) 3867 { 3868 if (status->flush_inputs_to_zero) { 3869 FloatParts64 p; 3870 3871 float32_unpack_raw(&p, a); 3872 if (parts_squash_denormal(p, status)) { 3873 return float32_set_sign(float32_zero, p.sign); 3874 } 3875 } 3876 return a; 3877 } 3878 3879 float64 float64_squash_input_denormal(float64 a, float_status *status) 3880 { 3881 if (status->flush_inputs_to_zero) { 3882 FloatParts64 p; 3883 3884 float64_unpack_raw(&p, a); 3885 if (parts_squash_denormal(p, status)) { 3886 return float64_set_sign(float64_zero, p.sign); 3887 } 3888 } 3889 return a; 3890 } 3891 3892 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3893 { 3894 if (status->flush_inputs_to_zero) { 3895 FloatParts64 p; 3896 3897 bfloat16_unpack_raw(&p, a); 3898 if (parts_squash_denormal(p, status)) { 3899 return bfloat16_set_sign(bfloat16_zero, p.sign); 3900 } 3901 } 3902 return a; 3903 } 3904 3905 /*---------------------------------------------------------------------------- 3906 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3907 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3908 | input. If `zSign' is 1, the input is negated before being converted to an 3909 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 0x40 is the halfway point of the 7 fraction bits below the
     * binary point; 0x7f rounds away from zero for directed modes.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly halfway: clear the low bit to round to even. */
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if the result needed more than 32 bits, or if the
     * negation changed the apparent sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the fraction; its top bit set means >= half. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        if (!(absZ1 << 1) && roundNearestEven) {
            /* Exactly halfway: round to even. */
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            /* Wrapped past UINT64_MAX. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* Negative non-zero values cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 31 - 23: shift the most-significant set bit up to the
     * implicit-bit position of the 23-bit significand.
     */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller.
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 7 guard bits below the significand: 0x40 is the halfway point,
     * 0x7f rounds away from zero for directed modes.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned-cast comparison catches both overflow (zExp large)
     * and underflow (zExp negative) in one test.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd and truncating modes overflow to the largest
             * finite value rather than infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly halfway: clear the low bit to round to even. */
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Shift the top set bit to bit 30, adjusting the exponent to match,
     * then round and pack as usual.
     */
    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* 11 = 63 - 52: shift the most-significant set bit up to the
     * implicit-bit position of the 52-bit significand.
     */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 0x200 is half of the 10 low rounding bits about to be discarded;
     * directed modes that round away from zero use the full 0x3ff. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Increment only when the result bit (bit 10) is currently even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned compare catches both overflow (zExp large) and
     * underflow (zExp negative) in a single test. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -(!overflow_to_inf) is all-ones (max finite significand) when
             * not overflowing to infinity, else 0 (infinity encoding). */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: before rounding, or unambiguously below the
             * smallest normal even after the increment. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie in nearest-even mode: clear the low bit to make it even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Shift the leading set bit up to bit 62 and compensate the exponent. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4385 *----------------------------------------------------------------------------*/ 4386 4387 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4388 uint64_t *zSigPtr) 4389 { 4390 int8_t shiftCount; 4391 4392 shiftCount = clz64(aSig); 4393 *zSigPtr = aSig<<shiftCount; 4394 *zExpPtr = 1 - shiftCount; 4395 } 4396 4397 /*---------------------------------------------------------------------------- 4398 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4399 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4400 | and returns the proper extended double-precision floating-point value 4401 | corresponding to the abstract input. Ordinarily, the abstract value is 4402 | rounded and packed into the extended double-precision format, with the 4403 | inexact exception raised if the abstract input cannot be represented 4404 | exactly. However, if the abstract value is too large, the overflow and 4405 | inexact exceptions are raised and an infinity or maximal finite value is 4406 | returned. If the abstract value is too small, the input value is rounded to 4407 | a subnormal number, and the underflow and inexact exceptions are raised if 4408 | the abstract input cannot be represented exactly as a subnormal extended 4409 | double-precision floating-point number. 4410 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4411 | number of bits as single or double precision, respectively. Otherwise, the 4412 | result is rounded to the full precision of the extended double-precision 4413 | format. 4414 | The input significand must be normalized or smaller. If the input 4415 | significand is not normalized, `zExp' must be 0; in that case, the result 4416 | returned is a subnormal number, and it must not require rounding. The 4417 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4418 | Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision paths: mask off the bits below single/double
     * precision within the 64-bit extended significand. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold all of zSig1 into the sticky bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Single unsigned compare catches both exponent overflow and
     * zExp <= 0 (underflow territory). */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may have carried into the integer bit: result is
             * the smallest normal number. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of the top bit: renormalize. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 80-bit precision: zSig1 holds the round/sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that never overflow to infinity return the largest
             * finite value of the requested precision instead. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the increment: zSig1 changed in the shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word and adjust zExp. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4653 *----------------------------------------------------------------------------*/ 4654 4655 static inline uint64_t extractFloat128Frac0( float128 a ) 4656 { 4657 4658 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4659 4660 } 4661 4662 /*---------------------------------------------------------------------------- 4663 | Returns the exponent bits of the quadruple-precision floating-point value 4664 | `a'. 4665 *----------------------------------------------------------------------------*/ 4666 4667 static inline int32_t extractFloat128Exp( float128 a ) 4668 { 4669 4670 return ( a.high>>48 ) & 0x7FFF; 4671 4672 } 4673 4674 /*---------------------------------------------------------------------------- 4675 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4676 *----------------------------------------------------------------------------*/ 4677 4678 static inline bool extractFloat128Sign(float128 a) 4679 { 4680 return a.high >> 63; 4681 } 4682 4683 /*---------------------------------------------------------------------------- 4684 | Normalizes the subnormal quadruple-precision floating-point value 4685 | represented by the denormalized significand formed by the concatenation of 4686 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4687 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4688 | significand are stored at the location pointed to by `zSig0Ptr', and the 4689 | least significant 64 bits of the normalized significand are stored at the 4690 | location pointed to by `zSig1Ptr'. 
 *----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* High word empty: normalize from the low word alone.  15 = 64 - 49
         * places the leading bit at the quad integer-bit position. */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* Leading bit already above the target: split aSig1 across
             * both output words. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
4735 *----------------------------------------------------------------------------*/ 4736 4737 static inline float128 4738 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4739 { 4740 float128 z; 4741 4742 z.low = zSig1; 4743 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4744 return z; 4745 } 4746 4747 /*---------------------------------------------------------------------------- 4748 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4749 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4750 | and `zSig2', and returns the proper quadruple-precision floating-point value 4751 | corresponding to the abstract input. Ordinarily, the abstract value is 4752 | simply rounded and packed into the quadruple-precision format, with the 4753 | inexact exception raised if the abstract input cannot be represented 4754 | exactly. However, if the abstract value is too large, the overflow and 4755 | inexact exceptions are raised and an infinity or maximal finite value is 4756 | returned. If the abstract value is too small, the input value is rounded to 4757 | a subnormal number, and the underflow and inexact exceptions are raised if 4758 | the abstract input cannot be represented exactly as a subnormal quadruple- 4759 | precision floating-point number. 4760 | The input significand must be normalized or smaller. If the input 4761 | significand is not normalized, `zExp' must be 0; in that case, the result 4762 | returned is a subnormal number, and it must not require rounding. In the 4763 | usual case that the input significand is normalized, `zExp' must be 1 less 4764 | than the ``true'' floating-point exponent. The handling of underflow and 4765 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 carries the round and sticky bits below the 113-bit result. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that never overflow to infinity return the largest
             * finite quad value instead. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the increment: zSig1/zSig2 changed in the shift. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie under nearest-even: force the result even. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 4884 | to the abstract input. This routine is just like `roundAndPackFloat128' 4885 | except that the input significand has fewer bits and does not have to be 4886 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4887 | point exponent. 4888 *----------------------------------------------------------------------------*/ 4889 4890 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4891 uint64_t zSig0, uint64_t zSig1, 4892 float_status *status) 4893 { 4894 int8_t shiftCount; 4895 uint64_t zSig2; 4896 4897 if ( zSig0 == 0 ) { 4898 zSig0 = zSig1; 4899 zSig1 = 0; 4900 zExp -= 64; 4901 } 4902 shiftCount = clz64(zSig0) - 15; 4903 if ( 0 <= shiftCount ) { 4904 zSig2 = 0; 4905 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4906 } 4907 else { 4908 shift128ExtraRightJamming( 4909 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4910 } 4911 zExp -= shiftCount; 4912 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4913 4914 } 4915 4916 4917 /*---------------------------------------------------------------------------- 4918 | Returns the result of converting the 32-bit two's complement integer `a' 4919 | to the extended double-precision floating-point format. The conversion 4920 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4921 | Arithmetic. 4922 *----------------------------------------------------------------------------*/ 4923 4924 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4925 { 4926 bool zSign; 4927 uint32_t absA; 4928 int8_t shiftCount; 4929 uint64_t zSig; 4930 4931 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4932 zSign = ( a < 0 ); 4933 absA = zSign ? 
- a : a; 4934 shiftCount = clz32(absA) + 32; 4935 zSig = absA; 4936 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4937 4938 } 4939 4940 /*---------------------------------------------------------------------------- 4941 | Returns the result of converting the 32-bit two's complement integer `a' to 4942 | the quadruple-precision floating-point format. The conversion is performed 4943 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4944 *----------------------------------------------------------------------------*/ 4945 4946 float128 int32_to_float128(int32_t a, float_status *status) 4947 { 4948 bool zSign; 4949 uint32_t absA; 4950 int8_t shiftCount; 4951 uint64_t zSig0; 4952 4953 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4954 zSign = ( a < 0 ); 4955 absA = zSign ? - a : a; 4956 shiftCount = clz32(absA) + 17; 4957 zSig0 = absA; 4958 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4959 4960 } 4961 4962 /*---------------------------------------------------------------------------- 4963 | Returns the result of converting the 64-bit two's complement integer `a' 4964 | to the extended double-precision floating-point format. The conversion 4965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4966 | Arithmetic. 4967 *----------------------------------------------------------------------------*/ 4968 4969 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4970 { 4971 bool zSign; 4972 uint64_t absA; 4973 int8_t shiftCount; 4974 4975 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4976 zSign = ( a < 0 ); 4977 absA = zSign ? 
- a : a; 4978 shiftCount = clz64(absA); 4979 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4980 4981 } 4982 4983 /*---------------------------------------------------------------------------- 4984 | Returns the result of converting the 64-bit two's complement integer `a' to 4985 | the quadruple-precision floating-point format. The conversion is performed 4986 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4987 *----------------------------------------------------------------------------*/ 4988 4989 float128 int64_to_float128(int64_t a, float_status *status) 4990 { 4991 bool zSign; 4992 uint64_t absA; 4993 int8_t shiftCount; 4994 int32_t zExp; 4995 uint64_t zSig0, zSig1; 4996 4997 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4998 zSign = ( a < 0 ); 4999 absA = zSign ? - a : a; 5000 shiftCount = clz64(absA) + 49; 5001 zExp = 0x406E - shiftCount; 5002 if ( 64 <= shiftCount ) { 5003 zSig1 = 0; 5004 zSig0 = absA; 5005 shiftCount -= 64; 5006 } 5007 else { 5008 zSig1 = absA; 5009 zSig0 = 0; 5010 } 5011 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5012 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5013 5014 } 5015 5016 /*---------------------------------------------------------------------------- 5017 | Returns the result of converting the 64-bit unsigned integer `a' 5018 | to the quadruple-precision floating-point format. The conversion is performed 5019 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* 0x406E pre-biases the exponent for the normalization shift applied
     * by normalizeRoundAndPackFloat128 to the low-word significand. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        /* NaN: convert through the common NaN format and silence it. */
        if (aSig) {
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the integer bit explicit and move the 24-bit significand to the
     * top of the 64-bit extended significand (40 = 64 - 24). */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
 *----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        /* The normalized significand has an explicit integer bit which
         * packFloat128 will add into the exponent; pre-decrement aExp to
         * compensate. */
        --aExp;
    }
    /* 25 = 48 - 23: align the single-precision fraction with the top of
     * the quad fraction field. */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* Special cases: NaN operands propagate; rem(inf, x) and rem(x, 0)
     * are invalid; rem(x, inf) == x. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 64/32 division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: the remainder is a itself. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: iterate with an estimated 128/64
         * division, reducing expDiff by 62 bits per pass. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Back the estimate off by 2 so it never exceeds the true
             * quotient. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient, then pick whichever of the last two
     * partial remainders is nearer (ties go to the even quotient). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients 1/n! for n = 1..15, as float64 constants. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = +0, 2^+Inf = +Inf (returned unchanged). */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        if (aSig == 0) return float32_one;  /* 2^+/-0 = 1, exact */
    }

    /* All remaining finite inputs give an inexact result. */
    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* 2^a = e^(a*ln2); evaluate the exponential's Taylor series in
     * double precision, then round back to single. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);  /* xn tracks x^(i+2) */
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );  /* log2(+/-0) = -Inf */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;  /* log2(+Inf) = +Inf */
    }

    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;  /* integer part of the log in the high bits */

    /* Produce fraction bits one per iteration: squaring the significand
     * doubles the log; a carry into bit 24 means the next bit is 1. */
    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert the payload, then make it quiet. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias the exponent (0x3FFF - 0x3FF = 0x3C00) and shift the
     * significand up, making the integer bit explicit. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5376 *----------------------------------------------------------------------------*/ 5377 5378 float128 float64_to_float128(float64 a, float_status *status) 5379 { 5380 bool aSign; 5381 int aExp; 5382 uint64_t aSig, zSig0, zSig1; 5383 5384 a = float64_squash_input_denormal(a, status); 5385 aSig = extractFloat64Frac( a ); 5386 aExp = extractFloat64Exp( a ); 5387 aSign = extractFloat64Sign( a ); 5388 if ( aExp == 0x7FF ) { 5389 if (aSig) { 5390 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5391 } 5392 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5393 } 5394 if ( aExp == 0 ) { 5395 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5396 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5397 --aExp; 5398 } 5399 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5400 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5401 5402 } 5403 5404 5405 /*---------------------------------------------------------------------------- 5406 | Returns the remainder of the double-precision floating-point value `a' 5407 | with respect to the corresponding value `b'. The operation is performed 5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* a is NaN or infinity: NaNs propagate; rem(Inf, b) is invalid. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* b is NaN or infinity: rem(a, Inf) is a, exactly. */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(a, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;  /* rem(+/-0, b) = +/-0 */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the integer bit explicit and left-justify both significands. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce iteratively, consuming up to 62 quotient bits per pass via
     * a 128/64 division estimate biased low so it never overshoots. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step one quotient past the remainder, then keep whichever of the
     * last two partial remainders is nearer; ties go to the even q. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );  /* log2(+/-0) = -Inf */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;  /* log2(+Inf) = +Inf */
    }

    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;  /* integer part of the log in the high bits */
    /* Produce fraction bits one per iteration: squaring the significand
     * doubles the log; a carry into bit 53 means the next bit is 1. */
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN, the
| largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* Treat NaNs as positive so the largest positive integer results. */
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    /* Clamp the shift to 1 so overflowing values keep enough bits for
     * roundAndPackInt32 to detect the overflow. */
    if ( shiftCount <= 0 ) shiftCount = 1;
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaNs are forced positive). */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; flag inexact unless a is exactly 0. */
        if (aExp || aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;  /* truncation == round toward zero */
    z = aSig;
    if ( aSign ) z = - z;
    /* Sign flip means the magnitude did not fit in 31 bits. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    /* Any bits shifted out make the truncated result inexact. */
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN,
| the largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
5632 *----------------------------------------------------------------------------*/ 5633 5634 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5635 { 5636 bool aSign; 5637 int32_t aExp, shiftCount; 5638 uint64_t aSig, aSigExtra; 5639 5640 if (floatx80_invalid_encoding(a)) { 5641 float_raise(float_flag_invalid, status); 5642 return 1ULL << 63; 5643 } 5644 aSig = extractFloatx80Frac( a ); 5645 aExp = extractFloatx80Exp( a ); 5646 aSign = extractFloatx80Sign( a ); 5647 shiftCount = 0x403E - aExp; 5648 if ( shiftCount <= 0 ) { 5649 if ( shiftCount ) { 5650 float_raise(float_flag_invalid, status); 5651 if (!aSign || floatx80_is_any_nan(a)) { 5652 return INT64_MAX; 5653 } 5654 return INT64_MIN; 5655 } 5656 aSigExtra = 0; 5657 } 5658 else { 5659 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5660 } 5661 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5662 5663 } 5664 5665 /*---------------------------------------------------------------------------- 5666 | Returns the result of converting the extended double-precision floating- 5667 | point value `a' to the 64-bit two's complement integer format. The 5668 | conversion is performed according to the IEC/IEEE Standard for Binary 5669 | Floating-Point Arithmetic, except that the conversion is always rounded 5670 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5671 | Otherwise, if the conversion overflows, the largest integer with the same 5672 | sign as `a' is returned. 
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* Magnitude >= 2^63.  The only representable value in this range
         * is exactly INT64_MIN: sign=1, exp=0x403E (a.high == 0xC03E),
         * significand 0x8000000000000000 (all bits below the integer
         * bit clear). */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; flag inexact unless a is exactly 0. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );  /* truncation == round toward zero */
    /* Any bits shifted out make the truncated result inexact. */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/ 5721 5722 float32 floatx80_to_float32(floatx80 a, float_status *status) 5723 { 5724 bool aSign; 5725 int32_t aExp; 5726 uint64_t aSig; 5727 5728 if (floatx80_invalid_encoding(a)) { 5729 float_raise(float_flag_invalid, status); 5730 return float32_default_nan(status); 5731 } 5732 aSig = extractFloatx80Frac( a ); 5733 aExp = extractFloatx80Exp( a ); 5734 aSign = extractFloatx80Sign( a ); 5735 if ( aExp == 0x7FFF ) { 5736 if ( (uint64_t) ( aSig<<1 ) ) { 5737 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5738 status); 5739 return float32_silence_nan(res, status); 5740 } 5741 return packFloat32( aSign, 0xFF, 0 ); 5742 } 5743 shift64RightJamming( aSig, 33, &aSig ); 5744 if ( aExp || aSig ) aExp -= 0x3F81; 5745 return roundAndPackFloat32(aSign, aExp, aSig, status); 5746 5747 } 5748 5749 /*---------------------------------------------------------------------------- 5750 | Returns the result of converting the extended double-precision floating- 5751 | point value `a' to the double-precision floating-point format. The 5752 | conversion is performed according to the IEC/IEEE Standard for Binary 5753 | Floating-Point Arithmetic. 
5754 *----------------------------------------------------------------------------*/ 5755 5756 float64 floatx80_to_float64(floatx80 a, float_status *status) 5757 { 5758 bool aSign; 5759 int32_t aExp; 5760 uint64_t aSig, zSig; 5761 5762 if (floatx80_invalid_encoding(a)) { 5763 float_raise(float_flag_invalid, status); 5764 return float64_default_nan(status); 5765 } 5766 aSig = extractFloatx80Frac( a ); 5767 aExp = extractFloatx80Exp( a ); 5768 aSign = extractFloatx80Sign( a ); 5769 if ( aExp == 0x7FFF ) { 5770 if ( (uint64_t) ( aSig<<1 ) ) { 5771 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5772 status); 5773 return float64_silence_nan(res, status); 5774 } 5775 return packFloat64( aSign, 0x7FF, 0 ); 5776 } 5777 shift64RightJamming( aSig, 1, &zSig ); 5778 if ( aExp || aSig ) aExp -= 0x3C01; 5779 return roundAndPackFloat64(aSign, aExp, zSig, status); 5780 5781 } 5782 5783 /*---------------------------------------------------------------------------- 5784 | Returns the result of converting the extended double-precision floating- 5785 | point value `a' to the quadruple-precision floating-point format. The 5786 | conversion is performed according to the IEC/IEEE Standard for Binary 5787 | Floating-Point Arithmetic. 
5788 *----------------------------------------------------------------------------*/ 5789 5790 float128 floatx80_to_float128(floatx80 a, float_status *status) 5791 { 5792 bool aSign; 5793 int aExp; 5794 uint64_t aSig, zSig0, zSig1; 5795 5796 if (floatx80_invalid_encoding(a)) { 5797 float_raise(float_flag_invalid, status); 5798 return float128_default_nan(status); 5799 } 5800 aSig = extractFloatx80Frac( a ); 5801 aExp = extractFloatx80Exp( a ); 5802 aSign = extractFloatx80Sign( a ); 5803 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5804 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5805 status); 5806 return float128_silence_nan(res, status); 5807 } 5808 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5809 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5810 5811 } 5812 5813 /*---------------------------------------------------------------------------- 5814 | Rounds the extended double-precision floating-point value `a' 5815 | to the precision provided by floatx80_rounding_precision and returns the 5816 | result as an extended double-precision floating-point value. 5817 | The operation is performed according to the IEC/IEEE Standard for Binary 5818 | Floating-Point Arithmetic. 5819 *----------------------------------------------------------------------------*/ 5820 5821 floatx80 floatx80_round(floatx80 a, float_status *status) 5822 { 5823 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5824 extractFloatx80Sign(a), 5825 extractFloatx80Exp(a), 5826 extractFloatx80Frac(a), 0, status); 5827 } 5828 5829 /*---------------------------------------------------------------------------- 5830 | Rounds the extended double-precision floating-point value `a' to an integer, 5831 | and returns the result as an extended quadruple-precision floating-point 5832 | value. The operation is performed according to the IEC/IEEE Standard for 5833 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Exponent >= 63: every finite value is already an integer. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* |a| < 1: the result is 0 or +/-1 depending on rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;  /* exact zero */
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Only values with |a| > 1/2 round away from zero here;
             * |a| == 1/2 exactly ties to the even integer, 0. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* |a| >= 1/2 rounds to +/-1. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* 1 <= |a| < 2^63: round by manipulating the fraction bits below
     * the units bit directly in z.low. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;  /* bit with weight 1 */
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* If the round bits were exactly half, clear the units bit to
         * break the tie toward even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* Carry out of the significand: bump the exponent (in z.high) and
     * restore the explicit integer bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;  /* Inf + finite = Inf */
        }
        if ( bExp == 0 ) --expDiff;  /* subnormals have exponent 1, not 0 */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;  /* subnormals have exponent 1, not 0 */
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum may carry out of bit 63. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;  /* Inf + Inf, same sign */
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* High bit set means no carry occurred; otherwise renormalize. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the addition: shift right one (jamming the lost bit)
     * and restore the explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Subnormals have an effective exponent of 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exactly equal magnitudes: the zero result's sign follows the
     * rounding mode (negative only when rounding down). */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite - Inf: result is Inf with the opposite sign. */
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;  /* subnormals have exponent 1, not 0 */
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;  /* |b| > |a| flips the result's sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;  /* Inf - finite = Inf */
    }
    if ( bExp == 0 ) --expDiff;  /* subnormals have exponent 1, not 0 */
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*---------------------------------------------------------------------------- 6082 | Returns the result of adding the extended double-precision floating-point 6083 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6084 | Standard for Binary Floating-Point Arithmetic. 6085 *----------------------------------------------------------------------------*/ 6086 6087 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6088 { 6089 bool aSign, bSign; 6090 6091 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6092 float_raise(float_flag_invalid, status); 6093 return floatx80_default_nan(status); 6094 } 6095 aSign = extractFloatx80Sign( a ); 6096 bSign = extractFloatx80Sign( b ); 6097 if ( aSign == bSign ) { 6098 return addFloatx80Sigs(a, b, aSign, status); 6099 } 6100 else { 6101 return subFloatx80Sigs(a, b, aSign, status); 6102 } 6103 6104 } 6105 6106 /*---------------------------------------------------------------------------- 6107 | Returns the result of subtracting the extended double-precision floating- 6108 | point values `a' and `b'. The operation is performed according to the 6109 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Like signs subtract the magnitudes; unlike signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;  /* Inf * 0 */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
            /* 0 * Inf is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* The 128-bit product of two normalized significands may need a
     * one-bit left shift to renormalize. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    /* Unsupported encodings always yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is an invalid operation. */
            goto invalid;
        }
        /* Inf / finite = Inf. */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf = 0. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; nonzero / 0 signals divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-scale the dividend so the quotient estimate cannot overflow. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct downward until the
     * partial remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word; refine it only when the estimate is close enough
     * to a rounding boundary to matter, and fold any leftover remainder
     * into the sticky bit. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    /* Unsupported encodings always yield the default NaN. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* aExpOrig keeps the pre-normalization exponent so pseudo-denormals
     * (exponent 0 with the integer bit set) can be detected later. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* rem(Inf, x) is invalid; NaN operands propagate first. */
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* rem(x, Inf) = x. */
        return a;
    }
    if ( bExp == 0 ) {
        /* rem(x, 0) is an invalid operation. */
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: the remainder is `a' itself (truncating mode, or
         * the magnitude gap is at least two binades). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    /* Long division, 62 quotient bits per iteration; the estimate is
     * biased low by 2 so the partial remainder stays non-negative. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    /* Final partial step for the remaining 0..64 quotient bits. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder rounds the quotient to nearest-even: pick the
         * smaller of r and b-r (ties go to the even quotient). */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
{
    /* Thin wrapper: IEEE remainder (round-to-nearest quotient), quotient
     * value itself discarded. */
    uint64_t quotient;
    return floatx80_modrem(a, b, false, &quotient, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b', with the quotient truncated
| toward zero.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
{
    /* Thin wrapper: truncating (fmod-style) remainder. */
    uint64_t quotient;
    return floatx80_modrem(a, b, true, &quotient, status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unsupported encodings always yield the default NaN. */
    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt of a negative number is invalid, but -0 passes through. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent; the (aExp & 1) shift below keeps the
     * radicand aligned for odd exponents. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    /* One Newton-like refinement of the 32-bit estimate, then correct
     * the 64-bit root downward until the remainder is non-negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low word of the root; refine only near rounding boundaries and
     * fold the remaining remainder into the sticky bit. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.
The conversion 6508 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6509 | Arithmetic---which means in particular that the conversion is rounded 6510 | according to the current rounding mode. If `a' is a NaN, the largest 6511 | positive integer is returned. Otherwise, if the conversion overflows, the 6512 | largest integer with the same sign as `a' is returned. 6513 *----------------------------------------------------------------------------*/ 6514 6515 int32_t float128_to_int32(float128 a, float_status *status) 6516 { 6517 bool aSign; 6518 int32_t aExp, shiftCount; 6519 uint64_t aSig0, aSig1; 6520 6521 aSig1 = extractFloat128Frac1( a ); 6522 aSig0 = extractFloat128Frac0( a ); 6523 aExp = extractFloat128Exp( a ); 6524 aSign = extractFloat128Sign( a ); 6525 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6526 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6527 aSig0 |= ( aSig1 != 0 ); 6528 shiftCount = 0x4028 - aExp; 6529 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6530 return roundAndPackInt32(aSign, aSig0, status); 6531 6532 } 6533 6534 /*---------------------------------------------------------------------------- 6535 | Returns the result of converting the quadruple-precision floating-point 6536 | value `a' to the 32-bit two's complement integer format. The conversion 6537 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6538 | Arithmetic, except that the conversion is always rounded toward zero. If 6539 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6540 | conversion overflows, the largest integer with the same sign as `a' is 6541 | returned. 
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into the sticky bit of the high word. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaNs are forced positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to zero; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* A sign flip after negation means the value did not fit in 32 bits. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the truncation was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude >= 2^63: overflow.  Only exactly -2^63 (and
             * negative non-NaN overflows) map to INT64_MIN; NaNs and
             * positive overflows map to INT64_MAX. */
            float_raise(float_flag_invalid, status);
            if (    ! aSign
                 || (    ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift right keeping round/sticky bits for the rounding step. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Special case: values in [-2^63, -2^63 + 2^(63-112)) truncate
             * exactly (or inexactly) to INT64_MIN without being invalid. */
            if (    ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Any fraction bits left behind make the truncation inexact. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values with magnitude > 1 (and NaNs, which carry the sign
     * bit unchanged) cannot convert: NaN -> UINT64_MAX, else 0. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Make the implicit integer bit explicit for normal numbers. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            /* Magnitude >= 2^64: unsigned overflow. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        /* Shift right keeping round/sticky bits for the rounding step. */
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    /* Truncating variant: temporarily force round-to-zero, then restore
     * the caller's rounding mode. */
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6750 | Returns the result of converting the quadruple-precision floating-point 6751 | value `a' to the 32-bit unsigned integer format. The conversion 6752 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6753 | Arithmetic except that the conversion is always rounded toward zero. 6754 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6755 | if the conversion overflows, the largest unsigned integer is returned. 6756 | If 'a' is negative, the value is rounded and zero is returned; negative 6757 | values that do not round to zero will raise the inexact exception. 6758 *----------------------------------------------------------------------------*/ 6759 6760 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6761 { 6762 uint64_t v; 6763 uint32_t res; 6764 int old_exc_flags = get_float_exception_flags(status); 6765 6766 v = float128_to_uint64_round_to_zero(a, status); 6767 if (v > 0xffffffff) { 6768 res = 0xffffffff; 6769 } else { 6770 return v; 6771 } 6772 set_float_exception_flags(old_exc_flags, status); 6773 float_raise(float_flag_invalid, status); 6774 return res; 6775 } 6776 6777 /*---------------------------------------------------------------------------- 6778 | Returns the result of converting the quadruple-precision floating-point value 6779 | `a' to the 32-bit unsigned integer format. The conversion is 6780 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6781 | Arithmetic---which means in particular that the conversion is rounded 6782 | according to the current rounding mode. If `a' is a NaN, the largest 6783 | positive integer is returned. If the conversion overflows, the 6784 | largest unsigned integer is returned. If 'a' is negative, the value is 6785 | rounded and zero is returned; negative values that do not round to zero 6786 | will raise the inexact exception. 
*----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Result does not fit in 32 bits: drop the flags the 64-bit
     * conversion raised and report the overflow as invalid instead. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the single-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaNs convert via the canonical common-NaN representation;
         * infinities map directly. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low word into the sticky bit, narrow the fraction, and
     * rebias the exponent for roundAndPackFloat32. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.
  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaNs convert via the canonical common-NaN representation;
         * infinities map directly. */
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the 112-bit fraction, fold the remaining low bits into the
     * sticky bit, and rebias the exponent for roundAndPackFloat64. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* NaN payload converts via the common-NaN representation and
             * is then quieted for the floatx80 result. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Align the significand to floatx80's explicit-integer-bit layout. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Fraction (if any) lives entirely in the low 64 bits. */
        if ( 0x406F <= aExp ) {
            /* Already integral (or Inf); only NaNs need propagation. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the lowest integer-valued bit; it is 0 when
         * that bit is bit 0 of z.high (the fraction fills all of z.low). */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                /* Tie: clear the last bit to round to even. */
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is 0 or +/-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Fraction spans the high word; the low word is discarded. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Tie (round bits and low word all zero): round to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Inexact whenever the rounded result differs from the input. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is Inf or NaN; NaN operands propagate. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf * 0 is an invalid operation. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* 0 * Inf is an invalid operation. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Pre-shift b's significand for alignment; the add128 below accounts
     * for b's implicit integer bit (a * 2^112 term of the product). */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Unpack both operands: sign, biased exponent, 112-bit fraction. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* 'a' is NaN or infinity. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is an invalid operation. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity == signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* zero / zero is an invalid operation. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* Nonzero / zero: raise divide-by-zero, return infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Difference of exponents, re-biased to match the significand
     * alignment chosen below.
     */
    zExp = aExp - bExp + 0x3FFD;
    /* Left-justify both significands (explicit integer bit moves to bit 63
     * of the high word).
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure aSig < bSig so each 64-bit quotient estimate stays in range. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate, then correct downward while the
     * remainder is negative (the estimate can be slightly high).
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low 64 quotient bits; only compute an exact sticky bit when the raw
     * estimate is close enough to a rounding boundary to matter.
     */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );  /* sticky bit */
    }
    /* Shift back into canonical position, jamming discarded bits. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* Unpack operands (b's sign is irrelevant to the IEEE remainder). */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; remainder of an infinity is invalid. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite REM infinity == the finite operand, unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Remainder by zero is an invalid operation. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b| / 2: 'a' is already the (nearest) remainder. */
    if ( expDiff < -1 ) return a;
    /* Left-justify both significands with explicit integer bits; 'a' gets
     * one bit less when its exponent is smaller, keeping aSig < 2 * bSig.
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit, with the corresponding subtraction. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Peel off partial quotients 61 bits at a time.  'q' is deliberately
     * reduced by 4 so the estimate can never exceed the true quotient, which
     * keeps the partial remainder non-negative.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial quotient, scaled to the remaining exponent gap. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract 'b' until the remainder goes negative, remembering the last
     * non-negative remainder ...
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* ... then pick whichever of the two candidates is nearer to zero
     * (sigMean is their sum, i.e. twice the midpoint); ties go to even 'q'.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1,
        (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    /* bExp - 4 compensates the working-precision shifts above —
     * NOTE(review): inherited SoftFloat scaling, not re-derived here.
     */
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; sqrt(+inf) == +inf; sqrt(-inf) is invalid. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) == -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the unbiased input exponent, re-biased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);  /* explicit integer bit */
    /* 32-bit seed, refined to ~64 bits via r' = q/r + r style iteration;
     * odd/even exponents get different significand alignment.
     */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word downward while rem = aSig - zSig0^2 < 0.
     * (x-1)^2 = x^2 - 2x + 1, hence the add of doubleZSig0 | 1.
     */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits; exact sticky computation only near a rounding
     * boundary.
     */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );  /* sticky bit */
    }
    /* Shift back into canonical position, jamming discarded bits. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/* Common body for floatx80 compare / compare_quiet.  Returns a
 * FloatRelation; 'is_quiet' suppresses the invalid flag for quiet NaNs
 * (signaling NaNs always raise it).
 */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Bit patterns that are not valid floatx80 encodings: invalid op. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Either operand NaN (exp all-ones, nonzero fraction below the explicit
     * integer bit): unordered.
     */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Opposite signs: equal only when both are zeros (+0 == -0);
         * otherwise the sign decides (aSign set -> a < b -> -1).
         */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison: exponent 0 with the
         * explicit integer bit set denotes the same value as exponent 1.
         */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude comparison, direction flipped for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling compare: any NaN operand raises the invalid flag. */
FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

/* Quiet compare: only signaling NaNs raise the invalid flag. */
FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

/* Common body for float128 compare / compare_quiet; same contract as the
 * floatx80 variant above, minus the invalid-encoding check.
 */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Either operand NaN: unordered. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: equal only for +0 vs -0 (the <<1 drops the sign
         * bits); otherwise the sign decides.
         */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude comparison, direction flipped for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling compare: any NaN operand raises the invalid flag. */
FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

/* Quiet compare: only signaling NaNs raise the invalid flag. */
FloatRelation float128_compare_quiet(float128 a, float128 b,
                                     float_status *status)
{
    return float128_compare_internal(a, b, 1, status);
}

/* Returns a * 2^n, with the usual NaN/infinity/zero special cases. */
floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* NaN propagates; infinity scales to itself. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;           /* +/-0 is unchanged */
        }
        /* Denormal: effective exponent is 1, not 0 — NOTE(review):
         * normalizeRoundAndPackFloatx80 appears to expect this; confirm.
         */
        aExp++;
    }

    /* Clamp n so aExp + n stays far from int32 overflow; the result
     * saturates to overflow/underflow in rounding anyway.
     */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

/* Returns a * 2^n for quadruple precision. */
float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaN propagates; infinity scales to itself. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        aSig0 |= UINT64_C(0x0001000000000000);  /* explicit integer bit */
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;               /* +/-0 is unchanged */
    } else {
        aExp++;                 /* denormal: effective exponent is 1 */
    }

    /* Clamp n so aExp + n stays far from int32 overflow. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* The -1 pairs with the normalize/round helper's exponent convention
     * for a significand carrying an explicit integer bit — NOTE(review):
     * inherited convention, confirm against normalizeRoundAndPackFloat128.
     */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}

/* Runs automatically at program load (GCC/Clang constructor attribute):
 * probes the host libm fma() and sets force_soft_fma (declared elsewhere in
 * this file) when the host result is wrong, so the hardfloat fast path is
 * skipped.
 */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}