1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Macro-generated input-flush helpers.
 *
 * GEN_INPUT_FLUSH__NOCHECK expands to a function that replaces a denormal
 * input with a same-signed zero and raises the input-denormal flag; it does
 * NOT check whether flushing is enabled.  The GEN_INPUT_FLUSH{1,2,3}
 * variants wrap it for 1/2/3 operands, first testing
 * s->flush_inputs_to_zero so the common (no-flush) path returns early.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* Flush two inputs, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* Flush three inputs, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
/* On x86_64, use fpclassify only for the double-precision helpers. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
/* Default: always use the softfloat float32/64_* primitives. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
/* When hardfloat exists, keep the softfloat slow path out of line. */
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Return true if the host-FPU fast path may be used.
 * Requires that hardfloat is enabled for this build, that the inexact flag
 * is already set (so new inexact results need not be detected), and that
 * the rounding mode is round-to-nearest-even.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
 * float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* View of a float32 as either softfloat bits (s) or a host float (h). */
typedef union {
    float32 s;
    float h;
} union_float32;

/* View of a float64 as either softfloat bits (s) or a host double (h). */
typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates run before/after the host-FPU computation. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat fallback and host-FPU implementations of a 2-input op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input single-precision wrapper: run @hard on the host FPU when
 * can_use_fpu() and the caller-supplied @pre check allow it, otherwise fall
 * back to @soft.  An infinite fast-path result raises overflow; a result
 * with magnitude <= FLT_MIN for which @post holds is recomputed in softfloat
 * (so denormal/underflow flags come out right).
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* An infinite result is taken to indicate overflow; @pre is
         * expected to have excluded inf/NaN inputs. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Possibly subnormal result: redo in softfloat for exact flags. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* Double-precision counterpart of float32_gen2; see comment above. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Bit within a class mask corresponding to a FloatClass value. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero = float_cmask(float_class_zero),
    float_cmask_normal = float_cmask(float_class_normal),
    float_cmask_inf = float_cmask(float_class_inf),
    float_cmask_qnan = float_cmask(float_class_qnan),
    float_cmask_snan = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* qnan and snan are the last two enumerators, so >= qnan covers both. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
/*
 * Note: (-F - 1) & 63 == 63 - F for the fraction sizes used here, i.e. the
 * left shift that places an F-bit fraction just below the binary point at
 * DECOMPOSED_BINARY_POINT.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size = E,                                                   \
    .exp_bias = ((1 << E) - 1) >> 1,                                 \
    .exp_max = (1 << E) - 1,                                         \
    .frac_size = F,                                                  \
    .frac_shift = (-F - 1) & 63,                                     \
    .frac_lsb = 1ull << ((-F - 1) & 63),                             \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),                           \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,                     \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

587 static const FloatFmt bfloat16_params = { 588 FLOAT_PARAMS(8, 7) 589 }; 590 591 static const FloatFmt float32_params = { 592 FLOAT_PARAMS(8, 23) 593 }; 594 595 static const FloatFmt float64_params = { 596 FLOAT_PARAMS(11, 52) 597 }; 598 599 static const FloatFmt float128_params = { 600 FLOAT_PARAMS(15, 112) 601 }; 602 603 /* Unpack a float to parts, but do not canonicalize. */ 604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw) 605 { 606 const int f_size = fmt->frac_size; 607 const int e_size = fmt->exp_size; 608 609 *r = (FloatParts64) { 610 .cls = float_class_unclassified, 611 .sign = extract64(raw, f_size + e_size, 1), 612 .exp = extract64(raw, f_size, e_size), 613 .frac = extract64(raw, 0, f_size) 614 }; 615 } 616 617 static inline void float16_unpack_raw(FloatParts64 *p, float16 f) 618 { 619 unpack_raw64(p, &float16_params, f); 620 } 621 622 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f) 623 { 624 unpack_raw64(p, &bfloat16_params, f); 625 } 626 627 static inline void float32_unpack_raw(FloatParts64 *p, float32 f) 628 { 629 unpack_raw64(p, &float32_params, f); 630 } 631 632 static inline void float64_unpack_raw(FloatParts64 *p, float64 f) 633 { 634 unpack_raw64(p, &float64_params, f); 635 } 636 637 static void float128_unpack_raw(FloatParts128 *p, float128 f) 638 { 639 const int f_size = float128_params.frac_size - 64; 640 const int e_size = float128_params.exp_size; 641 642 *p = (FloatParts128) { 643 .cls = float_class_unclassified, 644 .sign = extract64(f.high, f_size + e_size, 1), 645 .exp = extract64(f.high, f_size, e_size), 646 .frac_hi = extract64(f.high, 0, f_size), 647 .frac_lo = f.low, 648 }; 649 } 650 651 /* Pack a float from parts, but do not canonicalize. 
*/ 652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt) 653 { 654 const int f_size = fmt->frac_size; 655 const int e_size = fmt->exp_size; 656 uint64_t ret; 657 658 ret = (uint64_t)p->sign << (f_size + e_size); 659 ret = deposit64(ret, f_size, e_size, p->exp); 660 ret = deposit64(ret, 0, f_size, p->frac); 661 return ret; 662 } 663 664 static inline float16 float16_pack_raw(const FloatParts64 *p) 665 { 666 return make_float16(pack_raw64(p, &float16_params)); 667 } 668 669 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p) 670 { 671 return pack_raw64(p, &bfloat16_params); 672 } 673 674 static inline float32 float32_pack_raw(const FloatParts64 *p) 675 { 676 return make_float32(pack_raw64(p, &float32_params)); 677 } 678 679 static inline float64 float64_pack_raw(const FloatParts64 *p) 680 { 681 return make_float64(pack_raw64(p, &float64_params)); 682 } 683 684 static float128 float128_pack_raw(const FloatParts128 *p) 685 { 686 const int f_size = float128_params.frac_size - 64; 687 const int e_size = float128_params.exp_size; 688 uint64_t hi; 689 690 hi = (uint64_t)p->sign << (f_size + e_size); 691 hi = deposit64(hi, f_size, e_size, p->exp); 692 hi = deposit64(hi, 0, f_size, p->frac_hi); 693 return make_float128(hi, p->frac_lo); 694 } 695 696 /*---------------------------------------------------------------------------- 697 | Functions and definitions to determine: (1) whether tininess for underflow 698 | is detected before or after rounding by default, (2) what (if anything) 699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 701 | are propagated from function inputs to output. These details are target- 702 | specific. 
703 *----------------------------------------------------------------------------*/ 704 #include "softfloat-specialize.c.inc" 705 706 #define PARTS_GENERIC_64_128(NAME, P) \ 707 QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME) 708 709 #define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S) 710 #define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S) 711 712 static void parts64_return_nan(FloatParts64 *a, float_status *s); 713 static void parts128_return_nan(FloatParts128 *a, float_status *s); 714 715 #define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S) 716 717 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b, 718 float_status *s); 719 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b, 720 float_status *s); 721 722 #define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S) 723 724 /* 725 * Helper functions for softfloat-parts.c.inc, per-size operations. 726 */ 727 728 #define FRAC_GENERIC_64_128(NAME, P) \ 729 QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME) 730 731 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b) 732 { 733 return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1; 734 } 735 736 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b) 737 { 738 uint64_t ta = a->frac_hi, tb = b->frac_hi; 739 if (ta == tb) { 740 ta = a->frac_lo, tb = b->frac_lo; 741 if (ta == tb) { 742 return 0; 743 } 744 } 745 return ta < tb ? 
-1 : 1; 746 } 747 748 #define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B) 749 750 static void frac128_shl(FloatParts128 *a, int c) 751 { 752 shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo); 753 } 754 755 #define frac_shl(A, C) frac128_shl(A, C) 756 757 static void frac128_shr(FloatParts128 *a, int c) 758 { 759 shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo); 760 } 761 762 #define frac_shr(A, C) frac128_shr(A, C) 763 764 /* Canonicalize EXP and FRAC, setting CLS. */ 765 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm, 766 float_status *status) 767 { 768 if (part.exp == parm->exp_max && !parm->arm_althp) { 769 if (part.frac == 0) { 770 part.cls = float_class_inf; 771 } else { 772 part.frac <<= parm->frac_shift; 773 part.cls = (parts_is_snan_frac(part.frac, status) 774 ? float_class_snan : float_class_qnan); 775 } 776 } else if (part.exp == 0) { 777 if (likely(part.frac == 0)) { 778 part.cls = float_class_zero; 779 } else if (status->flush_inputs_to_zero) { 780 float_raise(float_flag_input_denormal, status); 781 part.cls = float_class_zero; 782 part.frac = 0; 783 } else { 784 int shift = clz64(part.frac); 785 part.cls = float_class_normal; 786 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 787 part.frac <<= shift; 788 } 789 } else { 790 part.cls = float_class_normal; 791 part.exp -= parm->exp_bias; 792 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 793 } 794 return part; 795 } 796 797 /* Round and uncanonicalize a floating-point number by parts. There 798 * are FRAC_SHIFT bits that may require rounding at the bottom of the 799 * fraction; these bits will be removed. The exponent will be biased 800 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /* Select the rounding increment and the on-overflow policy
         * (overflow_norm: saturate to max normal rather than infinity). */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                if (uadd64_overflow(frac, inc, &frac)) {
                    /* Rounding carried out of the top bit: renormalize. */
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest finite number. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                uint64_t discard;
                /* Tininess after rounding: tiny iff rounding the full
                 * fraction does not carry out of the top bit. */
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the smallest normal number. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/*
 * Select the NaN result of a fused multiply-add among operands a, b, c.
 * Raises invalid for any signaling NaN; pickNaNMulAdd chooses which
 * operand (0/1/2) or the default NaN (3) to propagate.
 */
static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                    bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        float_raise(float_flag_invalid, s);
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        /* Propagate a. */
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        parts_default_nan(&a, s);
        break;
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        parts_silence_nan(&a, s);
    }
    return a;
}

/* Instantiate softfloat-parts.c.inc for the 64-bit parts... */
#define partsN(NAME) parts64_##NAME
#define FloatPartsN FloatParts64

#include "softfloat-parts.c.inc"

/* ...and again for the 128-bit parts. */
#undef partsN
#undef FloatPartsN
#define partsN(NAME) parts128_##NAME
#define FloatPartsN FloatParts128

#include "softfloat-parts.c.inc"

#undef partsN
#undef FloatPartsN

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

/*
 * Unpack the raw bits of a float16 and canonicalize to FloatParts64,
 * using the supplied format parameters (IEEE or ARM alternative HP).
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, params, s);
}

/* As above, with the standard IEEE float16 format. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

/* Unpack and canonicalize a bfloat16 value. */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &bfloat16_params, s);
}

/*
 * Round the canonical parts to the given float16 format (IEEE or AHP)
 * and repack into the raw float16 representation.
 */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    *p = round_canonical(*p, s, params);
    return float16_pack_raw(p);
}

/* As above, with the standard IEEE float16 format. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Round and repack to the bfloat16 representation. */
static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    *p = round_canonical(*p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

/* Unpack and canonicalize a float32 value. */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float32_params, s);
}

/* Round and repack to the float32 representation. */
static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float32_params);
    return float32_pack_raw(p);
}

/* Unpack and canonicalize a float64 value. */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float64_params, s);
}

/* Round and repack to the float64 representation. */
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float64_params);
    return float64_pack_raw(p);
}

/*
 * Returns the result of adding or subtracting the
values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    bool a_sign = a.sign;
    /* Fold the subtract flag into b's sign: a - b == a + (-b). */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the smaller-magnitude operand (with jamming, so no
             * sticky bits are lost) and subtract it from the larger, so
             * the raw difference is non-negative.  */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                /* IEEE-754: an exact zero difference is -0 only when
                 * rounding toward minus infinity.  */
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the implicit bit is the msb again. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf) {
            /* Inf - Inf is invalid; Inf - finite is Inf. */
            if (b.cls == float_class_inf) {
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is -b: in this branch the signs differ, so -b has
             * sign a_sign ^ 1.  */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align exponents, jamming shifted-out bits into the lsb. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of the top bit: renormalize. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(&pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat float32 add/sub, shared slow path for the hardfloat wrappers. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(&pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

/* Softfloat float64 add/sub, shared slow path for the hardfloat wrappers. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float64_round_pack_canonical(&pr, status);
}

static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}

static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}

/* Hardfloat fast paths: perform the operation on the host FPU. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/*
 * Post-condition hook for the hardfloat add/sub/mul paths: rejects the
 * case where both operands are zero, since the sign of a zero result
 * depends on the rounding mode and is left to the soft implementation.
 */
static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

/* Dispatch float32 add/sub through the generic hardfloat/softfloat chooser. */
static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/*
 * Returns the result of adding or subtracting the bfloat16
 * values `a' and `b'.
 */
bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* Full 64x64->128 product; renormalize so the implicit bit is
         * the msb of hi, then jam the low half into the sticky lsb.  */
        mul64To128(a.frac, b.frac, &hi, &lo);
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat float32 multiply, slow path for the hardfloat wrapper. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat float64 multiply, slow path for the hardfloat wrapper. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast paths: multiply on the host FPU. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 */

static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class masks let the special cases below be tested in bulk. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 with no NaN operand: invalid, default NaN. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        /* Inf - Inf is invalid; otherwise the Inf addend dominates. */
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0 of opposite signs: sign from the rounding mode. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Keep the full 128-bit product: the addition below must happen
     * before any rounding to honor the fused semantics.  */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend is the larger operand; align the product. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product is the larger operand; align the addend and
                 * add at full 128-bit precision.  */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger so the
                 * difference is non-negative; flip the sign otherwise. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Complete cancellation: exact zero, sign per mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent. However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it. Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits. */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the low half into the sticky lsb for rounding. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                                int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat float32 fused multiply-add, slow path for the hardfloat wrapper. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat float64 fused multiply-add, slow path for the hardfloat wrapper. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}

/* NOTE(review): only read here; presumably a test/debug knob set
 * elsewhere to force the softfloat fma path — confirm against callers. */
static bool force_soft_fma;

float32 QEMU_FLATTEN
1698 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1699 { 1700 union_float32 ua, ub, uc, ur; 1701 1702 ua.s = xa; 1703 ub.s = xb; 1704 uc.s = xc; 1705 1706 if (unlikely(!can_use_fpu(s))) { 1707 goto soft; 1708 } 1709 if (unlikely(flags & float_muladd_halve_result)) { 1710 goto soft; 1711 } 1712 1713 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1714 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1715 goto soft; 1716 } 1717 1718 if (unlikely(force_soft_fma)) { 1719 goto soft; 1720 } 1721 1722 /* 1723 * When (a || b) == 0, there's no need to check for under/over flow, 1724 * since we know the addend is (normal || 0) and the product is 0. 1725 */ 1726 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1727 union_float32 up; 1728 bool prod_sign; 1729 1730 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1731 prod_sign ^= !!(flags & float_muladd_negate_product); 1732 up.s = float32_set_sign(float32_zero, prod_sign); 1733 1734 if (flags & float_muladd_negate_c) { 1735 uc.h = -uc.h; 1736 } 1737 ur.h = up.h + uc.h; 1738 } else { 1739 union_float32 ua_orig = ua; 1740 union_float32 uc_orig = uc; 1741 1742 if (flags & float_muladd_negate_product) { 1743 ua.h = -ua.h; 1744 } 1745 if (flags & float_muladd_negate_c) { 1746 uc.h = -uc.h; 1747 } 1748 1749 ur.h = fmaf(ua.h, ub.h, uc.h); 1750 1751 if (unlikely(f32_is_inf(ur))) { 1752 float_raise(float_flag_overflow, s); 1753 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1754 ua = ua_orig; 1755 uc = uc_orig; 1756 goto soft; 1757 } 1758 } 1759 if (flags & float_muladd_negate_result) { 1760 return float32_chs(ur.s); 1761 } 1762 return ur.s; 1763 1764 soft: 1765 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1766 } 1767 1768 float64 QEMU_FLATTEN 1769 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1770 { 1771 union_float64 ua, ub, uc, ur; 1772 1773 ua.s = xa; 1774 ub.s = xb; 1775 uc.s = xc; 1776 1777 if (unlikely(!can_use_fpu(s))) { 1778 goto soft; 1779 } 1780 
if (unlikely(flags & float_muladd_halve_result)) { 1781 goto soft; 1782 } 1783 1784 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1785 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1786 goto soft; 1787 } 1788 1789 if (unlikely(force_soft_fma)) { 1790 goto soft; 1791 } 1792 1793 /* 1794 * When (a || b) == 0, there's no need to check for under/over flow, 1795 * since we know the addend is (normal || 0) and the product is 0. 1796 */ 1797 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1798 union_float64 up; 1799 bool prod_sign; 1800 1801 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1802 prod_sign ^= !!(flags & float_muladd_negate_product); 1803 up.s = float64_set_sign(float64_zero, prod_sign); 1804 1805 if (flags & float_muladd_negate_c) { 1806 uc.h = -uc.h; 1807 } 1808 ur.h = up.h + uc.h; 1809 } else { 1810 union_float64 ua_orig = ua; 1811 union_float64 uc_orig = uc; 1812 1813 if (flags & float_muladd_negate_product) { 1814 ua.h = -ua.h; 1815 } 1816 if (flags & float_muladd_negate_c) { 1817 uc.h = -uc.h; 1818 } 1819 1820 ur.h = fma(ua.h, ub.h, uc.h); 1821 1822 if (unlikely(f64_is_inf(ur))) { 1823 float_raise(float_flag_overflow, s); 1824 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1825 ua = ua_orig; 1826 uc = uc_orig; 1827 goto soft; 1828 } 1829 } 1830 if (flags & float_muladd_negate_result) { 1831 return float64_chs(ur.s); 1832 } 1833 return ur.s; 1834 1835 soft: 1836 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1837 } 1838 1839 /* 1840 * Returns the result of multiplying the bfloat16 values `a' 1841 * and `b' then adding 'c', with no intermediate rounding step after the 1842 * multiplication. 
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward. If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat float32 divide, slow path for the hardfloat wrapper. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat float64 divide, slow path for the hardfloat wrapper. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast paths: divide on the host FPU. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * Pre-condition hook for hardfloat division: the dividend must be
 * zero or normal and the divisor strictly normal, so that no invalid,
 * divide-by-zero, or subnormal-input cases reach the host FPU.
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/*
 * Post-condition hook for hardfloat division: reject a zero result,
 * whose sign handling is left to the soft implementation.
 */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                 float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format. Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format. Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* `ieee' selects between IEEE half and ARM alternative half. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Softfloat slow path for float32 -> float64 widening. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results. */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* NaN, Inf, or subnormal: take the softfloat path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        parts_return_nan(&a, s);
        break;

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* scale is a power-of-two pre-adjustment; clamp it so that
         * a.exp cannot overflow when it is added.
         */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: the result is either 0 or +/-1 */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* strict > : an exact 0.5 tie rounds to even, i.e. 0 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Mixed integer/fraction: frac_lsb is the lowest integral
             * bit of the decomposed fraction, rnd_mask covers the bits
             * below it that must end up zero.
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* No increment on an exact tie with an even integer part */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the fraction: renormalize. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Clear the discarded fraction bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 * The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest positive integer is returned. Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */

static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Snapshot the flags: an inexact raised while rounding is dropped
     * again if the conversion turns out to be invalid.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to the largest positive integer. */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Magnitude exceeds 64 bits: certain overflow. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t)min is the magnitude of min without the
             * signed-overflow hazard of -INT64_MIN.
             */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/* Float -> signed integer, with explicit rounding mode and 2**scale
 * pre-multiplication; result saturates to the given [min, max] range.
 */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Float -> signed integer using the rounding mode in the status word. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Float -> signed integer, always truncating toward zero. */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

/* bfloat16 -> signed integer, with explicit rounding mode and scale. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode. If `a' is a NaN,
 * the largest unsigned integer is returned. Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned. If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Snapshot the flags: an inexact raised while rounding is dropped
     * again if the conversion turns out to be invalid.
     */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative non-zero results are invalid and convert to 0. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}

/* Float -> unsigned integer, with explicit rounding mode and 2**scale
 * pre-multiplication; result saturates to [0, max].
 */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}
2768 2769 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2770 float_status *s) 2771 { 2772 FloatParts64 p; 2773 2774 float64_unpack_canonical(&p, a, s); 2775 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2776 } 2777 2778 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2779 float_status *s) 2780 { 2781 FloatParts64 p; 2782 2783 float64_unpack_canonical(&p, a, s); 2784 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2785 } 2786 2787 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2788 float_status *s) 2789 { 2790 FloatParts64 p; 2791 2792 float64_unpack_canonical(&p, a, s); 2793 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2794 } 2795 2796 uint8_t float16_to_uint8(float16 a, float_status *s) 2797 { 2798 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2799 } 2800 2801 uint16_t float16_to_uint16(float16 a, float_status *s) 2802 { 2803 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2804 } 2805 2806 uint32_t float16_to_uint32(float16 a, float_status *s) 2807 { 2808 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2809 } 2810 2811 uint64_t float16_to_uint64(float16 a, float_status *s) 2812 { 2813 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2814 } 2815 2816 uint16_t float32_to_uint16(float32 a, float_status *s) 2817 { 2818 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2819 } 2820 2821 uint32_t float32_to_uint32(float32 a, float_status *s) 2822 { 2823 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2824 } 2825 2826 uint64_t float32_to_uint64(float32 a, float_status *s) 2827 { 2828 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2829 } 2830 2831 uint16_t float64_to_uint16(float64 a, float_status *s) 2832 { 2833 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2834 } 2835 2836 uint32_t 
float64_to_uint32(float64 a, float_status *s) 2837 { 2838 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2839 } 2840 2841 uint64_t float64_to_uint64(float64 a, float_status *s) 2842 { 2843 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2844 } 2845 2846 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2847 { 2848 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2849 } 2850 2851 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2852 { 2853 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2854 } 2855 2856 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2857 { 2858 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2859 } 2860 2861 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2862 { 2863 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2864 } 2865 2866 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2867 { 2868 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2869 } 2870 2871 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2872 { 2873 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2874 } 2875 2876 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2877 { 2878 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2879 } 2880 2881 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2882 { 2883 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2884 } 2885 2886 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2887 { 2888 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2889 } 2890 2891 /* 2892 * Returns the result of converting the bfloat16 value `a' to 2893 * the unsigned integer format. 
2894 */ 2895 2896 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2897 int scale, float_status *s) 2898 { 2899 FloatParts64 p; 2900 2901 bfloat16_unpack_canonical(&p, a, s); 2902 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2903 } 2904 2905 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2906 int scale, float_status *s) 2907 { 2908 FloatParts64 p; 2909 2910 bfloat16_unpack_canonical(&p, a, s); 2911 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2912 } 2913 2914 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2915 int scale, float_status *s) 2916 { 2917 FloatParts64 p; 2918 2919 bfloat16_unpack_canonical(&p, a, s); 2920 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2921 } 2922 2923 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2924 { 2925 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2926 } 2927 2928 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2929 { 2930 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2931 } 2932 2933 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2934 { 2935 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2936 } 2937 2938 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2939 { 2940 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2941 } 2942 2943 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2944 { 2945 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2946 } 2947 2948 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 2949 { 2950 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2951 } 2952 2953 /* 2954 * Integer to float conversions 2955 * 2956 * Returns the result of converting the two's complement integer `a' 2957 * to the floating-point format. 
The conversion is performed according 2958 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2959 */ 2960 2961 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status) 2962 { 2963 FloatParts64 r = { .sign = false }; 2964 2965 if (a == 0) { 2966 r.cls = float_class_zero; 2967 } else { 2968 uint64_t f = a; 2969 int shift; 2970 2971 r.cls = float_class_normal; 2972 if (a < 0) { 2973 f = -f; 2974 r.sign = true; 2975 } 2976 shift = clz64(f); 2977 scale = MIN(MAX(scale, -0x10000), 0x10000); 2978 2979 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2980 r.frac = f << shift; 2981 } 2982 2983 return r; 2984 } 2985 2986 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2987 { 2988 FloatParts64 pa = int_to_float(a, scale, status); 2989 return float16_round_pack_canonical(&pa, status); 2990 } 2991 2992 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2993 { 2994 return int64_to_float16_scalbn(a, scale, status); 2995 } 2996 2997 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2998 { 2999 return int64_to_float16_scalbn(a, scale, status); 3000 } 3001 3002 float16 int64_to_float16(int64_t a, float_status *status) 3003 { 3004 return int64_to_float16_scalbn(a, 0, status); 3005 } 3006 3007 float16 int32_to_float16(int32_t a, float_status *status) 3008 { 3009 return int64_to_float16_scalbn(a, 0, status); 3010 } 3011 3012 float16 int16_to_float16(int16_t a, float_status *status) 3013 { 3014 return int64_to_float16_scalbn(a, 0, status); 3015 } 3016 3017 float16 int8_to_float16(int8_t a, float_status *status) 3018 { 3019 return int64_to_float16_scalbn(a, 0, status); 3020 } 3021 3022 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 3023 { 3024 FloatParts64 pa = int_to_float(a, scale, status); 3025 return float32_round_pack_canonical(&pa, status); 3026 } 3027 3028 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 3029 { 
3030 return int64_to_float32_scalbn(a, scale, status); 3031 } 3032 3033 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 3034 { 3035 return int64_to_float32_scalbn(a, scale, status); 3036 } 3037 3038 float32 int64_to_float32(int64_t a, float_status *status) 3039 { 3040 return int64_to_float32_scalbn(a, 0, status); 3041 } 3042 3043 float32 int32_to_float32(int32_t a, float_status *status) 3044 { 3045 return int64_to_float32_scalbn(a, 0, status); 3046 } 3047 3048 float32 int16_to_float32(int16_t a, float_status *status) 3049 { 3050 return int64_to_float32_scalbn(a, 0, status); 3051 } 3052 3053 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 3054 { 3055 FloatParts64 pa = int_to_float(a, scale, status); 3056 return float64_round_pack_canonical(&pa, status); 3057 } 3058 3059 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 3060 { 3061 return int64_to_float64_scalbn(a, scale, status); 3062 } 3063 3064 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 3065 { 3066 return int64_to_float64_scalbn(a, scale, status); 3067 } 3068 3069 float64 int64_to_float64(int64_t a, float_status *status) 3070 { 3071 return int64_to_float64_scalbn(a, 0, status); 3072 } 3073 3074 float64 int32_to_float64(int32_t a, float_status *status) 3075 { 3076 return int64_to_float64_scalbn(a, 0, status); 3077 } 3078 3079 float64 int16_to_float64(int16_t a, float_status *status) 3080 { 3081 return int64_to_float64_scalbn(a, 0, status); 3082 } 3083 3084 /* 3085 * Returns the result of converting the two's complement integer `a' 3086 * to the bfloat16 format. 
3087 */ 3088 3089 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) 3090 { 3091 FloatParts64 pa = int_to_float(a, scale, status); 3092 return bfloat16_round_pack_canonical(&pa, status); 3093 } 3094 3095 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status) 3096 { 3097 return int64_to_bfloat16_scalbn(a, scale, status); 3098 } 3099 3100 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status) 3101 { 3102 return int64_to_bfloat16_scalbn(a, scale, status); 3103 } 3104 3105 bfloat16 int64_to_bfloat16(int64_t a, float_status *status) 3106 { 3107 return int64_to_bfloat16_scalbn(a, 0, status); 3108 } 3109 3110 bfloat16 int32_to_bfloat16(int32_t a, float_status *status) 3111 { 3112 return int64_to_bfloat16_scalbn(a, 0, status); 3113 } 3114 3115 bfloat16 int16_to_bfloat16(int16_t a, float_status *status) 3116 { 3117 return int64_to_bfloat16_scalbn(a, 0, status); 3118 } 3119 3120 /* 3121 * Unsigned Integer to float conversions 3122 * 3123 * Returns the result of converting the unsigned integer `a' to the 3124 * floating-point format. The conversion is performed according to the 3125 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 */

/*
 * Convert the unsigned integer `a' (scaled by 2**scale) to the canonical
 * decomposed FloatParts64 form.  This cannot fail or raise exceptions;
 * `status' is unused here and accepted only for interface uniformity
 * with the signed variant.
 */
static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Bound the effective exponent adjustment; anything beyond this
         * range rounds to infinity or zero anyway, and the clamp keeps
         * r.exp from overflowing its int backing store. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        /* Normalize: bring the most significant set bit up to the
         * implicit-bit position. */
        r.frac = a << shift;
    }

    return r;
}

/* Unsigned integer to float16 conversions, optionally scaled by 2**scale. */

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

/* Unsigned integer to float32 conversions, optionally scaled by 2**scale. */

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

/* Unsigned integer to float64 conversions, optionally scaled by 2**scale. */

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
/*
 * Shared worker for min/max/minnum/maxnum/minnummag/maxnummag.
 * `ismin' selects min vs max, `ieee' selects the IEEE 754-2008
 * minNum/maxNum quiet-NaN handling, and `ismag' compares magnitudes
 * before considering signs.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Both NaN (or non-ieee semantics): propagate per pick_nan rules. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map zero and infinity onto the exponent extremes so a single
         * (exp, frac) comparison orders all three classes. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare: only decisive when the magnitudes differ;
         * equal magnitudes fall through to the signed comparison below. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* For negative values the magnitude ordering is reversed. */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Signs differ: the negative operand is the smaller one,
             * which also makes -0 order below +0 here. */
            return a.sign ^ ismin ? b : a;
        }
    }
}

/*
 * Instantiate the public float16/32/64 min/max entry points.
 * NB: the macro parameter `isiee' is a historical misspelling of
 * `isieee'; it is purely internal to the macro.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

/* As above, for the bfloat16 format. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX

/* Floating point compare
 */
/*
 * Compare decomposed values `a' and `b'.  With `is_quiet' set, the
 * invalid exception is raised only for signaling NaNs; otherwise any
 * NaN operand raises it.  NaN operands compare unordered.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both normal with differing signs: the negative one is smaller. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare (exp, frac); reverse the order for negatives. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}

/* Softfloat comparison entry points for each width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/*
 * Hardfloat fast path: use the host's ISO C comparison macros, which
 * never raise exceptions, and fall back to softfloat for the unordered
 * (NaN) case so the status flags are set correctly.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* As f32_compare, for float64. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always use softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
*/ 3603 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s) 3604 { 3605 if (unlikely(is_nan(a.cls))) { 3606 parts_return_nan(&a, s); 3607 } 3608 if (a.cls == float_class_normal) { 3609 /* The largest float type (even though not supported by FloatParts64) 3610 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 3611 * still allows rounding to infinity, without allowing overflow 3612 * within the int32_t that backs FloatParts64.exp. 3613 */ 3614 n = MIN(MAX(n, -0x10000), 0x10000); 3615 a.exp += n; 3616 } 3617 return a; 3618 } 3619 3620 float16 float16_scalbn(float16 a, int n, float_status *status) 3621 { 3622 FloatParts64 pa, pr; 3623 3624 float16_unpack_canonical(&pa, a, status); 3625 pr = scalbn_decomposed(pa, n, status); 3626 return float16_round_pack_canonical(&pr, status); 3627 } 3628 3629 float32 float32_scalbn(float32 a, int n, float_status *status) 3630 { 3631 FloatParts64 pa, pr; 3632 3633 float32_unpack_canonical(&pa, a, status); 3634 pr = scalbn_decomposed(pa, n, status); 3635 return float32_round_pack_canonical(&pr, status); 3636 } 3637 3638 float64 float64_scalbn(float64 a, int n, float_status *status) 3639 { 3640 FloatParts64 pa, pr; 3641 3642 float64_unpack_canonical(&pa, a, status); 3643 pr = scalbn_decomposed(pa, n, status); 3644 return float64_round_pack_canonical(&pr, status); 3645 } 3646 3647 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status) 3648 { 3649 FloatParts64 pa, pr; 3650 3651 bfloat16_unpack_canonical(&pa, a, status); 3652 pr = scalbn_decomposed(pa, n, status); 3653 return bfloat16_round_pack_canonical(&pr, status); 3654 } 3655 3656 /* 3657 * Square Root 3658 * 3659 * The old softfloat code did an approximation step before zeroing in 3660 * on the final result. However for simpleness we just compute the 3661 * square root by iterating down from the implicit bit to enough extra 3662 * bits to ensure we get a correctly rounded result. 
3663 * 3664 * This does mean however the calculation is slower than before, 3665 * especially for 64 bit floats. 3666 */ 3667 3668 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p) 3669 { 3670 uint64_t a_frac, r_frac, s_frac; 3671 int bit, last_bit; 3672 3673 if (is_nan(a.cls)) { 3674 parts_return_nan(&a, s); 3675 return a; 3676 } 3677 if (a.cls == float_class_zero) { 3678 return a; /* sqrt(+-0) = +-0 */ 3679 } 3680 if (a.sign) { 3681 float_raise(float_flag_invalid, s); 3682 parts_default_nan(&a, s); 3683 return a; 3684 } 3685 if (a.cls == float_class_inf) { 3686 return a; /* sqrt(+inf) = +inf */ 3687 } 3688 3689 assert(a.cls == float_class_normal); 3690 3691 /* We need two overflow bits at the top. Adding room for that is a 3692 * right shift. If the exponent is odd, we can discard the low bit 3693 * by multiplying the fraction by 2; that's a left shift. Combine 3694 * those and we shift right by 1 if the exponent is odd, otherwise 2. 3695 */ 3696 a_frac = a.frac >> (2 - (a.exp & 1)); 3697 a.exp >>= 1; 3698 3699 /* Bit-by-bit computation of sqrt. */ 3700 r_frac = 0; 3701 s_frac = 0; 3702 3703 /* Iterate from implicit bit down to the 3 extra bits to compute a 3704 * properly rounded result. Remember we've inserted two more bits 3705 * at the top, so these positions are two less. 3706 */ 3707 bit = DECOMPOSED_BINARY_POINT - 2; 3708 last_bit = MAX(p->frac_shift - 4, 0); 3709 do { 3710 uint64_t q = 1ULL << bit; 3711 uint64_t t_frac = s_frac + q; 3712 if (t_frac <= a_frac) { 3713 s_frac = t_frac + q; 3714 a_frac -= t_frac; 3715 r_frac += q; 3716 } 3717 a_frac <<= 1; 3718 } while (--bit >= last_bit); 3719 3720 /* Undo the right shift done above. If there is any remaining 3721 * fraction, the result is inexact. Set the sticky bit. 
3722 */ 3723 a.frac = (r_frac << 2) + (a_frac != 0); 3724 3725 return a; 3726 } 3727 3728 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) 3729 { 3730 FloatParts64 pa, pr; 3731 3732 float16_unpack_canonical(&pa, a, status); 3733 pr = sqrt_float(pa, status, &float16_params); 3734 return float16_round_pack_canonical(&pr, status); 3735 } 3736 3737 static float32 QEMU_SOFTFLOAT_ATTR 3738 soft_f32_sqrt(float32 a, float_status *status) 3739 { 3740 FloatParts64 pa, pr; 3741 3742 float32_unpack_canonical(&pa, a, status); 3743 pr = sqrt_float(pa, status, &float32_params); 3744 return float32_round_pack_canonical(&pr, status); 3745 } 3746 3747 static float64 QEMU_SOFTFLOAT_ATTR 3748 soft_f64_sqrt(float64 a, float_status *status) 3749 { 3750 FloatParts64 pa, pr; 3751 3752 float64_unpack_canonical(&pa, a, status); 3753 pr = sqrt_float(pa, status, &float64_params); 3754 return float64_round_pack_canonical(&pr, status); 3755 } 3756 3757 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s) 3758 { 3759 union_float32 ua, ur; 3760 3761 ua.s = xa; 3762 if (unlikely(!can_use_fpu(s))) { 3763 goto soft; 3764 } 3765 3766 float32_input_flush1(&ua.s, s); 3767 if (QEMU_HARDFLOAT_1F32_USE_FP) { 3768 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3769 fpclassify(ua.h) == FP_ZERO) || 3770 signbit(ua.h))) { 3771 goto soft; 3772 } 3773 } else if (unlikely(!float32_is_zero_or_normal(ua.s) || 3774 float32_is_neg(ua.s))) { 3775 goto soft; 3776 } 3777 ur.h = sqrtf(ua.h); 3778 return ur.s; 3779 3780 soft: 3781 return soft_f32_sqrt(ua.s, s); 3782 } 3783 3784 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s) 3785 { 3786 union_float64 ua, ur; 3787 3788 ua.s = xa; 3789 if (unlikely(!can_use_fpu(s))) { 3790 goto soft; 3791 } 3792 3793 float64_input_flush1(&ua.s, s); 3794 if (QEMU_HARDFLOAT_1F64_USE_FP) { 3795 if (unlikely(!(fpclassify(ua.h) == FP_NORMAL || 3796 fpclassify(ua.h) == FP_ZERO) || 3797 signbit(ua.h))) { 3798 goto soft; 3799 } 3800 } else if 
(unlikely(!float64_is_zero_or_normal(ua.s) || 3801 float64_is_neg(ua.s))) { 3802 goto soft; 3803 } 3804 ur.h = sqrt(ua.h); 3805 return ur.s; 3806 3807 soft: 3808 return soft_f64_sqrt(ua.s, s); 3809 } 3810 3811 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status) 3812 { 3813 FloatParts64 pa, pr; 3814 3815 bfloat16_unpack_canonical(&pa, a, status); 3816 pr = sqrt_float(pa, status, &bfloat16_params); 3817 return bfloat16_round_pack_canonical(&pr, status); 3818 } 3819 3820 /*---------------------------------------------------------------------------- 3821 | The pattern for a default generated NaN. 3822 *----------------------------------------------------------------------------*/ 3823 3824 float16 float16_default_nan(float_status *status) 3825 { 3826 FloatParts64 p; 3827 3828 parts_default_nan(&p, status); 3829 p.frac >>= float16_params.frac_shift; 3830 return float16_pack_raw(&p); 3831 } 3832 3833 float32 float32_default_nan(float_status *status) 3834 { 3835 FloatParts64 p; 3836 3837 parts_default_nan(&p, status); 3838 p.frac >>= float32_params.frac_shift; 3839 return float32_pack_raw(&p); 3840 } 3841 3842 float64 float64_default_nan(float_status *status) 3843 { 3844 FloatParts64 p; 3845 3846 parts_default_nan(&p, status); 3847 p.frac >>= float64_params.frac_shift; 3848 return float64_pack_raw(&p); 3849 } 3850 3851 float128 float128_default_nan(float_status *status) 3852 { 3853 FloatParts128 p; 3854 3855 parts_default_nan(&p, status); 3856 frac_shr(&p, float128_params.frac_shift); 3857 return float128_pack_raw(&p); 3858 } 3859 3860 bfloat16 bfloat16_default_nan(float_status *status) 3861 { 3862 FloatParts64 p; 3863 3864 parts_default_nan(&p, status); 3865 p.frac >>= bfloat16_params.frac_shift; 3866 return bfloat16_pack_raw(&p); 3867 } 3868 3869 /*---------------------------------------------------------------------------- 3870 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 
3871 *----------------------------------------------------------------------------*/ 3872 3873 float16 float16_silence_nan(float16 a, float_status *status) 3874 { 3875 FloatParts64 p; 3876 3877 float16_unpack_raw(&p, a); 3878 p.frac <<= float16_params.frac_shift; 3879 parts_silence_nan(&p, status); 3880 p.frac >>= float16_params.frac_shift; 3881 return float16_pack_raw(&p); 3882 } 3883 3884 float32 float32_silence_nan(float32 a, float_status *status) 3885 { 3886 FloatParts64 p; 3887 3888 float32_unpack_raw(&p, a); 3889 p.frac <<= float32_params.frac_shift; 3890 parts_silence_nan(&p, status); 3891 p.frac >>= float32_params.frac_shift; 3892 return float32_pack_raw(&p); 3893 } 3894 3895 float64 float64_silence_nan(float64 a, float_status *status) 3896 { 3897 FloatParts64 p; 3898 3899 float64_unpack_raw(&p, a); 3900 p.frac <<= float64_params.frac_shift; 3901 parts_silence_nan(&p, status); 3902 p.frac >>= float64_params.frac_shift; 3903 return float64_pack_raw(&p); 3904 } 3905 3906 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3907 { 3908 FloatParts64 p; 3909 3910 bfloat16_unpack_raw(&p, a); 3911 p.frac <<= bfloat16_params.frac_shift; 3912 parts_silence_nan(&p, status); 3913 p.frac >>= bfloat16_params.frac_shift; 3914 return bfloat16_pack_raw(&p); 3915 } 3916 3917 float128 float128_silence_nan(float128 a, float_status *status) 3918 { 3919 FloatParts128 p; 3920 3921 float128_unpack_raw(&p, a); 3922 frac_shl(&p, float128_params.frac_shift); 3923 parts_silence_nan(&p, status); 3924 frac_shr(&p, float128_params.frac_shift); 3925 return float128_pack_raw(&p); 3926 } 3927 3928 /*---------------------------------------------------------------------------- 3929 | If `a' is denormal and we are in flush-to-zero mode then set the 3930 | input-denormal exception and return zero. Otherwise just return the value. 
3931 *----------------------------------------------------------------------------*/ 3932 3933 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3934 { 3935 if (p.exp == 0 && p.frac != 0) { 3936 float_raise(float_flag_input_denormal, status); 3937 return true; 3938 } 3939 3940 return false; 3941 } 3942 3943 float16 float16_squash_input_denormal(float16 a, float_status *status) 3944 { 3945 if (status->flush_inputs_to_zero) { 3946 FloatParts64 p; 3947 3948 float16_unpack_raw(&p, a); 3949 if (parts_squash_denormal(p, status)) { 3950 return float16_set_sign(float16_zero, p.sign); 3951 } 3952 } 3953 return a; 3954 } 3955 3956 float32 float32_squash_input_denormal(float32 a, float_status *status) 3957 { 3958 if (status->flush_inputs_to_zero) { 3959 FloatParts64 p; 3960 3961 float32_unpack_raw(&p, a); 3962 if (parts_squash_denormal(p, status)) { 3963 return float32_set_sign(float32_zero, p.sign); 3964 } 3965 } 3966 return a; 3967 } 3968 3969 float64 float64_squash_input_denormal(float64 a, float_status *status) 3970 { 3971 if (status->flush_inputs_to_zero) { 3972 FloatParts64 p; 3973 3974 float64_unpack_raw(&p, a); 3975 if (parts_squash_denormal(p, status)) { 3976 return float64_set_sign(float64_zero, p.sign); 3977 } 3978 } 3979 return a; 3980 } 3981 3982 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3983 { 3984 if (status->flush_inputs_to_zero) { 3985 FloatParts64 p; 3986 3987 bfloat16_unpack_raw(&p, a); 3988 if (parts_squash_denormal(p, status)) { 3989 return bfloat16_set_sign(bfloat16_zero, p.sign); 3990 } 3991 } 3992 return a; 3993 } 3994 3995 /*---------------------------------------------------------------------------- 3996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3997 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3998 | input. If `zSign' is 1, the input is negated before being converted to an 3999 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input 4000 | is simply rounded to an integer, with the inexact exception raised if the 4001 | input cannot be represented exactly as an integer. However, if the fixed- 4002 | point input is too large, the invalid exception is raised and the largest 4003 | positive or negative integer is returned. 4004 *----------------------------------------------------------------------------*/ 4005 4006 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 4007 float_status *status) 4008 { 4009 int8_t roundingMode; 4010 bool roundNearestEven; 4011 int8_t roundIncrement, roundBits; 4012 int32_t z; 4013 4014 roundingMode = status->float_rounding_mode; 4015 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4016 switch (roundingMode) { 4017 case float_round_nearest_even: 4018 case float_round_ties_away: 4019 roundIncrement = 0x40; 4020 break; 4021 case float_round_to_zero: 4022 roundIncrement = 0; 4023 break; 4024 case float_round_up: 4025 roundIncrement = zSign ? 0 : 0x7f; 4026 break; 4027 case float_round_down: 4028 roundIncrement = zSign ? 0x7f : 0; 4029 break; 4030 case float_round_to_odd: 4031 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 4032 break; 4033 default: 4034 abort(); 4035 } 4036 roundBits = absZ & 0x7F; 4037 absZ = ( absZ + roundIncrement )>>7; 4038 if (!(roundBits ^ 0x40) && roundNearestEven) { 4039 absZ &= ~1; 4040 } 4041 z = absZ; 4042 if ( zSign ) z = - z; 4043 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 4044 float_raise(float_flag_invalid, status); 4045 return zSign ? 
INT32_MIN : INT32_MAX; 4046 } 4047 if (roundBits) { 4048 float_raise(float_flag_inexact, status); 4049 } 4050 return z; 4051 4052 } 4053 4054 /*---------------------------------------------------------------------------- 4055 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4056 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4057 | and returns the properly rounded 64-bit integer corresponding to the input. 4058 | If `zSign' is 1, the input is negated before being converted to an integer. 4059 | Ordinarily, the fixed-point input is simply rounded to an integer, with 4060 | the inexact exception raised if the input cannot be represented exactly as 4061 | an integer. However, if the fixed-point input is too large, the invalid 4062 | exception is raised and the largest positive or negative integer is 4063 | returned. 4064 *----------------------------------------------------------------------------*/ 4065 4066 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 4067 float_status *status) 4068 { 4069 int8_t roundingMode; 4070 bool roundNearestEven, increment; 4071 int64_t z; 4072 4073 roundingMode = status->float_rounding_mode; 4074 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4075 switch (roundingMode) { 4076 case float_round_nearest_even: 4077 case float_round_ties_away: 4078 increment = ((int64_t) absZ1 < 0); 4079 break; 4080 case float_round_to_zero: 4081 increment = 0; 4082 break; 4083 case float_round_up: 4084 increment = !zSign && absZ1; 4085 break; 4086 case float_round_down: 4087 increment = zSign && absZ1; 4088 break; 4089 case float_round_to_odd: 4090 increment = !(absZ0 & 1) && absZ1; 4091 break; 4092 default: 4093 abort(); 4094 } 4095 if ( increment ) { 4096 ++absZ0; 4097 if ( absZ0 == 0 ) goto overflow; 4098 if (!(absZ1 << 1) && roundNearestEven) { 4099 absZ0 &= ~1; 4100 } 4101 } 4102 z = absZ0; 4103 if ( zSign ) z = - z; 4104 if ( z && ( ( z < 0 ) ^ zSign 
) ) { 4105 overflow: 4106 float_raise(float_flag_invalid, status); 4107 return zSign ? INT64_MIN : INT64_MAX; 4108 } 4109 if (absZ1) { 4110 float_raise(float_flag_inexact, status); 4111 } 4112 return z; 4113 4114 } 4115 4116 /*---------------------------------------------------------------------------- 4117 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4118 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4119 | and returns the properly rounded 64-bit unsigned integer corresponding to the 4120 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 4121 | with the inexact exception raised if the input cannot be represented exactly 4122 | as an integer. However, if the fixed-point input is too large, the invalid 4123 | exception is raised and the largest unsigned integer is returned. 4124 *----------------------------------------------------------------------------*/ 4125 4126 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 4127 uint64_t absZ1, float_status *status) 4128 { 4129 int8_t roundingMode; 4130 bool roundNearestEven, increment; 4131 4132 roundingMode = status->float_rounding_mode; 4133 roundNearestEven = (roundingMode == float_round_nearest_even); 4134 switch (roundingMode) { 4135 case float_round_nearest_even: 4136 case float_round_ties_away: 4137 increment = ((int64_t)absZ1 < 0); 4138 break; 4139 case float_round_to_zero: 4140 increment = 0; 4141 break; 4142 case float_round_up: 4143 increment = !zSign && absZ1; 4144 break; 4145 case float_round_down: 4146 increment = zSign && absZ1; 4147 break; 4148 case float_round_to_odd: 4149 increment = !(absZ0 & 1) && absZ1; 4150 break; 4151 default: 4152 abort(); 4153 } 4154 if (increment) { 4155 ++absZ0; 4156 if (absZ0 == 0) { 4157 float_raise(float_flag_invalid, status); 4158 return UINT64_MAX; 4159 } 4160 if (!(absZ1 << 1) && roundNearestEven) { 4161 absZ0 &= ~1; 4162 } 4163 } 4164 4165 if (zSign && absZ0) { 4166 
float_raise(float_flag_invalid, status); 4167 return 0; 4168 } 4169 4170 if (absZ1) { 4171 float_raise(float_flag_inexact, status); 4172 } 4173 return absZ0; 4174 } 4175 4176 /*---------------------------------------------------------------------------- 4177 | Normalizes the subnormal single-precision floating-point value represented 4178 | by the denormalized significand `aSig'. The normalized exponent and 4179 | significand are stored at the locations pointed to by `zExpPtr' and 4180 | `zSigPtr', respectively. 4181 *----------------------------------------------------------------------------*/ 4182 4183 static void 4184 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4185 { 4186 int8_t shiftCount; 4187 4188 shiftCount = clz32(aSig) - 8; 4189 *zSigPtr = aSig<<shiftCount; 4190 *zExpPtr = 1 - shiftCount; 4191 4192 } 4193 4194 /*---------------------------------------------------------------------------- 4195 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4196 | and significand `zSig', and returns the proper single-precision floating- 4197 | point value corresponding to the abstract input. Ordinarily, the abstract 4198 | value is simply rounded and packed into the single-precision format, with 4199 | the inexact exception raised if the abstract input cannot be represented 4200 | exactly. However, if the abstract value is too large, the overflow and 4201 | inexact exceptions are raised and an infinity or maximal finite value is 4202 | returned. If the abstract value is too small, the input value is rounded to 4203 | a subnormal number, and the underflow and inexact exceptions are raised if 4204 | the abstract input cannot be represented exactly as a subnormal single- 4205 | precision floating-point number. 4206 | The input significand `zSig' has its binary point between bits 30 4207 | and 29, which is 7 bits to the left of the usual location. This shifted 4208 | significand must be normalized or smaller. 
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* The 7 low bits of zSig are below the rounding point; pick the
     * increment that implements the current rounding mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned compare catches both overflow (zExp >= 0xFD) and
     * underflow (zExp < 0, which wraps to a large uint16_t). */
    if (0xFD <= (uint16_t) zExp) {
        if ((0xFD < zExp)
            || ((zExp == 0xFD)
                && ((int32_t) (zSig + roundIncrement) < 0))) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* frac -0 packs to infinity; frac -1 borrows from the exponent
             * field, giving the largest finite number instead. */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if (zExp < 0) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = (zSig + roundIncrement) >> 7;
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        /* Exactly halfway: clear the low bit to round to even. */
        zSig &= ~1;
    }
    if (zSig == 0) {
        zExp = 0;
    }
    return packFloat32(zSign, zExp, zSig);
}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
4300 *----------------------------------------------------------------------------*/ 4301 4302 static float32 4303 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig, 4304 float_status *status) 4305 { 4306 int8_t shiftCount; 4307 4308 shiftCount = clz32(zSig) - 1; 4309 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 4310 status); 4311 4312 } 4313 4314 /*---------------------------------------------------------------------------- 4315 | Normalizes the subnormal double-precision floating-point value represented 4316 | by the denormalized significand `aSig'. The normalized exponent and 4317 | significand are stored at the locations pointed to by `zExpPtr' and 4318 | `zSigPtr', respectively. 4319 *----------------------------------------------------------------------------*/ 4320 4321 static void 4322 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 4323 { 4324 int8_t shiftCount; 4325 4326 shiftCount = clz64(aSig) - 11; 4327 *zSigPtr = aSig<<shiftCount; 4328 *zExpPtr = 1 - shiftCount; 4329 4330 } 4331 4332 /*---------------------------------------------------------------------------- 4333 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 4334 | double-precision floating-point value, returning the result. After being 4335 | shifted into the proper positions, the three fields are simply added 4336 | together to form the result. This means that any integer portion of `zSig' 4337 | will be added into the exponent. Since a properly normalized significand 4338 | will have an integer portion equal to 1, the `zExp' input should be 1 less 4339 | than the desired result exponent whenever `zSig' is a complete, normalized 4340 | significand. 
4341 *----------------------------------------------------------------------------*/ 4342 4343 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig) 4344 { 4345 4346 return make_float64( 4347 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 4348 4349 } 4350 4351 /*---------------------------------------------------------------------------- 4352 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4353 | and significand `zSig', and returns the proper double-precision floating- 4354 | point value corresponding to the abstract input. Ordinarily, the abstract 4355 | value is simply rounded and packed into the double-precision format, with 4356 | the inexact exception raised if the abstract input cannot be represented 4357 | exactly. However, if the abstract value is too large, the overflow and 4358 | inexact exceptions are raised and an infinity or maximal finite value is 4359 | returned. If the abstract value is too small, the input value is rounded to 4360 | a subnormal number, and the underflow and inexact exceptions are raised if 4361 | the abstract input cannot be represented exactly as a subnormal double- 4362 | precision floating-point number. 4363 | The input significand `zSig' has its binary point between bits 62 4364 | and 61, which is 10 bits to the left of the usual location. This shifted 4365 | significand must be normalized or smaller. If `zSig' is not normalized, 4366 | `zExp' must be 0; in that case, the result returned is a subnormal number, 4367 | and it must not require rounding. In the usual case that `zSig' is 4368 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 4369 | The handling of underflow and overflow follows the IEC/IEEE Standard for 4370 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose the value added to the 10 guard bits below the result's LSB
     * so that the subsequent ">>10" implements the requested rounding. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;  /* half of the guard-bit range */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round away from zero only if that makes the result's LSB odd. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned compare also catches zExp < 0, so one test guards both
     * the overflow and the underflow paths. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Exponent 0x7FF with significand 0 packs to infinity; with
             * significand -1 the packed sum wraps down to the largest
             * finite magnitude instead. */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess is detected either before rounding, or after when
             * the rounded significand would still lack the implicit bit. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* roundBits == 0x200 is an exact tie: clear the LSB for ties-to-even. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Shift the MSB up to bit 62 and let roundAndPackFloat64 finish. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4475 *----------------------------------------------------------------------------*/ 4476 4477 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4478 uint64_t *zSigPtr) 4479 { 4480 int8_t shiftCount; 4481 4482 shiftCount = clz64(aSig); 4483 *zSigPtr = aSig<<shiftCount; 4484 *zExpPtr = 1 - shiftCount; 4485 } 4486 4487 /*---------------------------------------------------------------------------- 4488 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4489 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4490 | and returns the proper extended double-precision floating-point value 4491 | corresponding to the abstract input. Ordinarily, the abstract value is 4492 | rounded and packed into the extended double-precision format, with the 4493 | inexact exception raised if the abstract input cannot be represented 4494 | exactly. However, if the abstract value is too large, the overflow and 4495 | inexact exceptions are raised and an infinity or maximal finite value is 4496 | returned. If the abstract value is too small, the input value is rounded to 4497 | a subnormal number, and the underflow and inexact exceptions are raised if 4498 | the abstract input cannot be represented exactly as a subnormal extended 4499 | double-precision floating-point number. 4500 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4501 | number of bits as single or double precision, respectively. Otherwise, the 4502 | result is rounded to the full precision of the extended double-precision 4503 | format. 4504 | The input significand must be normalized or smaller. If the input 4505 | significand is not normalized, `zExp' must be 0; in that case, the result 4506 | returned is a subnormal number, and it must not require rounding. The 4507 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4508 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced-precision (32/64-bit) path: round within zSig0 using a mask
     * covering the bits that do not survive at that precision. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold zSig1 into zSig0's LSB as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;               /* keep the half-ulp increment chosen above */
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare of zExp - 1 catches both zExp <= 0 and zExp near
     * the top of the range with a single test. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess before rounding, or after if the increment does not
             * carry out of the significand. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into bit 63 means the result became normal again. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the result's LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of the significand: renormalize to 1.0 x 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 64-bit-significand path: zSig1 holds the bits below the LSB. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);   /* top bit of zSig1 = half ulp */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* roundMask = 0 so ~roundMask below yields the all-ones
             * significand of the maximal finite value. */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* zSig1 changed above, so recompute the rounding decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie (only the top bit of zSig1 set): force even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Increment wrapped: renormalize to 1.0 x 2^(zExp+1). */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word in its place. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4743 *----------------------------------------------------------------------------*/ 4744 4745 static inline uint64_t extractFloat128Frac0( float128 a ) 4746 { 4747 4748 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4749 4750 } 4751 4752 /*---------------------------------------------------------------------------- 4753 | Returns the exponent bits of the quadruple-precision floating-point value 4754 | `a'. 4755 *----------------------------------------------------------------------------*/ 4756 4757 static inline int32_t extractFloat128Exp( float128 a ) 4758 { 4759 4760 return ( a.high>>48 ) & 0x7FFF; 4761 4762 } 4763 4764 /*---------------------------------------------------------------------------- 4765 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4766 *----------------------------------------------------------------------------*/ 4767 4768 static inline bool extractFloat128Sign(float128 a) 4769 { 4770 return a.high >> 63; 4771 } 4772 4773 /*---------------------------------------------------------------------------- 4774 | Normalizes the subnormal quadruple-precision floating-point value 4775 | represented by the denormalized significand formed by the concatenation of 4776 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4777 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4778 | significand are stored at the location pointed to by `zSig0Ptr', and the 4779 | least significant 64 bits of the normalized significand are stored at the 4780 | location pointed to by `zSig1Ptr'. 
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* Significand lives entirely in the low word; position its leading
         * bit at bit 48 of the high word (shiftCount may be negative when
         * the leading bit is already above bit 48 of aSig1). */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            *zSig0Ptr = aSig1>>( - shiftCount );
            /* "& 63" reduces the negative count to the complementary left
             * shift for the bits that remain in the low word. */
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        /* Leading bit is in the high word: shift it up to bit 48. */
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
4825 *----------------------------------------------------------------------------*/ 4826 4827 static inline float128 4828 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4829 { 4830 float128 z; 4831 4832 z.low = zSig1; 4833 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4834 return z; 4835 } 4836 4837 /*---------------------------------------------------------------------------- 4838 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4839 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4840 | and `zSig2', and returns the proper quadruple-precision floating-point value 4841 | corresponding to the abstract input. Ordinarily, the abstract value is 4842 | simply rounded and packed into the quadruple-precision format, with the 4843 | inexact exception raised if the abstract input cannot be represented 4844 | exactly. However, if the abstract value is too large, the overflow and 4845 | inexact exceptions are raised and an infinity or maximal finite value is 4846 | returned. If the abstract value is too small, the input value is rounded to 4847 | a subnormal number, and the underflow and inexact exceptions are raised if 4848 | the abstract input cannot be represented exactly as a subnormal quadruple- 4849 | precision floating-point number. 4850 | The input significand must be normalized or smaller. If the input 4851 | significand is not normalized, `zExp' must be 0; in that case, the result 4852 | returned is a subnormal number, and it must not require rounding. In the 4853 | usual case that the input significand is normalized, `zExp' must be 1 less 4854 | than the ``true'' floating-point exponent. The handling of underflow and 4855 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the result's LSB; decide whether they
     * round the zSig0:zSig1 significand up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);   /* top bit of zSig2 = half ulp */
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the LSB would otherwise stay even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned compare also catches zExp < 0, so one test guards both
     * the overflow and the underflow paths. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Directed modes away from the overflow direction (and
             * round-to-odd) saturate at the maximal finite value. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess before rounding, or after when the rounded value
             * would still be below the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The significand changed, so recompute the rounding decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (only the top bit of zSig2 set): force an even LSB. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 4974 | to the abstract input. This routine is just like `roundAndPackFloat128' 4975 | except that the input significand has fewer bits and does not have to be 4976 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4977 | point exponent. 4978 *----------------------------------------------------------------------------*/ 4979 4980 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4981 uint64_t zSig0, uint64_t zSig1, 4982 float_status *status) 4983 { 4984 int8_t shiftCount; 4985 uint64_t zSig2; 4986 4987 if ( zSig0 == 0 ) { 4988 zSig0 = zSig1; 4989 zSig1 = 0; 4990 zExp -= 64; 4991 } 4992 shiftCount = clz64(zSig0) - 15; 4993 if ( 0 <= shiftCount ) { 4994 zSig2 = 0; 4995 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4996 } 4997 else { 4998 shift128ExtraRightJamming( 4999 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 5000 } 5001 zExp -= shiftCount; 5002 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 5003 5004 } 5005 5006 5007 /*---------------------------------------------------------------------------- 5008 | Returns the result of converting the 32-bit two's complement integer `a' 5009 | to the extended double-precision floating-point format. The conversion 5010 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5011 | Arithmetic. 5012 *----------------------------------------------------------------------------*/ 5013 5014 floatx80 int32_to_floatx80(int32_t a, float_status *status) 5015 { 5016 bool zSign; 5017 uint32_t absA; 5018 int8_t shiftCount; 5019 uint64_t zSig; 5020 5021 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5022 zSign = ( a < 0 ); 5023 absA = zSign ? 
- a : a; 5024 shiftCount = clz32(absA) + 32; 5025 zSig = absA; 5026 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 5027 5028 } 5029 5030 /*---------------------------------------------------------------------------- 5031 | Returns the result of converting the 32-bit two's complement integer `a' to 5032 | the quadruple-precision floating-point format. The conversion is performed 5033 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5034 *----------------------------------------------------------------------------*/ 5035 5036 float128 int32_to_float128(int32_t a, float_status *status) 5037 { 5038 bool zSign; 5039 uint32_t absA; 5040 int8_t shiftCount; 5041 uint64_t zSig0; 5042 5043 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5044 zSign = ( a < 0 ); 5045 absA = zSign ? - a : a; 5046 shiftCount = clz32(absA) + 17; 5047 zSig0 = absA; 5048 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5049 5050 } 5051 5052 /*---------------------------------------------------------------------------- 5053 | Returns the result of converting the 64-bit two's complement integer `a' 5054 | to the extended double-precision floating-point format. The conversion 5055 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5056 | Arithmetic. 5057 *----------------------------------------------------------------------------*/ 5058 5059 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5060 { 5061 bool zSign; 5062 uint64_t absA; 5063 int8_t shiftCount; 5064 5065 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5066 zSign = ( a < 0 ); 5067 absA = zSign ? 
- a : a; 5068 shiftCount = clz64(absA); 5069 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5070 5071 } 5072 5073 /*---------------------------------------------------------------------------- 5074 | Returns the result of converting the 64-bit two's complement integer `a' to 5075 | the quadruple-precision floating-point format. The conversion is performed 5076 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5077 *----------------------------------------------------------------------------*/ 5078 5079 float128 int64_to_float128(int64_t a, float_status *status) 5080 { 5081 bool zSign; 5082 uint64_t absA; 5083 int8_t shiftCount; 5084 int32_t zExp; 5085 uint64_t zSig0, zSig1; 5086 5087 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5088 zSign = ( a < 0 ); 5089 absA = zSign ? - a : a; 5090 shiftCount = clz64(absA) + 49; 5091 zExp = 0x406E - shiftCount; 5092 if ( 64 <= shiftCount ) { 5093 zSig1 = 0; 5094 zSig0 = absA; 5095 shiftCount -= 64; 5096 } 5097 else { 5098 zSig1 = absA; 5099 zSig0 = 0; 5100 } 5101 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5102 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5103 5104 } 5105 5106 /*---------------------------------------------------------------------------- 5107 | Returns the result of converting the 64-bit unsigned integer `a' 5108 | to the quadruple-precision floating-point format. The conversion is performed 5109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* a fits entirely in the low significand word; the normalize helper
     * shifts it into place and adjusts the exponent. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: convert payload, then quiet it for the x80 result. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Restore the implicit bit and rebias; x80 keeps it explicit. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        /* packFloat128 below adds the now-explicit leading bit into the
         * exponent field, so compensate with one less here. */
        --aExp;
    }
    /* Rebias (0x3FFF - 0x7F = 0x3F80) and left-align the 24-bit
     * significand within the 48 fraction bits of the high word. */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* Special cases: NaN operands propagate; rem(Inf, x) and rem(x, 0)
     * are invalid; rem(x, Inf) == x; rem(0, x) == 0 (sign preserved). */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: a single 64/32-bit division step
         * produces the whole quotient. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: `a' is already the exact remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce 62 quotient bits per pass using
         * the division estimate; the estimate may be up to 2 too large, so
         * subtracting 2 keeps it a guaranteed underestimate. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the true quotient until the remainder goes negative, then
     * pick whichever of the two candidate remainders is nearer to zero
     * (ties go to the even quotient), per the IEEE remainder definition. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x   x    x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
 *----------------------------------------------------------------------------*/

/* Taylor coefficients 1/n! for n = 1..15, as float64 bit patterns. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = 0, 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) = 1 exactly. */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* Compute e^(a*ln2) via the Taylor series, accumulating in float64
     * so the final float32 rounding dominates the error. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    /* Integer part of the log comes from the unbiased exponent; fraction
     * bits are generated one at a time by repeated squaring.
     * NOTE(review): aExp << 23 left-shifts a negative value when aExp < 0x7F,
     * which is undefined behavior in ISO C -- relies on the usual
     * two's-complement compiler behavior; confirm against project policy. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert through the canonical form, then silence any
             * signaling NaN in the result. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias by 0x3C00 = 0x3FFF - 0x3FF and left-justify the 53-bit
     * significand with its integer bit explicit; the conversion is exact. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
 *----------------------------------------------------------------------------*/

float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        /* normalizeFloat64Subnormal leaves the leading 1 explicit; after the
         * 4-bit right shift below it lands one bit above the high fraction,
         * bumping the exponent field by one -- the decrement compensates. */
        --aExp;
    }
    /* Spread the 52 fraction bits across the 112-bit float128 fraction;
     * rebias by 0x3C00 = 0x3FFF - 0x3FF.  The conversion is exact. */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}


/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* Special cases: NaN operands propagate; rem(Inf, x) and rem(x, 0)
     * are invalid; rem(x, Inf) == x; rem(0, x) == 0 (sign preserved). */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit and left-justify. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: `a' is already the exact remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce 62 quotient bits per pass; estimateDiv128To64 may overestimate
     * by up to 2, so subtracting 2 keeps `q' a guaranteed underestimate. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past the true quotient until the remainder goes negative, then
     * pick whichever candidate remainder is nearer to zero (ties go to the
     * even quotient), per the IEEE remainder definition. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;
    }

    /* Integer part of the log comes from the unbiased exponent; fraction
     * bits are generated one at a time by repeated squaring. */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN, the
| largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): 1 << 31 overflows a signed int (UB in ISO C);
         * (int32_t)0x80000000 would express INT32_MIN portably. */
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* A NaN converts to the largest positive integer: drop its sign. */
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    /* Shift so 7 fraction bits remain below the integer for rounding;
     * clamp the count so huge values still jam to a nonzero sticky. */
    shiftCount = 0x4037 - aExp;
    if ( shiftCount <= 0 ) shiftCount = 1;
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, savedASig;
    int32_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): 1 << 31 overflows a signed int (UB in ISO C). */
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow.  A NaN takes the positive path. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp || aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    /* 1 <= shiftCount <= 32 here, so the shifts below are well-defined. */
    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;
    z = aSig;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value didn't fit in 32 bits. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    /* Any bits shifted out mean the truncation was inexact. */
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN,
| the largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
 *----------------------------------------------------------------------------*/

int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): converting 1ULL << 63 to int64_t is
         * implementation-defined; INT64_MIN would be the portable spelling. */
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            /* Exponent too large: overflow (or NaN, which also lands here
             * and maps to INT64_MAX). */
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        /* Exactly 2^63 <= |a| < 2^64: the significand is already an
         * integer; rounding may still flag -2^63 overflow downstream. */
        aSigExtra = 0;
    }
    else {
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
 *----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): converting 1ULL << 63 to int64_t is
         * implementation-defined; INT64_MIN would be the portable spelling. */
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* Magnitude >= 2^63.  Only -2^63 exactly (0xC03E with just the
         * integer bit set) fits; everything else overflows. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    /* Any bits shifted out mean the truncation was inexact. */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* aSig<<1 discards the explicit integer bit: nonzero rest == NaN. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit significand to 31 bits, jamming discarded bits
     * into the sticky bit so rounding sees them; rebias by
     * 0x3F81 = 0x3FFF - 0x7F + 1 (the +1 accounts for the explicit
     * integer bit kept in the significand). */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* aSig<<1 discards the explicit integer bit: nonzero rest == NaN. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Drop one bit (jammed into sticky) so rounding sees the full
     * precision; rebias by 0x3C01 = 0x3FFF - 0x3FF + 1 (the +1 accounts
     * for the explicit integer bit kept in the significand). */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Drop the explicit integer bit (aSig<<1), then spread the remaining
     * 63 fraction bits across the 112-bit float128 fraction.  Both
     * formats share the same exponent bias, so aExp carries over; the
     * conversion is exact. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    /* Unpack and immediately repack: roundAndPackFloatx80 performs the
     * narrowing to the currently selected rounding precision. */
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Magnitude >= 2^63: already an integer (or NaN/Inf). */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: result is 0 or +/-1 depending on rounding mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Rounds to +/-1 only when strictly above one half in
             * magnitude (a tie at exactly 0.5 rounds to even, i.e. 0). */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* Anything >= 0.5 in magnitude rounds away to +/-1. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: lastBitMask isolates the lowest integer-valued bit of
     * the significand; the bits below it are the fraction to round away. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    /* NOTE(review): this switch uses abort() for the unreachable default
     * while the switch above uses g_assert_not_reached() -- inconsistent
     * but behaviorally equivalent here. */
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        /* Add half, then clear the last bit on exact ties (round-to-even). */
        z.low += lastBitMask>>1;
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* A carry out of the significand wrapped z.low to 0: bump the exponent
     * (low bits of z.high) and restore the explicit integer bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* |a| has the larger exponent: align b's significand, collecting
         * the shifted-out bits in zSig1 for rounding. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* bExp == 0 encodes the same scale as bExp == 1 (no implicit bit). */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum may carry out of bit 63. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Bit 63 set means no carry out; the sum is already normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the significand: shift right one (preserving sticky
     * bits in zSig1), restore the integer bit, and bump the exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact zero result: negative only in round-down mode, per IEEE. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* aExp == 0 encodes the same scale as aExp == 1 (no implicit bit). */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}


/*---------------------------------------------------------------------------- 6172 | Returns the result of adding the extended double-precision floating-point 6173 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6174 | Standard for Binary Floating-Point Arithmetic. 6175 *----------------------------------------------------------------------------*/ 6176 6177 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6178 { 6179 bool aSign, bSign; 6180 6181 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6182 float_raise(float_flag_invalid, status); 6183 return floatx80_default_nan(status); 6184 } 6185 aSign = extractFloatx80Sign( a ); 6186 bSign = extractFloatx80Sign( b ); 6187 if ( aSign == bSign ) { 6188 return addFloatx80Sigs(a, b, aSign, status); 6189 } 6190 else { 6191 return subFloatx80Sigs(a, b, aSign, status); 6192 } 6193 6194 } 6195 6196 /*---------------------------------------------------------------------------- 6197 | Returns the result of subtracting the extended double-precision floating- 6198 | point values `a' and `b'. The operation is performed according to the 6199 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Same signs: a - b is a magnitude subtraction; opposite signs:
     * it reduces to a magnitude addition carrying a's sign. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a is NaN, or b is NaN with a infinite: propagate. */
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * Inf is invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* 64x64 -> 128-bit significand product. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product of two normalized significands is in [1, 4); renormalize
     * into [2, 4) (top bit set) if it landed in [1, 2). */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf == 0. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; x / 0 raises divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Keep the dividend below the divisor so the estimated quotient
     * digit fits in 64 bits. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct by adding back bSig
     * while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word; only refine it when it is small enough that
     * the sticky bit could be affected. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* Remember a's raw exponent to detect pseudo-denormals later. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf rem x is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* x rem Inf == x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2 (or truncating mod): a itself is the remainder. */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Produce 62 quotient bits per iteration; the estimate may be up to
     * 2 too large, so bias it down to keep the remainder nonnegative. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step for the remaining expDiff quotient bits,
         * followed by an exact correction loop. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder rounds the quotient to nearest-even: switch to
         * the complementary remainder (and flip the sign) when it is
         * smaller, or on ties when the quotient is odd. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6503 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6504 *----------------------------------------------------------------------------*/ 6505 6506 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6507 { 6508 uint64_t quotient; 6509 return floatx80_modrem(a, b, false, "ient, status); 6510 } 6511 6512 /*---------------------------------------------------------------------------- 6513 | Returns the remainder of the extended double-precision floating-point value 6514 | `a' with respect to the corresponding value `b', with the quotient truncated 6515 | toward zero. 6516 *----------------------------------------------------------------------------*/ 6517 6518 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6519 { 6520 uint64_t quotient; 6521 return floatx80_modrem(a, b, true, "ient, status); 6522 } 6523 6524 /*---------------------------------------------------------------------------- 6525 | Returns the square root of the extended double-precision floating-point 6526 | value `a'. The operation is performed according to the IEC/IEEE Standard 6527 | for Binary Floating-Point Arithmetic. 6528 *----------------------------------------------------------------------------*/ 6529 6530 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6531 { 6532 bool aSign; 6533 int32_t aExp, zExp; 6534 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6535 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6536 6537 if (floatx80_invalid_encoding(a)) { 6538 float_raise(float_flag_invalid, status); 6539 return floatx80_default_nan(status); 6540 } 6541 aSig0 = extractFloatx80Frac( a ); 6542 aExp = extractFloatx80Exp( a ); 6543 aSign = extractFloatx80Sign( a ); 6544 if ( aExp == 0x7FFF ) { 6545 if ((uint64_t)(aSig0 << 1)) { 6546 return propagateFloatx80NaN(a, a, status); 6547 } 6548 if ( ! 
aSign ) return a; 6549 goto invalid; 6550 } 6551 if ( aSign ) { 6552 if ( ( aExp | aSig0 ) == 0 ) return a; 6553 invalid: 6554 float_raise(float_flag_invalid, status); 6555 return floatx80_default_nan(status); 6556 } 6557 if ( aExp == 0 ) { 6558 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6559 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6560 } 6561 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6562 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6563 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6564 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6565 doubleZSig0 = zSig0<<1; 6566 mul64To128( zSig0, zSig0, &term0, &term1 ); 6567 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6568 while ( (int64_t) rem0 < 0 ) { 6569 --zSig0; 6570 doubleZSig0 -= 2; 6571 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6572 } 6573 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6574 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6575 if ( zSig1 == 0 ) zSig1 = 1; 6576 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6577 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6578 mul64To128( zSig1, zSig1, &term2, &term3 ); 6579 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6580 while ( (int64_t) rem1 < 0 ) { 6581 --zSig1; 6582 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6583 term3 |= 1; 6584 term2 |= doubleZSig0; 6585 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6586 } 6587 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6588 } 6589 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6590 zSig0 |= doubleZSig0; 6591 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6592 0, zExp, zSig0, zSig1, status); 6593 } 6594 6595 /*---------------------------------------------------------------------------- 6596 | Returns the result of converting the quadruple-precision floating-point 6597 | value `a' to the 32-bit two's complement integer format. 
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaNs convert as if positive (largest positive integer). */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Low fraction word contributes only to the sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Low fraction word only matters as a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaN converts as positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out make the conversion inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude too large for int64 (or NaN): invalid.
             * Only exactly -2^63 survives as INT64_MIN. */
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Special-case values just at -2^63: representable, possibly
             * inexact; everything else here overflows. */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to 0. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values of magnitude >= 1 (and negative NaNs) are invalid
     * for an unsigned conversion. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Make the implicit integer bit explicit for normal numbers. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    /* Same as float128_to_uint64 but with truncation, implemented by
     * temporarily forcing round-toward-zero mode. */
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6840 | Returns the result of converting the quadruple-precision floating-point 6841 | value `a' to the 32-bit unsigned integer format. The conversion 6842 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6843 | Arithmetic except that the conversion is always rounded toward zero. 6844 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6845 | if the conversion overflows, the largest unsigned integer is returned. 6846 | If 'a' is negative, the value is rounded and zero is returned; negative 6847 | values that do not round to zero will raise the inexact exception. 6848 *----------------------------------------------------------------------------*/ 6849 6850 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6851 { 6852 uint64_t v; 6853 uint32_t res; 6854 int old_exc_flags = get_float_exception_flags(status); 6855 6856 v = float128_to_uint64_round_to_zero(a, status); 6857 if (v > 0xffffffff) { 6858 res = 0xffffffff; 6859 } else { 6860 return v; 6861 } 6862 set_float_exception_flags(old_exc_flags, status); 6863 float_raise(float_flag_invalid, status); 6864 return res; 6865 } 6866 6867 /*---------------------------------------------------------------------------- 6868 | Returns the result of converting the quadruple-precision floating-point value 6869 | `a' to the 32-bit unsigned integer format. The conversion is 6870 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6871 | Arithmetic---which means in particular that the conversion is rounded 6872 | according to the current rounding mode. If `a' is a NaN, the largest 6873 | positive integer is returned. If the conversion overflows, the 6874 | largest unsigned integer is returned. If 'a' is negative, the value is 6875 | rounded and zero is returned; negative values that do not round to zero 6876 | will raise the inexact exception. 
*----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    int old_exc_flags = get_float_exception_flags(status);

    /* Convert via uint64, then clamp to the 32-bit range. */
    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Overflowed uint32: replace the 64-bit conversion's flags with a
     * plain invalid-operation. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the single-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Low fraction word survives only as a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    /* Re-insert the integer bit and rebias the exponent for float32. */
    if ( aExp || zSig ) {
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the fraction; the low word collapses into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    /* Re-insert the integer bit and rebias the exponent for float64. */
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            /* Conversions never signal on quiet NaNs; silence the result. */
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Align to floatx80's explicit-integer-bit layout and round at the
     * full 80-bit precision. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Exponent >= 48: the bits below integer weight lie entirely in the
           low word of the significand. */
        if ( 0x406F <= aExp ) {
            /* Exponent >= 112: every significand bit already has integer
               weight, so only a NaN needs handling. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask is the weight of the last integer bit within z.low.
           Note it is 0 when aExp == 0x406E, i.e. when that bit is the
           implicit bit above z.low (the carry out of z.low). */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half an ulp, then clear the last bit on a tie. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the rounding increment carries straight
                   into z.high; a tie clears the lsb of z.high. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            /* Truncation: just mask off the round bits below. */
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result is 0 or +/-1 depending on the rounding
               mode; exact zeros pass through untouched. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Round to 1 only when strictly above one half. */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* One half or more rounds away to 1. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* 1 <= |a| < 2^48: the round bit lies in the high word; the entire
           low word consists of fraction bits and is discarded. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Ties (no round bits left and a.low was zero) go to even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* Fold the sticky a.low into the round bits first. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change in representation means the result was inexact. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;   /* infinity + finite = infinity */
        }
        if ( bExp == 0 ) {
            /* Subnormal b: its effective exponent is 1, not 0. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);
        }
        /* Shift right, jamming lost bits into the sticky word zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* Mirror case: b has the larger exponent. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both operands (sub)normal with exponent 0: the raw sum is
               already correctly placed; no rounding can be needed. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Sum of two normals is in [2, 4): set bit 49 and renormalize by
           one position below. */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left by 14 to keep guard bits through the
       subtraction; the final pack compensates with `zExp - 14' below. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity - infinity is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Treat subnormals as having the effective exponent 1. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare the 128-bit significands to decide which operand is larger. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact zero result: negative only in round-down mode, per IEEE 754. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite - infinity: infinite result with flipped sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Integer bit is at bit 62 after the 14-bit pre-shift. */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* |b| > |a| flips the result's sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Undo the 14-bit pre-shift via the exponent before packing. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_add(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign == bSign ) {
        /* Same signs: magnitudes add. */
        return addFloat128Sigs(a, b, aSign, status);
    }
    else {
        /* Opposite signs: magnitudes subtract. */
        return subFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign == bSign ) {
        /* Same signs: a - b subtracts magnitudes. */
        return subFloat128Sigs(a, b, aSign, status);
    }
    else {
        /* Opposite signs: a - b adds magnitudes. */
        return addFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* infinity * zero is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Pre-shift b by 16; b's implicit integer bit is NOT set here, which is
       why a's significand is added back into the product just below. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Collapse the lowest product word into a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product in [2, 4): renormalize by one bit. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / infinity = signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* 0 / 0 is invalid. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* nonzero / 0: signal division by zero, return infinity. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Make the integer bits explicit and shift both significands left 15 so
       the quotient estimation below works on full 64-bit words. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Ensure the dividend is strictly below the divisor so the quotient
           digit estimates stay below 2^64. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64-bit quotient digit, then correct any overestimate by adding
       the divisor back while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64-bit quotient digit; refine only when the estimate is close
       enough to a rounding boundary for the error to matter. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Remainder of infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite REM infinity = the finite operand unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Remainder with respect to zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| far below |b|: a is already the remainder. */
    if ( expDiff < -1 ) return a;
    /* Make integer bits explicit; when expDiff == -1, shift a one position
       less so its significand stays comparable to b's. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    while ( 0 < expDiff ) {
        /* Produce 61 quotient bits per iteration; the estimate is lowered
           by 4 so it can never exceed the true quotient. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the partial remainder goes negative, remembering the
       last non-negative value. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = (negative remainder) + (last positive remainder); its sign
       selects the candidate nearer to zero, with ties going to even q
       (round-to-nearest remainder semantics). */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;   /* sqrt(+inf) = +inf */
        goto invalid;              /* sqrt(-inf) is invalid */
    }
    if ( aSign ) {
        /* Negative zero passes through; any other negative is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit initial estimate, refined to 64 bits via one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* Align the radicand on the exponent's parity before refining. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high word downward while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 bits of the root; refine only near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            /* term = 2 * (zSig0:zSig1) + 1, added back to the remainder. */
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Nonzero remainder becomes the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*
 * Compare two extended-precision values; `is_quiet' selects whether quiet
 * NaNs raise the invalid flag (signaling NaNs always do).
 */
static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Invalid encodings are always unordered and always signal. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Any NaN operand makes the comparison unordered. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Opposite signs: equal only when both are zeros (+0 == -0). */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* The non-negative operand (aSign == 0) is the greater one. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same signs: magnitude order, inverted for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling comparison: any NaN raises the invalid flag. */
FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

/* Quiet comparison: only signaling NaNs raise the invalid flag. */
FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

/*
 * Compare two quadruple-precision values; `is_quiet' selects whether quiet
 * NaNs raise the invalid flag (signaling NaNs always do).
 */
static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* Any NaN operand makes the comparison unordered. */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* Opposite signs: equal only when both are zeros (+0 == -0). */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same signs: magnitude order, inverted for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling comparison: any NaN raises the invalid flag. */
FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

/* Quiet comparison: only signaling NaNs raise the invalid flag. */
FloatRelation float128_compare_quiet(float128 a, float128 b,
                                     float_status *status)
{
    return float128_compare_internal(a, b, 1, status);
}

/* Scale an extended-precision value by 2^n, rounding per `status'. */
floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* NaNs propagate; infinities are unchanged by scaling. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;   /* zero is unchanged */
        }
        /* Subnormal: effective exponent is 1. */
        aExp++;
    }

    /* Clamp n so that aExp + n stays far from int32 overflow; the result
       saturates to overflow/underflow in the repack anyway. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

/* Scale a quadruple-precision value by 2^n, rounding per `status'. */
float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* NaNs propagate; infinities are unchanged by scaling. */
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        /* Make the implicit integer bit explicit for the repack below. */
        aSig0 |= UINT64_C(0x0001000000000000);
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;   /* zero is unchanged */
    } else {
        aExp++;     /* subnormal: effective exponent is 1 */
    }

    /* Clamp n to keep the exponent arithmetic within int32 range. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* The -1 compensates for the normalize step in the repack helper. */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}

/* Runs before main(): decide whether the host FMA can back hardfloat. */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;   /* hardfloat disabled at build time; nothing to probe */
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}