1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Flush a denormal input to zero (preserving its sign) and raise the
 * input-denormal exception flag.  This variant performs no
 * flush_inputs_to_zero check; callers must have done that already.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, honoring flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                                  \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* Flush two input operands, honoring flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* Flush three input operands, honoring flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions. Each combination of number of inputs and
 * float size gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Return true if the host FPU may be used for an operation under status @s:
 * hardfloat must be compiled in, the inexact flag must already be set (so
 * there is no need to detect it), and the rounding mode must be
 * round-to-nearest-even.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
 * float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on
 * the compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* View the same bits as softfloat (.s) or as the host FP type (.h). */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates evaluated before/after a hardfloat operation. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat fallback and host-FPU implementations of a two-input op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic two-input float32 operation.  Try the host FPU ('hard') first,
 * falling back to the softfloat implementation ('soft') when the status
 * rules out hardfloat, when the 'pre' check on the (flushed) operands
 * fails, or when the hard result's magnitude is <= FLT_MIN and the 'post'
 * check says the soft path must recompute it.  An infinite hard result
 * raises the overflow flag to match softfloat behavior.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* Double-precision counterpart of float32_gen2; same structure. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Bit mask for a single FloatClass, for building class-set tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size = E,                                                   \
    .exp_bias = ((1 << E) - 1) >> 1,                                 \
    .exp_max = (1 << E) - 1,                                         \
    .frac_size = F,                                                  \
    .frac_shift = (-F - 1) & 63,                                     \
    .frac_lsb = 1ull << ((-F - 1) & 63),                             \
    .frac_lsbm1 = 1ull << ((-F - 2) & 63),                           \
    .round_mask = (1ull << ((-F - 1) & 63)) - 1,                     \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

/* IEEE half precision. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* ARM Alternative Half Precision: same field layout, no Inf/NaN. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize.  */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    /* Field layout, low to high: fraction, exponent, sign. */
    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    /* Only the fraction bits above the low 64 reside in the high word. */
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.  */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine: (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/* Dispatch by parts type: parts128_* for FloatParts128 *, else parts64_*. */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S) PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S) PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S) PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S) PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* Compare fractions: returns -1, 0 or 1 for a < b, a == b, a > b. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;

    if (ta == tb) {
        /* High words equal: the low words decide. */
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B) FRAC_GENERIC_64_128(cmp, A)(A, B)

/* Shift the 128-bit fraction left by c bits. */
static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C) frac128_shl(A, C)

/* Shift the 128-bit fraction right by c bits. */
static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C) frac128_shr(A, C)

/* Canonicalize EXP and FRAC, setting CLS. */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                    float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        /* Maximum exponent (non-AltHP): infinity or NaN. */
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        /* Zero exponent: zero or denormal. */
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Normalize the denormal, compensating in the exponent. */
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        /* Normal number: unbias exponent, make the implicit bit explicit. */
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Select the rounding increment and whether overflow saturates to
         * the maximum normal (overflow_norm) or produces infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                if (uadd64_overflow(frac, inc, &frac)) {
                    /* Rounding carried out of the fraction; renormalize. */
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate to the largest finite value. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Result is denormal (or zero after rounding). */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                /* After-rounding tininess: tiny unless rounding overflows. */
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have promoted the denormal back to normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}


/* Instantiate the parts template for 64-bit and 128-bit fractions. */
#define partsN(NAME)   parts64_##NAME
#define FloatPartsN    FloatParts64

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN
#define partsN(NAME)   parts128_##NAME
#define FloatPartsN    FloatParts128

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

/*
 * Unpack a float16 into canonical FloatParts64 form, using an explicit
 * format descriptor so that both IEEE and ARM-alternative half-precision
 * layouts can share one implementation.
 */
static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, params, s);
}

/* Unpack an IEEE float16 into canonical parts. */
static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

/* Unpack a bfloat16 into canonical parts. */
static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &bfloat16_params, s);
}

/*
 * Round canonical parts and repack as float16, with an explicit format
 * descriptor (IEEE or ARM-alternative half-precision).
 */
static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    *p = round_canonical(*p, s, params);
    return float16_pack_raw(p);
}

/* Round and repack canonical parts as an IEEE float16. */
static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

/* Round and repack canonical parts as a bfloat16. */
static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    *p = round_canonical(*p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

/* Unpack a float32 into canonical parts. */
static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float32_params, s);
}

/* Round and repack canonical parts as a float32. */
static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float32_params);
    return float32_pack_raw(p);
}

/* Unpack a float64 into canonical parts. */
static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float64_params, s);
}

/* Round and repack canonical parts as a float64. */
static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float64_params);
    return float64_pack_raw(p);
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    /*
     * Folding `subtract' into b's sign reduces the problem to a pure
     * magnitude addition (same effective signs) or subtraction
     * (differing effective signs).
     */
    bool a_sign = a.sign;
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /*
             * Subtract the smaller magnitude from the larger; when the
             * operands swap roles the result's sign flips too.  The
             * jamming shift folds shifted-out bits into the lsb so the
             * sticky information survives for rounding.
             */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero sign depends on rounding mode
                 * (negative only for round-down), per IEEE 754. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize so the msb is back at the binary point. */
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf) {
            /* Inf - Inf is invalid; Inf - finite keeps a. */
            if (b.cls == float_class_inf) {
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective (negated) sign. */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align exponents, jamming shifted-out bits into the lsb. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out of the top bit: shift down one and restore
                 * the implicit bit lost to the overflow. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* All class combinations are handled above. */
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(&pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 add/sub (see float32_addsub). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(&pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1188 { 1189 return soft_f32_addsub(a, b, true, status); 1190 } 1191 1192 static float64 QEMU_SOFTFLOAT_ATTR 1193 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status) 1194 { 1195 FloatParts64 pa, pb, pr; 1196 1197 float64_unpack_canonical(&pa, a, status); 1198 float64_unpack_canonical(&pb, b, status); 1199 pr = addsub_floats(pa, pb, subtract, status); 1200 1201 return float64_round_pack_canonical(&pr, status); 1202 } 1203 1204 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1205 { 1206 return soft_f64_addsub(a, b, false, status); 1207 } 1208 1209 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1210 { 1211 return soft_f64_addsub(a, b, true, status); 1212 } 1213 1214 static float hard_f32_add(float a, float b) 1215 { 1216 return a + b; 1217 } 1218 1219 static float hard_f32_sub(float a, float b) 1220 { 1221 return a - b; 1222 } 1223 1224 static double hard_f64_add(double a, double b) 1225 { 1226 return a + b; 1227 } 1228 1229 static double hard_f64_sub(double a, double b) 1230 { 1231 return a - b; 1232 } 1233 1234 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1235 { 1236 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1237 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1238 } 1239 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1240 } 1241 1242 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1243 { 1244 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1245 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1246 } else { 1247 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1248 } 1249 } 1250 1251 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1252 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1253 { 1254 return float32_gen2(a, b, s, hard, soft, 1255 f32_is_zon2, f32_addsubmul_post); 1256 } 1257 1258 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1259 hard_f64_op2_fn hard, 
soft_f64_op2_fn soft) 1260 { 1261 return float64_gen2(a, b, s, hard, soft, 1262 f64_is_zon2, f64_addsubmul_post); 1263 } 1264 1265 float32 QEMU_FLATTEN 1266 float32_add(float32 a, float32 b, float_status *s) 1267 { 1268 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1269 } 1270 1271 float32 QEMU_FLATTEN 1272 float32_sub(float32 a, float32 b, float_status *s) 1273 { 1274 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1275 } 1276 1277 float64 QEMU_FLATTEN 1278 float64_add(float64 a, float64 b, float_status *s) 1279 { 1280 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1281 } 1282 1283 float64 QEMU_FLATTEN 1284 float64_sub(float64 a, float64 b, float_status *s) 1285 { 1286 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1287 } 1288 1289 /* 1290 * Returns the result of adding or subtracting the bfloat16 1291 * values `a' and `b'. 1292 */ 1293 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status) 1294 { 1295 FloatParts64 pa, pb, pr; 1296 1297 bfloat16_unpack_canonical(&pa, a, status); 1298 bfloat16_unpack_canonical(&pb, b, status); 1299 pr = addsub_floats(pa, pb, false, status); 1300 1301 return bfloat16_round_pack_canonical(&pr, status); 1302 } 1303 1304 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status) 1305 { 1306 FloatParts64 pa, pb, pr; 1307 1308 bfloat16_unpack_canonical(&pa, a, status); 1309 bfloat16_unpack_canonical(&pb, b, status); 1310 pr = addsub_floats(pa, pb, true, status); 1311 1312 return bfloat16_round_pack_canonical(&pr, status); 1313 } 1314 1315 /* 1316 * Returns the result of multiplying the floating-point values `a' and 1317 * `b'. The operation is performed according to the IEC/IEEE Standard 1318 * for Binary Floating-Point Arithmetic. 
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /*
         * Multiply the two 64-bit fractions to a 128-bit product; the
         * product of two values with the msb at the binary point lands
         * in either of the top two bits, so at most one normalizing
         * shift is needed.
         */
        mul64To128(a.frac, b.frac, &hi, &lo);
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the low word into the sticky (lsb) bit for rounding. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    /* All class combinations are handled above. */
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 multiply (see float32_mul). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 multiply (see float64_mul). */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU fast paths for the hardfloat multiply dispatchers. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 */

static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class masks let the special-case tests below cover both operands
     * of the product (ab_mask) or all three operands (abc_mask) at once. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    /* Inf * 0 (with no NaN among the operands) is invalid. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a*b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        /* Inf - Inf is invalid; otherwise the Inf addend dominates. */
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0 of opposite signs: sign from the rounding mode. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal && b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Compute the full 128-bit product; no intermediate rounding. */
    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend is the larger (or equal) operand. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product is larger: add at 128-bit width, tracking the
                 * carry out of the top word. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger; if the
                 * operands swap roles the result's sign flips. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero sign from the rounding mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the low word into the sticky (lsb) bit for rounding. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 fused multiply-add (see float32_muladd). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 fused multiply-add (see float64_muladd). */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}

/* When set, force the softfloat muladd path (testing hook). */
static bool force_soft_fma;

/*
 * Fused multiply-add with a host-FPU fast path: use the host's fmaf()
 * when the inputs are zero-or-normal and the status flags permit,
 * falling back to the softfloat implementation otherwise.
 */
float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* Compute the signed zero product explicitly so that the
         * addition below gets the IEEE sign-of-zero rules right. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result may be subnormal or inexactly rounded near the
             * underflow threshold: redo in softfloat for exact flags. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

/*
 * Fused multiply-add with a host-FPU fast path using fma().
 * Mirrors float32_muladd above.
 */
float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): the threshold here is FLT_MIN, not DBL_MIN —
             * presumably a deliberately conservative bound that sends any
             * small result back through softfloat; confirm before changing. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

/*
 * Returns the result of multiplying the bfloat16 values `a'
 * and `b' then adding 'c', with no intermediate rounding step after the
 * multiplication.
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    /* All class combinations are handled above. */
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 divide (see float32_div). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 divide (see float64_div). */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU fast paths for the hardfloat divide dispatchers. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * Pre-condition for the hardfloat divide fast path: the dividend must
 * be zero or normal and the divisor normal, so the host division
 * cannot raise divide-by-zero or hit subnormal inputs.
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/*
 * Post-condition for the hardfloat divide fast path: reject a zero
 * result, whose handling is left to softfloat.
 */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        /* ARM alternative half-precision has neither NaN nor Inf
         * encodings; both must be mapped to substitute values. */
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* `ieee' selects IEEE half-precision vs ARM alternative format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Softfloat fallback for float32->float64 (see float32_to_float64). */
static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Subnormal, Inf or NaN: take the full softfloat path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
2174 */ 2175 2176 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode, 2177 int scale, float_status *s) 2178 { 2179 switch (a.cls) { 2180 case float_class_qnan: 2181 case float_class_snan: 2182 parts_return_nan(&a, s); 2183 break; 2184 2185 case float_class_zero: 2186 case float_class_inf: 2187 /* already "integral" */ 2188 break; 2189 2190 case float_class_normal: 2191 scale = MIN(MAX(scale, -0x10000), 0x10000); 2192 a.exp += scale; 2193 2194 if (a.exp >= DECOMPOSED_BINARY_POINT) { 2195 /* already integral */ 2196 break; 2197 } 2198 if (a.exp < 0) { 2199 bool one; 2200 /* all fractional */ 2201 float_raise(float_flag_inexact, s); 2202 switch (rmode) { 2203 case float_round_nearest_even: 2204 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 2205 break; 2206 case float_round_ties_away: 2207 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 2208 break; 2209 case float_round_to_zero: 2210 one = false; 2211 break; 2212 case float_round_up: 2213 one = !a.sign; 2214 break; 2215 case float_round_down: 2216 one = a.sign; 2217 break; 2218 case float_round_to_odd: 2219 one = true; 2220 break; 2221 default: 2222 g_assert_not_reached(); 2223 } 2224 2225 if (one) { 2226 a.frac = DECOMPOSED_IMPLICIT_BIT; 2227 a.exp = 0; 2228 } else { 2229 a.cls = float_class_zero; 2230 } 2231 } else { 2232 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2233 uint64_t frac_lsbm1 = frac_lsb >> 1; 2234 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2235 uint64_t rnd_mask = rnd_even_mask >> 1; 2236 uint64_t inc; 2237 2238 switch (rmode) { 2239 case float_round_nearest_even: 2240 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2241 break; 2242 case float_round_ties_away: 2243 inc = frac_lsbm1; 2244 break; 2245 case float_round_to_zero: 2246 inc = 0; 2247 break; 2248 case float_round_up: 2249 inc = a.sign ? 0 : rnd_mask; 2250 break; 2251 case float_round_down: 2252 inc = a.sign ? 
rnd_mask : 0; 2253 break; 2254 case float_round_to_odd: 2255 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2256 break; 2257 default: 2258 g_assert_not_reached(); 2259 } 2260 2261 if (a.frac & rnd_mask) { 2262 float_raise(float_flag_inexact, s); 2263 if (uadd64_overflow(a.frac, inc, &a.frac)) { 2264 a.frac >>= 1; 2265 a.frac |= DECOMPOSED_IMPLICIT_BIT; 2266 a.exp++; 2267 } 2268 a.frac &= ~rnd_mask; 2269 } 2270 } 2271 break; 2272 default: 2273 g_assert_not_reached(); 2274 } 2275 return a; 2276 } 2277 2278 float16 float16_round_to_int(float16 a, float_status *s) 2279 { 2280 FloatParts64 pa, pr; 2281 2282 float16_unpack_canonical(&pa, a, s); 2283 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2284 return float16_round_pack_canonical(&pr, s); 2285 } 2286 2287 float32 float32_round_to_int(float32 a, float_status *s) 2288 { 2289 FloatParts64 pa, pr; 2290 2291 float32_unpack_canonical(&pa, a, s); 2292 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2293 return float32_round_pack_canonical(&pr, s); 2294 } 2295 2296 float64 float64_round_to_int(float64 a, float_status *s) 2297 { 2298 FloatParts64 pa, pr; 2299 2300 float64_unpack_canonical(&pa, a, s); 2301 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2302 return float64_round_pack_canonical(&pr, s); 2303 } 2304 2305 /* 2306 * Rounds the bfloat16 value `a' to an integer, and returns the 2307 * result as a bfloat16 value. 2308 */ 2309 2310 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2311 { 2312 FloatParts64 pa, pr; 2313 2314 bfloat16_unpack_canonical(&pa, a, s); 2315 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2316 return bfloat16_round_pack_canonical(&pr, s); 2317 } 2318 2319 /* 2320 * Returns the result of converting the floating-point value `a' to 2321 * the two's complement integer format. 
The conversion is performed 2322 * according to the IEC/IEEE Standard for Binary Floating-Point 2323 * Arithmetic---which means in particular that the conversion is 2324 * rounded according to the current rounding mode. If `a' is a NaN, 2325 * the largest positive integer is returned. Otherwise, if the 2326 * conversion overflows, the largest integer with the same sign as `a' 2327 * is returned. 2328 */ 2329 2330 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode, 2331 int scale, int64_t min, int64_t max, 2332 float_status *s) 2333 { 2334 uint64_t r; 2335 int orig_flags = get_float_exception_flags(s); 2336 FloatParts64 p = round_to_int(in, rmode, scale, s); 2337 2338 switch (p.cls) { 2339 case float_class_snan: 2340 case float_class_qnan: 2341 s->float_exception_flags = orig_flags | float_flag_invalid; 2342 return max; 2343 case float_class_inf: 2344 s->float_exception_flags = orig_flags | float_flag_invalid; 2345 return p.sign ? min : max; 2346 case float_class_zero: 2347 return 0; 2348 case float_class_normal: 2349 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2350 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2351 } else { 2352 r = UINT64_MAX; 2353 } 2354 if (p.sign) { 2355 if (r <= -(uint64_t) min) { 2356 return -r; 2357 } else { 2358 s->float_exception_flags = orig_flags | float_flag_invalid; 2359 return min; 2360 } 2361 } else { 2362 if (r <= max) { 2363 return r; 2364 } else { 2365 s->float_exception_flags = orig_flags | float_flag_invalid; 2366 return max; 2367 } 2368 } 2369 default: 2370 g_assert_not_reached(); 2371 } 2372 } 2373 2374 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2375 float_status *s) 2376 { 2377 FloatParts64 p; 2378 2379 float16_unpack_canonical(&p, a, s); 2380 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s); 2381 } 2382 2383 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2384 float_status *s) 2385 { 2386 FloatParts64 p; 2387 2388 
float16_unpack_canonical(&p, a, s); 2389 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2390 } 2391 2392 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2393 float_status *s) 2394 { 2395 FloatParts64 p; 2396 2397 float16_unpack_canonical(&p, a, s); 2398 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2399 } 2400 2401 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2402 float_status *s) 2403 { 2404 FloatParts64 p; 2405 2406 float16_unpack_canonical(&p, a, s); 2407 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2408 } 2409 2410 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2411 float_status *s) 2412 { 2413 FloatParts64 p; 2414 2415 float32_unpack_canonical(&p, a, s); 2416 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2417 } 2418 2419 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2420 float_status *s) 2421 { 2422 FloatParts64 p; 2423 2424 float32_unpack_canonical(&p, a, s); 2425 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2426 } 2427 2428 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2429 float_status *s) 2430 { 2431 FloatParts64 p; 2432 2433 float32_unpack_canonical(&p, a, s); 2434 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2435 } 2436 2437 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2438 float_status *s) 2439 { 2440 FloatParts64 p; 2441 2442 float64_unpack_canonical(&p, a, s); 2443 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2444 } 2445 2446 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2447 float_status *s) 2448 { 2449 FloatParts64 p; 2450 2451 float64_unpack_canonical(&p, a, s); 2452 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2453 } 2454 2455 int64_t 
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2456 float_status *s) 2457 { 2458 FloatParts64 p; 2459 2460 float64_unpack_canonical(&p, a, s); 2461 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2462 } 2463 2464 int8_t float16_to_int8(float16 a, float_status *s) 2465 { 2466 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2467 } 2468 2469 int16_t float16_to_int16(float16 a, float_status *s) 2470 { 2471 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2472 } 2473 2474 int32_t float16_to_int32(float16 a, float_status *s) 2475 { 2476 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2477 } 2478 2479 int64_t float16_to_int64(float16 a, float_status *s) 2480 { 2481 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2482 } 2483 2484 int16_t float32_to_int16(float32 a, float_status *s) 2485 { 2486 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2487 } 2488 2489 int32_t float32_to_int32(float32 a, float_status *s) 2490 { 2491 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2492 } 2493 2494 int64_t float32_to_int64(float32 a, float_status *s) 2495 { 2496 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2497 } 2498 2499 int16_t float64_to_int16(float64 a, float_status *s) 2500 { 2501 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2502 } 2503 2504 int32_t float64_to_int32(float64 a, float_status *s) 2505 { 2506 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2507 } 2508 2509 int64_t float64_to_int64(float64 a, float_status *s) 2510 { 2511 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2512 } 2513 2514 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2515 { 2516 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2517 } 2518 2519 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2520 { 2521 return float16_to_int32_scalbn(a, 
float_round_to_zero, 0, s); 2522 } 2523 2524 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2525 { 2526 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2527 } 2528 2529 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2530 { 2531 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2532 } 2533 2534 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2535 { 2536 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2537 } 2538 2539 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2540 { 2541 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2542 } 2543 2544 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2545 { 2546 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2547 } 2548 2549 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2550 { 2551 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2552 } 2553 2554 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2555 { 2556 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2557 } 2558 2559 /* 2560 * Returns the result of converting the floating-point value `a' to 2561 * the two's complement integer format. 
2562 */ 2563 2564 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2565 float_status *s) 2566 { 2567 FloatParts64 p; 2568 2569 bfloat16_unpack_canonical(&p, a, s); 2570 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2571 } 2572 2573 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2574 float_status *s) 2575 { 2576 FloatParts64 p; 2577 2578 bfloat16_unpack_canonical(&p, a, s); 2579 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2580 } 2581 2582 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2583 float_status *s) 2584 { 2585 FloatParts64 p; 2586 2587 bfloat16_unpack_canonical(&p, a, s); 2588 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2589 } 2590 2591 int16_t bfloat16_to_int16(bfloat16 a, float_status *s) 2592 { 2593 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2594 } 2595 2596 int32_t bfloat16_to_int32(bfloat16 a, float_status *s) 2597 { 2598 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2599 } 2600 2601 int64_t bfloat16_to_int64(bfloat16 a, float_status *s) 2602 { 2603 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2604 } 2605 2606 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s) 2607 { 2608 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2609 } 2610 2611 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s) 2612 { 2613 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2614 } 2615 2616 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s) 2617 { 2618 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2619 } 2620 2621 /* 2622 * Returns the result of converting the floating-point value `a' to 2623 * the unsigned integer format. 
The conversion is performed according 2624 * to the IEC/IEEE Standard for Binary Floating-Point 2625 * Arithmetic---which means in particular that the conversion is 2626 * rounded according to the current rounding mode. If `a' is a NaN, 2627 * the largest unsigned integer is returned. Otherwise, if the 2628 * conversion overflows, the largest unsigned integer is returned. If 2629 * the 'a' is negative, the result is rounded and zero is returned; 2630 * values that do not round to zero will raise the inexact exception 2631 * flag. 2632 */ 2633 2634 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode, 2635 int scale, uint64_t max, 2636 float_status *s) 2637 { 2638 int orig_flags = get_float_exception_flags(s); 2639 FloatParts64 p = round_to_int(in, rmode, scale, s); 2640 uint64_t r; 2641 2642 switch (p.cls) { 2643 case float_class_snan: 2644 case float_class_qnan: 2645 s->float_exception_flags = orig_flags | float_flag_invalid; 2646 return max; 2647 case float_class_inf: 2648 s->float_exception_flags = orig_flags | float_flag_invalid; 2649 return p.sign ? 0 : max; 2650 case float_class_zero: 2651 return 0; 2652 case float_class_normal: 2653 if (p.sign) { 2654 s->float_exception_flags = orig_flags | float_flag_invalid; 2655 return 0; 2656 } 2657 2658 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2659 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2660 } else { 2661 s->float_exception_flags = orig_flags | float_flag_invalid; 2662 return max; 2663 } 2664 2665 /* For uint64 this will never trip, but if p.exp is too large 2666 * to shift a decomposed fraction we shall have exited via the 2667 * 3rd leg above. 
2668 */ 2669 if (r > max) { 2670 s->float_exception_flags = orig_flags | float_flag_invalid; 2671 return max; 2672 } 2673 return r; 2674 default: 2675 g_assert_not_reached(); 2676 } 2677 } 2678 2679 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2680 float_status *s) 2681 { 2682 FloatParts64 p; 2683 2684 float16_unpack_canonical(&p, a, s); 2685 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s); 2686 } 2687 2688 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2689 float_status *s) 2690 { 2691 FloatParts64 p; 2692 2693 float16_unpack_canonical(&p, a, s); 2694 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2695 } 2696 2697 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2698 float_status *s) 2699 { 2700 FloatParts64 p; 2701 2702 float16_unpack_canonical(&p, a, s); 2703 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2704 } 2705 2706 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2707 float_status *s) 2708 { 2709 FloatParts64 p; 2710 2711 float16_unpack_canonical(&p, a, s); 2712 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2713 } 2714 2715 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2716 float_status *s) 2717 { 2718 FloatParts64 p; 2719 2720 float32_unpack_canonical(&p, a, s); 2721 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2722 } 2723 2724 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2725 float_status *s) 2726 { 2727 FloatParts64 p; 2728 2729 float32_unpack_canonical(&p, a, s); 2730 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2731 } 2732 2733 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2734 float_status *s) 2735 { 2736 FloatParts64 p; 2737 2738 float32_unpack_canonical(&p, a, s); 2739 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2740 } 
uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* Conversions using the rounding mode currently set in *s. */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Conversions that always truncate toward zero, regardless of the
 * rounding mode in *s. */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format. The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Work on the magnitude in an unsigned type; -f on the unsigned
         * copy is well defined even for INT64_MIN. */
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            f = -f;
            r.sign = true;
        }
        /* Normalise: put the most significant set bit at the
         * decomposed implicit-bit position. */
        shift = clz64(f);
        /* Clamp scale so that exp cannot overflow an int. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format. The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    /* Unsigned input: the result sign is always positive. */
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        /* Clamp scale so that exp cannot overflow an int. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        /* Normalise: put the most significant set bit at the
         * decomposed implicit-bit position. */
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}

float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return *parts_pick_nan(&a, &b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                /* IEEE minNum/maxNum: a quiet NaN loses to a number. */
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Non-IEEE min/max, or both operands NaN: target-specific pick. */
        return *parts_pick_nan(&a, &b, s);
    } else {
        int a_exp, b_exp;

        /* Map the class onto a comparable exponent: infinities compare
         * above every normal, zeros below. */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* Magnitude compare ignores the signs when the magnitudes
         * actually differ; equal magnitudes fall through to the
         * signed comparison below. */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            /* Same sign: compare magnitudes; for negatives the order
             * is reversed, hence the xor with a.sign. */
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Different signs: the sign alone decides; this also gives
             * the required -0 < +0 ordering. */
            return a.sign ^ ismin ? b : a;
        }
    }
}

/* Instantiate min/max/minnum/maxnum/minnummag/maxnummag for each of the
 * 64-bit-parts float widths.  `isiee' selects the IEEE 754-2008
 * minNum/maxNum quiet-NaN handling in minmax_floats(). */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

/* Same instantiation for bfloat16, which has no size suffix to paste. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX

/* Floating point compare
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    /* Any NaN operand makes the comparison unordered.  A quiet compare
     * raises invalid only for signaling NaNs; a signaling compare
     * raises it for any NaN. */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both normal, different signs: the positive one is greater. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Both normal, same sign: compare magnitudes, with the ordering
     * reversed for negative values. */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Softfloat comparison entry points; `attr' selects the inlining
 * attribute appropriate for each width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: use the host FPU's quiet comparison macros for
 * ordered operands; anything unordered falls back to softfloat so the
 * proper flags get raised. */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
 */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    /* NaN in, NaN out (possibly quietened, with invalid raised for sNaN
     * inside parts_return_nan); zeros and infinities pass through
     * unchanged because only normals take the branch below. */
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    /* Any other negative input (including -inf) is invalid. */
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result.  Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above.  If there is any remaining
     * fraction, the result is inexact.  Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast path: defer to the host sqrtf() for non-negative
 * zero/normal inputs; everything else (NaN, inf, denormal, negative)
 * takes the softfloat path so flags are raised correctly. */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* The parts fraction is left-aligned; shift back to raw position. */
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3844 *----------------------------------------------------------------------------*/ 3845 3846 float16 float16_silence_nan(float16 a, float_status *status) 3847 { 3848 FloatParts64 p; 3849 3850 float16_unpack_raw(&p, a); 3851 p.frac <<= float16_params.frac_shift; 3852 parts_silence_nan(&p, status); 3853 p.frac >>= float16_params.frac_shift; 3854 return float16_pack_raw(&p); 3855 } 3856 3857 float32 float32_silence_nan(float32 a, float_status *status) 3858 { 3859 FloatParts64 p; 3860 3861 float32_unpack_raw(&p, a); 3862 p.frac <<= float32_params.frac_shift; 3863 parts_silence_nan(&p, status); 3864 p.frac >>= float32_params.frac_shift; 3865 return float32_pack_raw(&p); 3866 } 3867 3868 float64 float64_silence_nan(float64 a, float_status *status) 3869 { 3870 FloatParts64 p; 3871 3872 float64_unpack_raw(&p, a); 3873 p.frac <<= float64_params.frac_shift; 3874 parts_silence_nan(&p, status); 3875 p.frac >>= float64_params.frac_shift; 3876 return float64_pack_raw(&p); 3877 } 3878 3879 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3880 { 3881 FloatParts64 p; 3882 3883 bfloat16_unpack_raw(&p, a); 3884 p.frac <<= bfloat16_params.frac_shift; 3885 parts_silence_nan(&p, status); 3886 p.frac >>= bfloat16_params.frac_shift; 3887 return bfloat16_pack_raw(&p); 3888 } 3889 3890 float128 float128_silence_nan(float128 a, float_status *status) 3891 { 3892 FloatParts128 p; 3893 3894 float128_unpack_raw(&p, a); 3895 frac_shl(&p, float128_params.frac_shift); 3896 parts_silence_nan(&p, status); 3897 frac_shr(&p, float128_params.frac_shift); 3898 return float128_pack_raw(&p); 3899 } 3900 3901 /*---------------------------------------------------------------------------- 3902 | If `a' is denormal and we are in flush-to-zero mode then set the 3903 | input-denormal exception and return zero. Otherwise just return the value. 
3904 *----------------------------------------------------------------------------*/ 3905 3906 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3907 { 3908 if (p.exp == 0 && p.frac != 0) { 3909 float_raise(float_flag_input_denormal, status); 3910 return true; 3911 } 3912 3913 return false; 3914 } 3915 3916 float16 float16_squash_input_denormal(float16 a, float_status *status) 3917 { 3918 if (status->flush_inputs_to_zero) { 3919 FloatParts64 p; 3920 3921 float16_unpack_raw(&p, a); 3922 if (parts_squash_denormal(p, status)) { 3923 return float16_set_sign(float16_zero, p.sign); 3924 } 3925 } 3926 return a; 3927 } 3928 3929 float32 float32_squash_input_denormal(float32 a, float_status *status) 3930 { 3931 if (status->flush_inputs_to_zero) { 3932 FloatParts64 p; 3933 3934 float32_unpack_raw(&p, a); 3935 if (parts_squash_denormal(p, status)) { 3936 return float32_set_sign(float32_zero, p.sign); 3937 } 3938 } 3939 return a; 3940 } 3941 3942 float64 float64_squash_input_denormal(float64 a, float_status *status) 3943 { 3944 if (status->flush_inputs_to_zero) { 3945 FloatParts64 p; 3946 3947 float64_unpack_raw(&p, a); 3948 if (parts_squash_denormal(p, status)) { 3949 return float64_set_sign(float64_zero, p.sign); 3950 } 3951 } 3952 return a; 3953 } 3954 3955 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3956 { 3957 if (status->flush_inputs_to_zero) { 3958 FloatParts64 p; 3959 3960 bfloat16_unpack_raw(&p, a); 3961 if (parts_squash_denormal(p, status)) { 3962 return bfloat16_set_sign(bfloat16_zero, p.sign); 3963 } 3964 } 3965 return a; 3966 } 3967 3968 /*---------------------------------------------------------------------------- 3969 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3970 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3971 | input. If `zSign' is 1, the input is negated before being converted to an 3972 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input 3973 | is simply rounded to an integer, with the inexact exception raised if the 3974 | input cannot be represented exactly as an integer. However, if the fixed- 3975 | point input is too large, the invalid exception is raised and the largest 3976 | positive or negative integer is returned. 3977 *----------------------------------------------------------------------------*/ 3978 3979 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ, 3980 float_status *status) 3981 { 3982 int8_t roundingMode; 3983 bool roundNearestEven; 3984 int8_t roundIncrement, roundBits; 3985 int32_t z; 3986 3987 roundingMode = status->float_rounding_mode; 3988 roundNearestEven = ( roundingMode == float_round_nearest_even ); 3989 switch (roundingMode) { 3990 case float_round_nearest_even: 3991 case float_round_ties_away: 3992 roundIncrement = 0x40; 3993 break; 3994 case float_round_to_zero: 3995 roundIncrement = 0; 3996 break; 3997 case float_round_up: 3998 roundIncrement = zSign ? 0 : 0x7f; 3999 break; 4000 case float_round_down: 4001 roundIncrement = zSign ? 0x7f : 0; 4002 break; 4003 case float_round_to_odd: 4004 roundIncrement = absZ & 0x80 ? 0 : 0x7f; 4005 break; 4006 default: 4007 abort(); 4008 } 4009 roundBits = absZ & 0x7F; 4010 absZ = ( absZ + roundIncrement )>>7; 4011 if (!(roundBits ^ 0x40) && roundNearestEven) { 4012 absZ &= ~1; 4013 } 4014 z = absZ; 4015 if ( zSign ) z = - z; 4016 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 4017 float_raise(float_flag_invalid, status); 4018 return zSign ? 
INT32_MIN : INT32_MAX; 4019 } 4020 if (roundBits) { 4021 float_raise(float_flag_inexact, status); 4022 } 4023 return z; 4024 4025 } 4026 4027 /*---------------------------------------------------------------------------- 4028 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4029 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4030 | and returns the properly rounded 64-bit integer corresponding to the input. 4031 | If `zSign' is 1, the input is negated before being converted to an integer. 4032 | Ordinarily, the fixed-point input is simply rounded to an integer, with 4033 | the inexact exception raised if the input cannot be represented exactly as 4034 | an integer. However, if the fixed-point input is too large, the invalid 4035 | exception is raised and the largest positive or negative integer is 4036 | returned. 4037 *----------------------------------------------------------------------------*/ 4038 4039 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1, 4040 float_status *status) 4041 { 4042 int8_t roundingMode; 4043 bool roundNearestEven, increment; 4044 int64_t z; 4045 4046 roundingMode = status->float_rounding_mode; 4047 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4048 switch (roundingMode) { 4049 case float_round_nearest_even: 4050 case float_round_ties_away: 4051 increment = ((int64_t) absZ1 < 0); 4052 break; 4053 case float_round_to_zero: 4054 increment = 0; 4055 break; 4056 case float_round_up: 4057 increment = !zSign && absZ1; 4058 break; 4059 case float_round_down: 4060 increment = zSign && absZ1; 4061 break; 4062 case float_round_to_odd: 4063 increment = !(absZ0 & 1) && absZ1; 4064 break; 4065 default: 4066 abort(); 4067 } 4068 if ( increment ) { 4069 ++absZ0; 4070 if ( absZ0 == 0 ) goto overflow; 4071 if (!(absZ1 << 1) && roundNearestEven) { 4072 absZ0 &= ~1; 4073 } 4074 } 4075 z = absZ0; 4076 if ( zSign ) z = - z; 4077 if ( z && ( ( z < 0 ) ^ zSign 
) ) { 4078 overflow: 4079 float_raise(float_flag_invalid, status); 4080 return zSign ? INT64_MIN : INT64_MAX; 4081 } 4082 if (absZ1) { 4083 float_raise(float_flag_inexact, status); 4084 } 4085 return z; 4086 4087 } 4088 4089 /*---------------------------------------------------------------------------- 4090 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 4091 | `absZ1', with binary point between bits 63 and 64 (between the input words), 4092 | and returns the properly rounded 64-bit unsigned integer corresponding to the 4093 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 4094 | with the inexact exception raised if the input cannot be represented exactly 4095 | as an integer. However, if the fixed-point input is too large, the invalid 4096 | exception is raised and the largest unsigned integer is returned. 4097 *----------------------------------------------------------------------------*/ 4098 4099 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0, 4100 uint64_t absZ1, float_status *status) 4101 { 4102 int8_t roundingMode; 4103 bool roundNearestEven, increment; 4104 4105 roundingMode = status->float_rounding_mode; 4106 roundNearestEven = (roundingMode == float_round_nearest_even); 4107 switch (roundingMode) { 4108 case float_round_nearest_even: 4109 case float_round_ties_away: 4110 increment = ((int64_t)absZ1 < 0); 4111 break; 4112 case float_round_to_zero: 4113 increment = 0; 4114 break; 4115 case float_round_up: 4116 increment = !zSign && absZ1; 4117 break; 4118 case float_round_down: 4119 increment = zSign && absZ1; 4120 break; 4121 case float_round_to_odd: 4122 increment = !(absZ0 & 1) && absZ1; 4123 break; 4124 default: 4125 abort(); 4126 } 4127 if (increment) { 4128 ++absZ0; 4129 if (absZ0 == 0) { 4130 float_raise(float_flag_invalid, status); 4131 return UINT64_MAX; 4132 } 4133 if (!(absZ1 << 1) && roundNearestEven) { 4134 absZ0 &= ~1; 4135 } 4136 } 4137 4138 if (zSign && absZ0) { 4139 
float_raise(float_flag_invalid, status); 4140 return 0; 4141 } 4142 4143 if (absZ1) { 4144 float_raise(float_flag_inexact, status); 4145 } 4146 return absZ0; 4147 } 4148 4149 /*---------------------------------------------------------------------------- 4150 | Normalizes the subnormal single-precision floating-point value represented 4151 | by the denormalized significand `aSig'. The normalized exponent and 4152 | significand are stored at the locations pointed to by `zExpPtr' and 4153 | `zSigPtr', respectively. 4154 *----------------------------------------------------------------------------*/ 4155 4156 static void 4157 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 4158 { 4159 int8_t shiftCount; 4160 4161 shiftCount = clz32(aSig) - 8; 4162 *zSigPtr = aSig<<shiftCount; 4163 *zExpPtr = 1 - shiftCount; 4164 4165 } 4166 4167 /*---------------------------------------------------------------------------- 4168 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4169 | and significand `zSig', and returns the proper single-precision floating- 4170 | point value corresponding to the abstract input. Ordinarily, the abstract 4171 | value is simply rounded and packed into the single-precision format, with 4172 | the inexact exception raised if the abstract input cannot be represented 4173 | exactly. However, if the abstract value is too large, the overflow and 4174 | inexact exceptions are raised and an infinity or maximal finite value is 4175 | returned. If the abstract value is too small, the input value is rounded to 4176 | a subnormal number, and the underflow and inexact exceptions are raised if 4177 | the abstract input cannot be represented exactly as a subnormal single- 4178 | precision floating-point number. 4179 | The input significand `zSig' has its binary point between bits 30 4180 | and 29, which is 7 bits to the left of the usual location. This shifted 4181 | significand must be normalized or smaller. 
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    /* Increment added below bit 7 before truncating the extra bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned cast also catches negative zExp here, routing both
     * overflow and underflow through this outer test. */
    if (0xFD <= (uint16_t) zExp) {
        if ((0xFD < zExp)
            || ((zExp == 0xFD)
                && ((int32_t) (zSig + roundIncrement) < 0))) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -!overflow_to_inf is 0 (yields infinity) or -1 (all ones,
             * which packs to the largest finite number). */
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if (zExp < 0) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift32RightJamming(zSig, -zExp, &zSig);
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = (zSig + roundIncrement) >> 7;
    /* Nearest-even: on an exact tie, clear the low result bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if (zSig == 0) {
        zExp = 0;
    }
    return packFloat32(zSign, zExp, zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Shift so the MSB of zSig lands in bit 30 (bit 31 stays clear). */
    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* Move the MSB of aSig up to the float64 hidden-bit position (bit 52). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    /* Exponent is adjusted down by the amount of normalization performed. */
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result.  After being
| shifted into the proper positions, the three fields are simply added
| together to form the result.  This means that any integer portion of `zSig'
| will be added into the exponent.  Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    /*
     * Plain addition (not OR) is deliberate: any integer portion of zSig
     * carries into the exponent field, as described in the comment above.
     */
    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * roundIncrement is added below the 10 guard bits before truncation:
     * 0x200 rounds to nearest, 0x3ff rounds away from zero (by sign for
     * directed modes), 0 truncates.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round up only when the kept LSB (bit 10) is currently even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * Unsigned compare folds overflow (zExp >= 0x7FD) and underflow
     * (zExp < 0, which wraps to a large uint16_t) into a single test.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * -(!overflow_to_inf) is 0 (-> infinity) or all-ones
             * (packs as the largest finite number).
             */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess detected before or after rounding, per status. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            /* Denormalize, jamming shifted-out bits into the sticky bit. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie (roundBits == 0x200) in nearest-even: force an even LSB. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Shift so the MSB of zSig lands in bit 62 (bit 63 stays clear). */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /*
     * floatx80 has an explicit integer bit, so normalization shifts the
     * MSB of aSig all the way up to bit 63.
     */
    shiftCount = clz64(aSig);
    *zSigPtr = aSig<<shiftCount;
    /* Exponent is adjusted down by the amount of normalization performed. */
    *zExpPtr = 1 - shiftCount;
}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
| The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced-precision path: round within the 64-bit zSig0 at the bit
     * position matching double (64) or single (32) precision.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold zSig1 into the sticky bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Single unsigned compare catches zExp <= 0 and zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tininess detected before or after rounding, per status. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* Rounding may have carried into the integer bit (bit 63). */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to also clear the kept LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry out of bit 63: renormalize to 1.0 x 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full-precision path: zSig1 holds the bits below the representable
     * significand; decide whether they round zSig0 up.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* roundMask = 0 so ~roundMask below is the maximal significand. */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                /* Directed away from infinity: largest finite number. */
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /* Tininess detected before or after rounding, per status. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute increment: zSig1 changed during denormalization. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: force an even LSB. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                /* Carry into the integer bit promotes out of subnormal. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high word is empty, promote the low word by 64 bits. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4716 *----------------------------------------------------------------------------*/ 4717 4718 static inline uint64_t extractFloat128Frac0( float128 a ) 4719 { 4720 4721 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4722 4723 } 4724 4725 /*---------------------------------------------------------------------------- 4726 | Returns the exponent bits of the quadruple-precision floating-point value 4727 | `a'. 4728 *----------------------------------------------------------------------------*/ 4729 4730 static inline int32_t extractFloat128Exp( float128 a ) 4731 { 4732 4733 return ( a.high>>48 ) & 0x7FFF; 4734 4735 } 4736 4737 /*---------------------------------------------------------------------------- 4738 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4739 *----------------------------------------------------------------------------*/ 4740 4741 static inline bool extractFloat128Sign(float128 a) 4742 { 4743 return a.high >> 63; 4744 } 4745 4746 /*---------------------------------------------------------------------------- 4747 | Normalizes the subnormal quadruple-precision floating-point value 4748 | represented by the denormalized significand formed by the concatenation of 4749 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4750 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4751 | significand are stored at the location pointed to by `zSig0Ptr', and the 4752 | least significant 64 bits of the normalized significand are stored at the 4753 | location pointed to by `zSig1Ptr'. 
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /*
         * All significand bits live in aSig1; move its MSB up to the
         * float128 hidden-bit position (bit 48 of the high word).
         */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* MSB is too high for the high word alone: split across both. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        /* Account for the implicit 64-bit promotion of aSig1 into aSig0. */
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
4798 *----------------------------------------------------------------------------*/ 4799 4800 static inline float128 4801 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4802 { 4803 float128 z; 4804 4805 z.low = zSig1; 4806 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4807 return z; 4808 } 4809 4810 /*---------------------------------------------------------------------------- 4811 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4812 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4813 | and `zSig2', and returns the proper quadruple-precision floating-point value 4814 | corresponding to the abstract input. Ordinarily, the abstract value is 4815 | simply rounded and packed into the quadruple-precision format, with the 4816 | inexact exception raised if the abstract input cannot be represented 4817 | exactly. However, if the abstract value is too large, the overflow and 4818 | inexact exceptions are raised and an infinity or maximal finite value is 4819 | returned. If the abstract value is too small, the input value is rounded to 4820 | a subnormal number, and the underflow and inexact exceptions are raised if 4821 | the abstract input cannot be represented exactly as a subnormal quadruple- 4822 | precision floating-point number. 4823 | The input significand must be normalized or smaller. If the input 4824 | significand is not normalized, `zExp' must be 0; in that case, the result 4825 | returned is a subnormal number, and it must not require rounding. In the 4826 | usual case that the input significand is normalized, `zExp' must be 1 less 4827 | than the ``true'' floating-point exponent. The handling of underflow and 4828 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the representable significand; decide
     * whether they round the zSig0:zSig1 pair up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round up only when the kept LSB is currently even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare folds overflow (zExp >= 0x7FFD) and underflow
     * (zExp < 0, wrapping to a large uint32_t) into one test. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                /* Modes directed away from infinity: largest finite value. */
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tininess detected before or after rounding, per status. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            /* Denormalize, jamming shifted-out bits into zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute increment: zSig1 and zSig2 just changed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (only zSig2's MSB set) in nearest-even: even LSB. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
*----------------------------------------------------------------------------*/

static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;
    uint64_t zSig2;

    /* If the high word is empty, promote the low word by 64 bits. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Distance of the MSB from the float128 hidden-bit position (bit 48). */
    shiftCount = clz64(zSig0) - 15;
    if ( 0 <= shiftCount ) {
        zSig2 = 0;
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    }
    else {
        /* Too many bits: shift right, jamming lost bits into zSig2. */
        shift128ExtraRightJamming(
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
    }
    zExp -= shiftCount;
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}


/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int32_to_floatx80(int32_t a, float_status *status)
{
    bool zSign;
    uint32_t absA;
    int8_t shiftCount;
    uint64_t zSig;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ?
- a : a;
    /* Place the MSB of absA at bit 63 of the 64-bit significand. */
    shiftCount = clz32(absA) + 32;
    zSig = absA;
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 32-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 int32_to_float128(int32_t a, float_status *status)
{
    bool zSign;
    uint32_t absA;
    int8_t shiftCount;
    uint64_t zSig0;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    /* NOTE(review): `- a' is signed negation; for a == INT32_MIN this
     * relies on wrapping semantics (e.g. -fwrapv) -- confirm build flags. */
    absA = zSign ? - a : a;
    /* Place the MSB of absA at the float128 hidden-bit position (bit 48). */
    shiftCount = clz32(absA) + 17;
    zSig0 = absA;
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a'
| to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 int64_to_floatx80(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ?
- a : a;
    /* Normalize so the MSB of absA lands in bit 63. */
    shiftCount = clz64(absA);
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit two's complement integer `a' to
| the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 int64_to_float128(int64_t a, float_status *status)
{
    bool zSign;
    uint64_t absA;
    int8_t shiftCount;
    int32_t zExp;
    uint64_t zSig0, zSig1;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    absA = zSign ? - a : a;
    /* Distance needed to bring the MSB of absA to the 128-bit significand's
     * hidden-bit position (bit 112 of the concatenation). */
    shiftCount = clz64(absA) + 49;
    zExp = 0x406E - shiftCount;
    if ( 64 <= shiftCount ) {
        /* Value fits entirely in the high word after shifting. */
        zSig1 = 0;
        zSig0 = absA;
        shiftCount -= 64;
    }
    else {
        zSig1 = absA;
        zSig0 = 0;
    }
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit unsigned integer `a'
| to the quadruple-precision floating-point format.  The conversion is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* Let the normalize-and-round helper position the significand. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: convert through the common NaN representation. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the hidden bit explicit and widen to the x80 significand. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN: convert through the common NaN representation. */
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        /* Compensate: the significand below is packed with its integer bit
         * landing in the exponent field, so the exponent bias differs by 1. */
        --aExp;
    }
    /* Shift the 24-bit significand up so its fraction fills the top of the
     * 48-bit high fraction field (low word remains zero). */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* a is Inf or NaN: NaNs propagate; Inf rem anything is invalid. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* x rem Inf == x for finite x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* x rem 0 is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 64/32 division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: remainder is a itself. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce 62 quotient bits per
           iteration using the 128/64 division estimate. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Subtract 2 so the estimate is never too large. */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step past the exact quotient, then pick whichever of the two
       candidate remainders gives round-to-nearest(-even on ties). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients 1/n! for n = 1..15, as float64 constants. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /* 1 */
    const_float64( 0x3fe0000000000000ll ), /* 2 */
    const_float64( 0x3fc5555555555555ll ), /* 3 */
    const_float64( 0x3fa5555555555555ll ), /* 4 */
    const_float64( 0x3f81111111111111ll ), /* 5 */
    const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
    const_float64( 0x3f2a01a01a01a01all ), /* 7 */
    const_float64( 0x3efa01a01a01a01all ), /* 8 */
    const_float64( 0x3ec71de3a556c734ll ), /* 9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = +0, 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) is exactly 1. */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* Evaluate 2^a = e^(a*ln2) via the Taylor series in double precision,
       then round once back to single precision. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative number is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part of the log is the unbiased exponent; extract fraction
       bits one at a time by repeated squaring of the significand. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert the payload, then quiet it for the x80 format. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the integer bit explicit, rebias 0x3FF -> 0x3FFF, and
       left-align the 53-bit significand in 64 bits. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        /* normalize leaves the integer bit set; float128 uses a hidden bit,
           so compensate the exponent by one. */
        --aExp;
    }
    /* Align the 52-bit fraction to the 112-bit quad fraction (shift right
       by 4) and rebias 0x3FF -> 0x3FFF. */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}


/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5472 *----------------------------------------------------------------------------*/ 5473 5474 float64 float64_rem(float64 a, float64 b, float_status *status) 5475 { 5476 bool aSign, zSign; 5477 int aExp, bExp, expDiff; 5478 uint64_t aSig, bSig; 5479 uint64_t q, alternateASig; 5480 int64_t sigMean; 5481 5482 a = float64_squash_input_denormal(a, status); 5483 b = float64_squash_input_denormal(b, status); 5484 aSig = extractFloat64Frac( a ); 5485 aExp = extractFloat64Exp( a ); 5486 aSign = extractFloat64Sign( a ); 5487 bSig = extractFloat64Frac( b ); 5488 bExp = extractFloat64Exp( b ); 5489 if ( aExp == 0x7FF ) { 5490 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5491 return propagateFloat64NaN(a, b, status); 5492 } 5493 float_raise(float_flag_invalid, status); 5494 return float64_default_nan(status); 5495 } 5496 if ( bExp == 0x7FF ) { 5497 if (bSig) { 5498 return propagateFloat64NaN(a, b, status); 5499 } 5500 return a; 5501 } 5502 if ( bExp == 0 ) { 5503 if ( bSig == 0 ) { 5504 float_raise(float_flag_invalid, status); 5505 return float64_default_nan(status); 5506 } 5507 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5508 } 5509 if ( aExp == 0 ) { 5510 if ( aSig == 0 ) return a; 5511 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5512 } 5513 expDiff = aExp - bExp; 5514 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5515 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5516 if ( expDiff < 0 ) { 5517 if ( expDiff < -1 ) return a; 5518 aSig >>= 1; 5519 } 5520 q = ( bSig <= aSig ); 5521 if ( q ) aSig -= bSig; 5522 expDiff -= 64; 5523 while ( 0 < expDiff ) { 5524 q = estimateDiv128To64( aSig, 0, bSig ); 5525 q = ( 2 < q ) ? q - 2 : 0; 5526 aSig = - ( ( bSig>>2 ) * q ); 5527 expDiff -= 62; 5528 } 5529 expDiff += 64; 5530 if ( 0 < expDiff ) { 5531 q = estimateDiv128To64( aSig, 0, bSig ); 5532 q = ( 2 < q ) ? 
q - 2 : 0; 5533 q >>= 64 - expDiff; 5534 bSig >>= 2; 5535 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5536 } 5537 else { 5538 aSig >>= 2; 5539 bSig >>= 2; 5540 } 5541 do { 5542 alternateASig = aSig; 5543 ++q; 5544 aSig -= bSig; 5545 } while ( 0 <= (int64_t) aSig ); 5546 sigMean = aSig + alternateASig; 5547 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5548 aSig = alternateASig; 5549 } 5550 zSign = ( (int64_t) aSig < 0 ); 5551 if ( zSign ) aSig = - aSig; 5552 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5553 5554 } 5555 5556 /*---------------------------------------------------------------------------- 5557 | Returns the binary log of the double-precision floating-point value `a'. 5558 | The operation is performed according to the IEC/IEEE Standard for Binary 5559 | Floating-Point Arithmetic. 5560 *----------------------------------------------------------------------------*/ 5561 float64 float64_log2(float64 a, float_status *status) 5562 { 5563 bool aSign, zSign; 5564 int aExp; 5565 uint64_t aSig, aSig0, aSig1, zSig, i; 5566 a = float64_squash_input_denormal(a, status); 5567 5568 aSig = extractFloat64Frac( a ); 5569 aExp = extractFloat64Exp( a ); 5570 aSign = extractFloat64Sign( a ); 5571 5572 if ( aExp == 0 ) { 5573 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5574 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5575 } 5576 if ( aSign ) { 5577 float_raise(float_flag_invalid, status); 5578 return float64_default_nan(status); 5579 } 5580 if ( aExp == 0x7FF ) { 5581 if (aSig) { 5582 return propagateFloat64NaN(a, float64_zero, status); 5583 } 5584 return a; 5585 } 5586 5587 aExp -= 0x3FF; 5588 aSig |= UINT64_C(0x0010000000000000); 5589 zSign = aExp < 0; 5590 zSig = (uint64_t)aExp << 52; 5591 for (i = 1LL << 51; i > 0; i >>= 1) { 5592 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5593 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5594 if ( aSig & UINT64_C(0x0020000000000000) ) { 5595 aSig >>= 1; 5596 zSig |= i; 5597 } 
5598 } 5599 5600 if ( zSign ) 5601 zSig = -zSig; 5602 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5603 } 5604 5605 /*---------------------------------------------------------------------------- 5606 | Returns the result of converting the extended double-precision floating- 5607 | point value `a' to the 32-bit two's complement integer format. The 5608 | conversion is performed according to the IEC/IEEE Standard for Binary 5609 | Floating-Point Arithmetic---which means in particular that the conversion 5610 | is rounded according to the current rounding mode. If `a' is a NaN, the 5611 | largest positive integer is returned. Otherwise, if the conversion 5612 | overflows, the largest integer with the same sign as `a' is returned. 5613 *----------------------------------------------------------------------------*/ 5614 5615 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5616 { 5617 bool aSign; 5618 int32_t aExp, shiftCount; 5619 uint64_t aSig; 5620 5621 if (floatx80_invalid_encoding(a)) { 5622 float_raise(float_flag_invalid, status); 5623 return 1 << 31; 5624 } 5625 aSig = extractFloatx80Frac( a ); 5626 aExp = extractFloatx80Exp( a ); 5627 aSign = extractFloatx80Sign( a ); 5628 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5629 shiftCount = 0x4037 - aExp; 5630 if ( shiftCount <= 0 ) shiftCount = 1; 5631 shift64RightJamming( aSig, shiftCount, &aSig ); 5632 return roundAndPackInt32(aSign, aSig, status); 5633 5634 } 5635 5636 /*---------------------------------------------------------------------------- 5637 | Returns the result of converting the extended double-precision floating- 5638 | point value `a' to the 32-bit two's complement integer format. The 5639 | conversion is performed according to the IEC/IEEE Standard for Binary 5640 | Floating-Point Arithmetic, except that the conversion is always rounded 5641 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5642 | Otherwise, if the conversion overflows, the largest integer with the same 5643 | sign as `a' is returned. 5644 *----------------------------------------------------------------------------*/ 5645 5646 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5647 { 5648 bool aSign; 5649 int32_t aExp, shiftCount; 5650 uint64_t aSig, savedASig; 5651 int32_t z; 5652 5653 if (floatx80_invalid_encoding(a)) { 5654 float_raise(float_flag_invalid, status); 5655 return 1 << 31; 5656 } 5657 aSig = extractFloatx80Frac( a ); 5658 aExp = extractFloatx80Exp( a ); 5659 aSign = extractFloatx80Sign( a ); 5660 if ( 0x401E < aExp ) { 5661 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5662 goto invalid; 5663 } 5664 else if ( aExp < 0x3FFF ) { 5665 if (aExp || aSig) { 5666 float_raise(float_flag_inexact, status); 5667 } 5668 return 0; 5669 } 5670 shiftCount = 0x403E - aExp; 5671 savedASig = aSig; 5672 aSig >>= shiftCount; 5673 z = aSig; 5674 if ( aSign ) z = - z; 5675 if ( ( z < 0 ) ^ aSign ) { 5676 invalid: 5677 float_raise(float_flag_invalid, status); 5678 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5679 } 5680 if ( ( aSig<<shiftCount ) != savedASig ) { 5681 float_raise(float_flag_inexact, status); 5682 } 5683 return z; 5684 5685 } 5686 5687 /*---------------------------------------------------------------------------- 5688 | Returns the result of converting the extended double-precision floating- 5689 | point value `a' to the 64-bit two's complement integer format. The 5690 | conversion is performed according to the IEC/IEEE Standard for Binary 5691 | Floating-Point Arithmetic---which means in particular that the conversion 5692 | is rounded according to the current rounding mode. If `a' is a NaN, 5693 | the largest positive integer is returned. Otherwise, if the conversion 5694 | overflows, the largest integer with the same sign as `a' is returned. 
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): converting 1ULL << 63 to int64_t is
           implementation-defined; yields INT64_MIN on the two's-complement
           targets QEMU supports. */
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        if ( shiftCount ) {
            /* Magnitude >= 2^64: overflow (or NaN). */
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        aSigExtra = 0;
    }
    else {
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): converting 1ULL << 63 to int64_t is
           implementation-defined; yields INT64_MIN on the two's-complement
           targets QEMU supports. */
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* Magnitude >= 2^63: only -2^63 itself (0xC03E exponent, integer
           bit only) converts without overflow. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to 0; inexact unless exactly zero. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    /* Any bits shifted out make the truncated result inexact. */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* aSig<<1 discards the explicit integer bit: nonzero means NaN. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Jam the low 33 bits into a sticky bit and let roundAndPack handle
       rounding, overflow and underflow; 0x3F81 rebias includes the hidden
       integer bit expected by roundAndPackFloat32. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        /* aSig<<1 discards the explicit integer bit: nonzero means NaN. */
        if ( (uint64_t) ( aSig<<1 ) ) {
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Jam the lowest bit as sticky and let roundAndPack handle rounding,
       overflow and underflow; 0x3C01 rebias includes the hidden integer
       bit expected by roundAndPackFloat64. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Widening conversion is exact: drop the explicit integer bit
       (aSig<<1) and right-align the 63 fraction bits into the 112-bit
       quad fraction. */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    /* Re-round the unpacked value at the currently selected x80 rounding
       precision (32, 64, or 80 bits). */
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended quadruple-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    floatx80 z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Magnitude >= 2^63 (or Inf/NaN): already integral. */
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }
    if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: result is 0 or +/-1 depending on the mode. */
        if ( ( aExp == 0 )
             && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
            return a;
        }
        float_raise(float_flag_inexact, status);
        aSign = extractFloatx80Sign( a );
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            /* Round to +/-1 only for magnitude strictly above 1/2. */
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_ties_away:
            /* Magnitude >= 1/2 rounds away from zero. */
            if (aExp == 0x3FFE) {
                return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
            }
            break;
        case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
                : packFloatx80( 0, 0, 0 );
        case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));

        case float_round_to_zero:
            break;
        default:
            g_assert_not_reached();
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: mask off the fraction bits below the integer step. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        z.low += lastBitMask>>1;
        /* Exactly halfway: clear the last bit to round to even. */
        if ((z.low & roundBitsMask) == 0) {
            z.low &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z.low += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    case float_round_down:
        if (extractFloatx80Sign(z)) {
            z.low += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z.low &= ~ roundBitsMask;
    /* Significand carried out (wrapped to 0): renormalize by bumping the
       exponent and restoring the integer bit. */
    if ( z.low == 0 ) {
        ++z.high;
        z.low = UINT64_C(0x8000000000000000);
    }
    if (z.low != a.low) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the extended double-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
| negated before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* Subnormal b uses the same effective exponent as exponent 1. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit set means no carry out; the sum is already normalized. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the addition: shift right one and restore the
       explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only in round-down mode. */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    /* Subnormal a uses the same effective exponent as exponent 1. */
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* |b| > |a|: the difference takes the opposite sign. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*---------------------------------------------------------------------------- 6145 | Returns the result of adding the extended double-precision floating-point 6146 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6147 | Standard for Binary Floating-Point Arithmetic. 6148 *----------------------------------------------------------------------------*/ 6149 6150 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6151 { 6152 bool aSign, bSign; 6153 6154 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6155 float_raise(float_flag_invalid, status); 6156 return floatx80_default_nan(status); 6157 } 6158 aSign = extractFloatx80Sign( a ); 6159 bSign = extractFloatx80Sign( b ); 6160 if ( aSign == bSign ) { 6161 return addFloatx80Sigs(a, b, aSign, status); 6162 } 6163 else { 6164 return subFloatx80Sigs(a, b, aSign, status); 6165 } 6166 6167 } 6168 6169 /*---------------------------------------------------------------------------- 6170 | Returns the result of subtracting the extended double-precision floating- 6171 | point values `a' and `b'. The operation is performed according to the 6172 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    /* Reject unnormal/pseudo encodings before inspecting the operands. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* a - b: equal signs subtract magnitudes, opposite signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid; Inf * finite is infinity. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /* Both significands are in [1, 2); the 128-bit product is in [1, 4). */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product < 2: shift left one bit to restore the explicit integer bit. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* Inf / Inf is invalid. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / Inf is an exact zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; x / 0 raises divide-by-zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Ensure the dividend is smaller than the divisor so the quotient < 1. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* Estimate the top 64 quotient bits, then correct by remainder sign. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient bits; only refine when they could affect rounding. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Jam a sticky bit when the division is inexact. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
 *----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf rem x is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        /* x rem Inf is x itself. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* x rem 0 is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: the remainder is `a' itself (quotient 0). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit, then produce 62 quotient bits per iteration. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        /* Under-estimate by 2 so the remainder stays non-negative. */
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* Fix up the under-estimate until the remainder < divisor. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /*
         * IEEE remainder: round the quotient to nearest even, i.e. pick
         * the smaller of remainder and (divisor - remainder), flipping
         * the sign (and bumping the quotient) when the latter wins.
         */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6477 *----------------------------------------------------------------------------*/ 6478 6479 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6480 { 6481 uint64_t quotient; 6482 return floatx80_modrem(a, b, false, "ient, status); 6483 } 6484 6485 /*---------------------------------------------------------------------------- 6486 | Returns the remainder of the extended double-precision floating-point value 6487 | `a' with respect to the corresponding value `b', with the quotient truncated 6488 | toward zero. 6489 *----------------------------------------------------------------------------*/ 6490 6491 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6492 { 6493 uint64_t quotient; 6494 return floatx80_modrem(a, b, true, "ient, status); 6495 } 6496 6497 /*---------------------------------------------------------------------------- 6498 | Returns the square root of the extended double-precision floating-point 6499 | value `a'. The operation is performed according to the IEC/IEEE Standard 6500 | for Binary Floating-Point Arithmetic. 6501 *----------------------------------------------------------------------------*/ 6502 6503 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6504 { 6505 bool aSign; 6506 int32_t aExp, zExp; 6507 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6508 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6509 6510 if (floatx80_invalid_encoding(a)) { 6511 float_raise(float_flag_invalid, status); 6512 return floatx80_default_nan(status); 6513 } 6514 aSig0 = extractFloatx80Frac( a ); 6515 aExp = extractFloatx80Exp( a ); 6516 aSign = extractFloatx80Sign( a ); 6517 if ( aExp == 0x7FFF ) { 6518 if ((uint64_t)(aSig0 << 1)) { 6519 return propagateFloatx80NaN(a, a, status); 6520 } 6521 if ( ! 
aSign ) return a; 6522 goto invalid; 6523 } 6524 if ( aSign ) { 6525 if ( ( aExp | aSig0 ) == 0 ) return a; 6526 invalid: 6527 float_raise(float_flag_invalid, status); 6528 return floatx80_default_nan(status); 6529 } 6530 if ( aExp == 0 ) { 6531 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6532 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6533 } 6534 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6535 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6536 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6537 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6538 doubleZSig0 = zSig0<<1; 6539 mul64To128( zSig0, zSig0, &term0, &term1 ); 6540 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6541 while ( (int64_t) rem0 < 0 ) { 6542 --zSig0; 6543 doubleZSig0 -= 2; 6544 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6545 } 6546 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6547 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6548 if ( zSig1 == 0 ) zSig1 = 1; 6549 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6550 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6551 mul64To128( zSig1, zSig1, &term2, &term3 ); 6552 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6553 while ( (int64_t) rem1 < 0 ) { 6554 --zSig1; 6555 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6556 term3 |= 1; 6557 term2 |= doubleZSig0; 6558 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6559 } 6560 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6561 } 6562 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6563 zSig0 |= doubleZSig0; 6564 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6565 0, zExp, zSig0, zSig1, status); 6566 } 6567 6568 /*---------------------------------------------------------------------------- 6569 | Returns the result of converting the quadruple-precision floating-point 6570 | value `a' to the 32-bit two's complement integer format. 
 The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Force NaNs positive so they convert to the largest positive int. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
 *----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaNs are forced positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1: truncates to zero; inexact unless exactly zero. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the truncation was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
 *----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude >= 2^63: overflow.  -2^63 itself and NaNs are
               special-cased below. */
            float_raise(float_flag_invalid, status);
            if (    ! aSign
                 || (    ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
 *----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Exactly representable values just below -2^63 truncate to
               INT64_MIN (inexact); everything else here is invalid. */
            if (    ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to zero; inexact unless exact zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
 The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative magnitude >= 1 (or a NaN) cannot convert: invalid. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    /* Make the implicit integer bit explicit for normal numbers. */
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            /* Magnitude >= 2^64: overflow. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

/* As float128_to_uint64, but always rounded toward zero: temporarily
   override the rounding mode around the conversion. */
uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6813 | Returns the result of converting the quadruple-precision floating-point 6814 | value `a' to the 32-bit unsigned integer format. The conversion 6815 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6816 | Arithmetic except that the conversion is always rounded toward zero. 6817 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6818 | if the conversion overflows, the largest unsigned integer is returned. 6819 | If 'a' is negative, the value is rounded and zero is returned; negative 6820 | values that do not round to zero will raise the inexact exception. 6821 *----------------------------------------------------------------------------*/ 6822 6823 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6824 { 6825 uint64_t v; 6826 uint32_t res; 6827 int old_exc_flags = get_float_exception_flags(status); 6828 6829 v = float128_to_uint64_round_to_zero(a, status); 6830 if (v > 0xffffffff) { 6831 res = 0xffffffff; 6832 } else { 6833 return v; 6834 } 6835 set_float_exception_flags(old_exc_flags, status); 6836 float_raise(float_flag_invalid, status); 6837 return res; 6838 } 6839 6840 /*---------------------------------------------------------------------------- 6841 | Returns the result of converting the quadruple-precision floating-point value 6842 | `a' to the 32-bit unsigned integer format. The conversion is 6843 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6844 | Arithmetic---which means in particular that the conversion is rounded 6845 | according to the current rounding mode. If `a' is a NaN, the largest 6846 | positive integer is returned. If the conversion overflows, the 6847 | largest unsigned integer is returned. If 'a' is negative, the value is 6848 | rounded and zero is returned; negative values that do not round to zero 6849 | will raise the inexact exception. 
 *----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    int old_exc_flags = get_float_exception_flags(status);

    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        /* Saturate; flags from the 64-bit conversion are discarded below. */
        res = 0xffffffff;
    } else {
        return v;
    }
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the single-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Fold the low fraction word into a sticky bit, then narrow. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Place the explicit integer bit and rebias for float32. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.
 The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Left-align the 112-bit fraction, folding the tail into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        /* Place the explicit integer bit and rebias for float64. */
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* The converted NaN is quieted before being returned. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    /* Left-align to floatx80's explicit-integer-bit significand layout. */
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* aExp >= bias + 48: any fraction bits below the units place fall
           entirely within the low 64 bits of the significand.  */
        if ( 0x406F <= aExp ) {
            /* aExp >= bias + 112: all 112 fraction bits are integer bits,
               so the value is already integral (or NaN/infinity).  */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the last integer bit within z.low; it is 0
           when that bit is the lsb of z.high (aExp == 0x402F).  */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                /* Exactly halfway: clear the last bit to round to even.  */
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Clear the discarded fraction bits.  */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result can only be zero or +/-1.  */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Round to 1 only if |a| > 0.5 (exp == bias-1 with a
                   non-zero fraction).  */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* bias <= aExp < bias + 48: the rounding point lies within the
           high word; all of a.low is fraction (folded in as sticky).  */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Raise inexact if any bits were discarded by the rounding.  */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's,
           collecting the shifted-out bits in zSig2 (round/sticky).  */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b is subnormal: its effective exponent is 1, not 0.  */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);  /* implicit bit */
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* Mirror case: align a's significand to b's exponent.  */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents.  */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both subnormal (or zero): the exact sum is representable
               without rounding; just repack (flushing if requested).  */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Both implicit bits were set, so the sum carries into bit 49.  */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry out of the significand: result is already normalized.  */
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    /* Carry out: shift right one, jamming the lost bit into zSig2.  */
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left 14 bits for extra precision during
       the subtract; compensated by the `zExp - 14' at the end.  */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents.  */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* inf - inf is invalid.  */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Subnormals: effective exponent is 1.  */
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes to decide which operand to subtract from.  */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact zero result: negative only in round-down mode (IEEE rule).  */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Implicit integer bit, now at bit 62 after the 14-bit shift.  */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;  /* result takes b's (opposite) sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_add(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    /* Same signs: magnitudes add; opposite signs: magnitudes subtract.  */
    if ( aSign == bSign ) {
        return addFloat128Sigs(a, b, aSign, status);
    }
    else {
        return subFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    /* a - b: same signs means magnitudes subtract, opposite means add.  */
    if ( aSign == bSign ) {
        return subFloat128Sigs(a, b, aSign, status);
    }
    else {
        return addFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* inf * 0 is invalid.  */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* Exponents add; 0x4000 removes one bias (0x3FFF) and one extra
       because the product of two [1,2) significands lies in [1,4).  */
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* b's fraction is shifted left 16 WITHOUT its integer bit; the
       add128 below adds aSig back in to account for a * (b's integer
       bit) at the matching scale.  */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest 64 product bits into the sticky word.  */
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product >= 2: renormalize by one bit.  */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* inf / inf is invalid.  */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / inf == 0.  */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* 0 / 0 is invalid; otherwise finite / 0 raises div-by-zero
               and returns infinity.  */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Left-align both significands (with implicit bit restored).  */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Ensure aSig < bSig so the quotient estimates stay in range.  */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High 64 quotient bits: estimate may be slightly too large, so the
       loop backs it off while the remainder is negative.  */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low 64 quotient bits.  */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /* Only compute the exact remainder (for the sticky bit) when the
       quotient is close enough to a rounding boundary to matter.  */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(inf, x) is invalid.  */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(x, inf) == x.  */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: a is already the remainder.  */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce aSig by multiples of bSig, 61 quotient bits per iteration.
       The `q - 4' underestimate keeps the partial remainder positive.  */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial step for the remaining (< 64) quotient bits.  */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Step past zero: aSig ends up negative, alternateASig holds the
       last non-negative remainder.  */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* sigMean = remainder + (remainder - b); its sign picks the candidate
       nearer to zero, with ties broken toward even q (IEEE remainder).  */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+inf) == +inf; sqrt(-inf) is invalid.  */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) == -0; any other negative input is invalid.  */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Result exponent is half the (unbiased) input exponent.  */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* 32-bit seed estimate, refined to 64 bits via a division step.  */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high 64 root bits while the remainder is negative.  */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low 64 root bits; exact remainder (for sticky) only computed when
       the result is near a rounding boundary.  */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b', returning a
| FloatRelation.  Invalid encodings and (for non-quiet comparisons or
| signaling NaNs) NaN operands raise the invalid exception and compare
| unordered.
*----------------------------------------------------------------------------*/

static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* NaN operands: quiet comparisons only raise invalid for signaling
       NaNs; ordinary comparisons raise it for any NaN.  */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case: +0 == -0 */
            return float_relation_equal;
        } else {
            /* Opposite signs: the non-negative one is greater.
               1 - 2*aSign maps to +1 (greater) or -1 (less).  */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals (exponent 0 with the explicit
           integer bit set) before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: a bitwise magnitude compare orders the values;
               flip the sense for negative numbers.  */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    /* Signaling comparison: any NaN raises invalid.  */
    return floatx80_compare_internal(a, b, 0, status);
}

FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    /* Quiet comparison: only signaling NaNs raise invalid.  */
    return floatx80_compare_internal(a, b, 1, status);
}

static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    /* NaN operands compare unordered; see floatx80_compare_internal for
       the quiet/signaling distinction.  */
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case: +0 == -0 */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

FloatRelation float128_compare_quiet(float128 a, float128 b,
                                     float_status *status)
{
    return float128_compare_internal(a, b, 1, status);
}

/*----------------------------------------------------------------------------
| Returns `a' scaled by 2^n, rounding according to `status'.
*----------------------------------------------------------------------------*/

floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        aExp++;  /* subnormal: effective exponent is 1 */
    }

    /* Clamp n so that aExp + n cannot overflow int32_t; values beyond
       +/-0x10000 saturate to overflow/underflow anyway.  */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        aSig0 |= UINT64_C(0x0001000000000000);  /* restore integer bit */
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;
    } else {
        aExp++;  /* subnormal: effective exponent is 1 */
    }

    /* Clamp n to avoid int32_t overflow of aExp (see floatx80_scalbn).  */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* NOTE(review): the extra -1 compensates for the exponent convention
       of normalizeRoundAndPackFloat128 -- confirm against its definition.  */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                        , status);

}

static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        /* Fall back to the softfloat path for fused multiply-add.  */
        force_soft_fma = true;
    }
}