1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Generate a flush-to-zero helper for one denormal input of type `soft_t'
 * (float32/float64).  If *a is a denormal it is replaced in place by a
 * zero of the same sign and the input-denormal flag is raised.  No check
 * of float_status is made here; callers must already have decided that
 * input flushing is enabled.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, gated on s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* As above, for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* As above, for three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the
 * generated hardfloat functions.  Each combination of number of inputs
 * and float size gets its own value.
 */
#if defined(__x86_64__)
/* On x86_64, fpclassify wins for doubles but not for floats. */
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations.  This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Return true if the host FPU may be used for this operation: hardfloat must
 * be enabled at build time, the inexact flag must already be set (so we never
 * need to detect newly-raised inexact), and rounding must be the host's
 * default round-to-nearest-even.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
 * float32_is_zero_or_normal) for most condition checks, or native ones
 * (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on
 * the compiler to propagate constants and inline everything into the
 * callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/*
 * Dual view of one value: `s' is the softfloat bit-pattern representation,
 * `h' the host's native float type, so guest values can be handed directly
 * to the host FPU.
 */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Predicates run before (pre) and after (post) the host-FPU computation. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Soft-float and hard-float implementations of a 2-input operation. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

/* Infinity check; isinf() is faster on some hosts (see above). */
static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input hardfloat dispatcher: try the host FPU (`hard') when the
 * pre-condition holds, otherwise fall back to the softfloat path (`soft').
 */
static inline float32
float32_gen2(float32 xa, float32 xb,
             float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    /* Softfloat path if flags/rounding preclude hardfloat. */
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Inf result from finite operands means overflow occurred. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Possible underflow; redo in softfloat to get the flags right. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* Double-precision counterpart of float32_gen2. */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/ 416 417 static inline int extractFloat32Exp(float32 a) 418 { 419 return (float32_val(a) >> 23) & 0xFF; 420 } 421 422 /*---------------------------------------------------------------------------- 423 | Returns the sign bit of the single-precision floating-point value `a'. 424 *----------------------------------------------------------------------------*/ 425 426 static inline bool extractFloat32Sign(float32 a) 427 { 428 return float32_val(a) >> 31; 429 } 430 431 /*---------------------------------------------------------------------------- 432 | Returns the fraction bits of the double-precision floating-point value `a'. 433 *----------------------------------------------------------------------------*/ 434 435 static inline uint64_t extractFloat64Frac(float64 a) 436 { 437 return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF); 438 } 439 440 /*---------------------------------------------------------------------------- 441 | Returns the exponent bits of the double-precision floating-point value `a'. 442 *----------------------------------------------------------------------------*/ 443 444 static inline int extractFloat64Exp(float64 a) 445 { 446 return (float64_val(a) >> 52) & 0x7FF; 447 } 448 449 /*---------------------------------------------------------------------------- 450 | Returns the sign bit of the double-precision floating-point value `a'. 451 *----------------------------------------------------------------------------*/ 452 453 static inline bool extractFloat64Sign(float64 a) 454 { 455 return float64_val(a) >> 63; 456 } 457 458 /* 459 * Classify a floating point number. Everything above float_class_qnan 460 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Turn a FloatClass into a single-bit mask for set-membership tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan and snan being the last two enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT  63
#define DECOMPOSED_IMPLICIT_BIT  (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                              \
    .exp_size       = E,                                \
    .exp_bias       = ((1 << E) - 1) >> 1,              \
    .exp_max        = (1 << E) - 1,                     \
    .frac_size      = F,                                \
    .frac_shift     = (-F - 1) & 63,                    \
    .frac_lsb       = 1ull << ((-F - 1) & 63),          \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),          \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,    \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize.  */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

/* 128-bit unpack: the high word holds sign/exp and the top 48 fraction bits. */
static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.
 */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

/* 128-bit pack: mirror of float128_unpack_raw. */
static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/* Dispatch on parts size: FloatParts128 -> parts128_*, else parts64_*. */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)


/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* Shift a 128-bit fraction left by c bits. */
static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  frac128_shl(A, C)

/* Shift a 128-bit fraction right by c bits. */
static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  frac128_shr(A, C)

/*
 * Canonicalize EXP and FRAC, setting CLS.
 * - exp_max with nonzero frac is a NaN (signaling or quiet per target);
 * - exp 0 is zero, a flushed denormal, or a denormal renormalized by
 *   shifting the fraction up and adjusting exp accordingly;
 * - otherwise a normal number: unbias exp and set the implicit bit.
 */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                    float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            /* Denormal: normalize so the msb sits at the binary point. */
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts.
 * There are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /* Select the rounding increment and overflow policy per mode. */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                if (uadd64_overflow(frac, inc, &frac)) {
                    /* Rounding carried out of the top bit; renormalize. */
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Mode saturates to the largest finite value. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Subnormal result: possible underflow, denormalize by 1-exp. */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                uint64_t discard;
                /* Tininess after rounding: tiny unless rounding overflows. */
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced a smallest normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/*
 * Handle a NaN operand for a single-input operation: raise invalid for an
 * sNaN, then either propagate the (silenced) input NaN or return the
 * default NaN, per s->default_nan_mode.
 */
static FloatParts64 return_nan(FloatParts64 a, float_status *s)
{
    g_assert(is_nan(a.cls));
    if (is_snan(a.cls)) {
        float_raise(float_flag_invalid, s);
        if (!s->default_nan_mode) {
            parts_silence_nan(&a, s);
            return a;
        }
    } else if (!s->default_nan_mode) {
        return a;
    }
    parts_default_nan(&a, s);
    return a;
}

/*
 * Choose which of two NaN operands to propagate from a 2-input operation,
 * delegating the target-specific preference to pickNaN().
 */
static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        float_raise(float_flag_invalid, s);
    }

    if (s->default_nan_mode) {
        parts_default_nan(&a, s);
    } else {
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign), s)) {
            a = b;
        }
        if (is_snan(a.cls)) {
            parts_silence_nan(&a, s);
        }
    }
    return a;
}

/*
 * As pick_nan, for the three operands of a fused multiply-add;
 * inf_zero flags the invalid Inf*0 case.
 */
static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                    bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        float_raise(float_flag_invalid, s);
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /*
         * Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the
         * Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        parts_default_nan(&a, s);
        break;
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        parts_silence_nan(&a, s);
    }
    return a;
}

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, params, s);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &bfloat16_params, s);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    *p = round_canonical(*p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    *p = round_canonical(*p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float32_params, s);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float32_params);
    return float32_pack_raw(p);
}

static void
float64_unpack_canonical(FloatParts64 *p, float64 f, 1055 float_status *s) 1056 { 1057 float64_unpack_raw(p, f); 1058 *p = sf_canonicalize(*p, &float64_params, s); 1059 } 1060 1061 static float64 float64_round_pack_canonical(FloatParts64 *p, 1062 float_status *s) 1063 { 1064 *p = round_canonical(*p, s, &float64_params); 1065 return float64_pack_raw(p); 1066 } 1067 1068 /* 1069 * Returns the result of adding or subtracting the values of the 1070 * floating-point values `a' and `b'. The operation is performed 1071 * according to the IEC/IEEE Standard for Binary Floating-Point 1072 * Arithmetic. 1073 */ 1074 1075 static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract, 1076 float_status *s) 1077 { 1078 bool a_sign = a.sign; 1079 bool b_sign = b.sign ^ subtract; 1080 1081 if (a_sign != b_sign) { 1082 /* Subtraction */ 1083 1084 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1085 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 1086 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1087 a.frac = a.frac - b.frac; 1088 } else { 1089 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1090 a.frac = b.frac - a.frac; 1091 a.exp = b.exp; 1092 a_sign ^= 1; 1093 } 1094 1095 if (a.frac == 0) { 1096 a.cls = float_class_zero; 1097 a.sign = s->float_rounding_mode == float_round_down; 1098 } else { 1099 int shift = clz64(a.frac); 1100 a.frac = a.frac << shift; 1101 a.exp = a.exp - shift; 1102 a.sign = a_sign; 1103 } 1104 return a; 1105 } 1106 if (is_nan(a.cls) || is_nan(b.cls)) { 1107 return pick_nan(a, b, s); 1108 } 1109 if (a.cls == float_class_inf) { 1110 if (b.cls == float_class_inf) { 1111 float_raise(float_flag_invalid, s); 1112 parts_default_nan(&a, s); 1113 } 1114 return a; 1115 } 1116 if (a.cls == float_class_zero && b.cls == float_class_zero) { 1117 a.sign = s->float_rounding_mode == float_round_down; 1118 return a; 1119 } 1120 if (a.cls == float_class_zero || b.cls == float_class_inf) { 1121 b.sign = a_sign ^ 1; 
1122 return b; 1123 } 1124 if (b.cls == float_class_zero) { 1125 return a; 1126 } 1127 } else { 1128 /* Addition */ 1129 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1130 if (a.exp > b.exp) { 1131 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 1132 } else if (a.exp < b.exp) { 1133 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 1134 a.exp = b.exp; 1135 } 1136 1137 if (uadd64_overflow(a.frac, b.frac, &a.frac)) { 1138 shift64RightJamming(a.frac, 1, &a.frac); 1139 a.frac |= DECOMPOSED_IMPLICIT_BIT; 1140 a.exp += 1; 1141 } 1142 return a; 1143 } 1144 if (is_nan(a.cls) || is_nan(b.cls)) { 1145 return pick_nan(a, b, s); 1146 } 1147 if (a.cls == float_class_inf || b.cls == float_class_zero) { 1148 return a; 1149 } 1150 if (b.cls == float_class_inf || a.cls == float_class_zero) { 1151 b.sign = b_sign; 1152 return b; 1153 } 1154 } 1155 g_assert_not_reached(); 1156 } 1157 1158 /* 1159 * Returns the result of adding or subtracting the floating-point 1160 * values `a' and `b'. The operation is performed according to the 1161 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 */

float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(&pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat add/sub for float32: unpack, compute, round and repack. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(&pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

/* Softfloat add/sub for float64: unpack, compute, round and repack. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float64_round_pack_canonical(&pr, status);
}

static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}

static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}

/* Hardfloat helpers: rely on the host FPU for the fast path. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/*
 * Post-condition for the hardfloat path: a zero result from non-zero
 * inputs may hide a signed-zero subtlety, so fall back to softfloat.
 */
static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/*
 * Returns the result of adding or subtracting the bfloat16
 * values `a' and `b'.
 */
bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* 64x64 -> 128-bit product of the two significands. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        /* Renormalize so the implicit bit is at the top of hi. */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Jam the discarded low bits into the sticky lsb. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat multiply for float32: unpack, compute, round and repack. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat multiply for float64: unpack, compute, round and repack. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat helpers: rely on the host FPU for the fast path. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
1467 */ 1468 1469 static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c, 1470 int flags, float_status *s) 1471 { 1472 bool inf_zero, p_sign; 1473 bool sign_flip = flags & float_muladd_negate_result; 1474 FloatClass p_class; 1475 uint64_t hi, lo; 1476 int p_exp; 1477 int ab_mask, abc_mask; 1478 1479 ab_mask = float_cmask(a.cls) | float_cmask(b.cls); 1480 abc_mask = float_cmask(c.cls) | ab_mask; 1481 inf_zero = ab_mask == float_cmask_infzero; 1482 1483 /* It is implementation-defined whether the cases of (0,inf,qnan) 1484 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 1485 * they return if they do), so we have to hand this information 1486 * off to the target-specific pick-a-NaN routine. 1487 */ 1488 if (unlikely(abc_mask & float_cmask_anynan)) { 1489 return pick_nan_muladd(a, b, c, inf_zero, s); 1490 } 1491 1492 if (inf_zero) { 1493 float_raise(float_flag_invalid, s); 1494 parts_default_nan(&a, s); 1495 return a; 1496 } 1497 1498 if (flags & float_muladd_negate_c) { 1499 c.sign ^= 1; 1500 } 1501 1502 p_sign = a.sign ^ b.sign; 1503 1504 if (flags & float_muladd_negate_product) { 1505 p_sign ^= 1; 1506 } 1507 1508 if (ab_mask & float_cmask_inf) { 1509 p_class = float_class_inf; 1510 } else if (ab_mask & float_cmask_zero) { 1511 p_class = float_class_zero; 1512 } else { 1513 p_class = float_class_normal; 1514 } 1515 1516 if (c.cls == float_class_inf) { 1517 if (p_class == float_class_inf && p_sign != c.sign) { 1518 float_raise(float_flag_invalid, s); 1519 parts_default_nan(&c, s); 1520 } else { 1521 c.sign ^= sign_flip; 1522 } 1523 return c; 1524 } 1525 1526 if (p_class == float_class_inf) { 1527 a.cls = float_class_inf; 1528 a.sign = p_sign ^ sign_flip; 1529 return a; 1530 } 1531 1532 if (p_class == float_class_zero) { 1533 if (c.cls == float_class_zero) { 1534 if (p_sign != c.sign) { 1535 p_sign = s->float_rounding_mode == float_round_down; 1536 } 1537 c.sign = p_sign; 1538 } else if (flags & float_muladd_halve_result) { 
1539 c.exp -= 1; 1540 } 1541 c.sign ^= sign_flip; 1542 return c; 1543 } 1544 1545 /* a & b should be normals now... */ 1546 assert(a.cls == float_class_normal && 1547 b.cls == float_class_normal); 1548 1549 p_exp = a.exp + b.exp; 1550 1551 mul64To128(a.frac, b.frac, &hi, &lo); 1552 1553 /* Renormalize to the msb. */ 1554 if (hi & DECOMPOSED_IMPLICIT_BIT) { 1555 p_exp += 1; 1556 } else { 1557 shortShift128Left(hi, lo, 1, &hi, &lo); 1558 } 1559 1560 /* + add/sub */ 1561 if (c.cls != float_class_zero) { 1562 int exp_diff = p_exp - c.exp; 1563 if (p_sign == c.sign) { 1564 /* Addition */ 1565 if (exp_diff <= 0) { 1566 shift64RightJamming(hi, -exp_diff, &hi); 1567 p_exp = c.exp; 1568 if (uadd64_overflow(hi, c.frac, &hi)) { 1569 shift64RightJamming(hi, 1, &hi); 1570 hi |= DECOMPOSED_IMPLICIT_BIT; 1571 p_exp += 1; 1572 } 1573 } else { 1574 uint64_t c_hi, c_lo, over; 1575 shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo); 1576 add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo); 1577 if (over) { 1578 shift64RightJamming(hi, 1, &hi); 1579 hi |= DECOMPOSED_IMPLICIT_BIT; 1580 p_exp += 1; 1581 } 1582 } 1583 } else { 1584 /* Subtraction */ 1585 uint64_t c_hi = c.frac, c_lo = 0; 1586 1587 if (exp_diff <= 0) { 1588 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1589 if (exp_diff == 0 1590 && 1591 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1592 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1593 } else { 1594 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1595 p_sign ^= 1; 1596 p_exp = c.exp; 1597 } 1598 } else { 1599 shift128RightJamming(c_hi, c_lo, 1600 exp_diff, 1601 &c_hi, &c_lo); 1602 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1603 } 1604 1605 if (hi == 0 && lo == 0) { 1606 a.cls = float_class_zero; 1607 a.sign = s->float_rounding_mode == float_round_down; 1608 a.sign ^= sign_flip; 1609 return a; 1610 } else { 1611 int shift; 1612 if (hi != 0) { 1613 shift = clz64(hi); 1614 } else { 1615 shift = clz64(lo) + 64; 1616 } 1617 /* Normalizing to a binary point of 124 is the 1618 correct adjust 
for the exponent. However since we're 1619 shifting, we might as well put the binary point back 1620 at 63 where we really want it. Therefore shift as 1621 if we're leaving 1 bit at the top of the word, but 1622 adjust the exponent as if we're leaving 3 bits. */ 1623 shift128Left(hi, lo, shift, &hi, &lo); 1624 p_exp -= shift; 1625 } 1626 } 1627 } 1628 hi |= (lo != 0); 1629 1630 if (flags & float_muladd_halve_result) { 1631 p_exp -= 1; 1632 } 1633 1634 /* finally prepare our result */ 1635 a.cls = float_class_normal; 1636 a.sign = p_sign ^ sign_flip; 1637 a.exp = p_exp; 1638 a.frac = hi; 1639 1640 return a; 1641 } 1642 1643 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, 1644 int flags, float_status *status) 1645 { 1646 FloatParts64 pa, pb, pc, pr; 1647 1648 float16_unpack_canonical(&pa, a, status); 1649 float16_unpack_canonical(&pb, b, status); 1650 float16_unpack_canonical(&pc, c, status); 1651 pr = muladd_floats(pa, pb, pc, flags, status); 1652 1653 return float16_round_pack_canonical(&pr, status); 1654 } 1655 1656 static float32 QEMU_SOFTFLOAT_ATTR 1657 soft_f32_muladd(float32 a, float32 b, float32 c, int flags, 1658 float_status *status) 1659 { 1660 FloatParts64 pa, pb, pc, pr; 1661 1662 float32_unpack_canonical(&pa, a, status); 1663 float32_unpack_canonical(&pb, b, status); 1664 float32_unpack_canonical(&pc, c, status); 1665 pr = muladd_floats(pa, pb, pc, flags, status); 1666 1667 return float32_round_pack_canonical(&pr, status); 1668 } 1669 1670 static float64 QEMU_SOFTFLOAT_ATTR 1671 soft_f64_muladd(float64 a, float64 b, float64 c, int flags, 1672 float_status *status) 1673 { 1674 FloatParts64 pa, pb, pc, pr; 1675 1676 float64_unpack_canonical(&pa, a, status); 1677 float64_unpack_canonical(&pb, b, status); 1678 float64_unpack_canonical(&pc, c, status); 1679 pr = muladd_floats(pa, pb, pc, flags, status); 1680 1681 return float64_round_pack_canonical(&pr, status); 1682 } 1683 1684 static bool force_soft_fma; 1685 1686 float32 QEMU_FLATTEN 
1687 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1688 { 1689 union_float32 ua, ub, uc, ur; 1690 1691 ua.s = xa; 1692 ub.s = xb; 1693 uc.s = xc; 1694 1695 if (unlikely(!can_use_fpu(s))) { 1696 goto soft; 1697 } 1698 if (unlikely(flags & float_muladd_halve_result)) { 1699 goto soft; 1700 } 1701 1702 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1703 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1704 goto soft; 1705 } 1706 1707 if (unlikely(force_soft_fma)) { 1708 goto soft; 1709 } 1710 1711 /* 1712 * When (a || b) == 0, there's no need to check for under/over flow, 1713 * since we know the addend is (normal || 0) and the product is 0. 1714 */ 1715 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1716 union_float32 up; 1717 bool prod_sign; 1718 1719 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1720 prod_sign ^= !!(flags & float_muladd_negate_product); 1721 up.s = float32_set_sign(float32_zero, prod_sign); 1722 1723 if (flags & float_muladd_negate_c) { 1724 uc.h = -uc.h; 1725 } 1726 ur.h = up.h + uc.h; 1727 } else { 1728 union_float32 ua_orig = ua; 1729 union_float32 uc_orig = uc; 1730 1731 if (flags & float_muladd_negate_product) { 1732 ua.h = -ua.h; 1733 } 1734 if (flags & float_muladd_negate_c) { 1735 uc.h = -uc.h; 1736 } 1737 1738 ur.h = fmaf(ua.h, ub.h, uc.h); 1739 1740 if (unlikely(f32_is_inf(ur))) { 1741 float_raise(float_flag_overflow, s); 1742 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1743 ua = ua_orig; 1744 uc = uc_orig; 1745 goto soft; 1746 } 1747 } 1748 if (flags & float_muladd_negate_result) { 1749 return float32_chs(ur.s); 1750 } 1751 return ur.s; 1752 1753 soft: 1754 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1755 } 1756 1757 float64 QEMU_FLATTEN 1758 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1759 { 1760 union_float64 ua, ub, uc, ur; 1761 1762 ua.s = xa; 1763 ub.s = xb; 1764 uc.s = xc; 1765 1766 if (unlikely(!can_use_fpu(s))) { 1767 goto soft; 1768 } 1769 
if (unlikely(flags & float_muladd_halve_result)) { 1770 goto soft; 1771 } 1772 1773 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1774 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1775 goto soft; 1776 } 1777 1778 if (unlikely(force_soft_fma)) { 1779 goto soft; 1780 } 1781 1782 /* 1783 * When (a || b) == 0, there's no need to check for under/over flow, 1784 * since we know the addend is (normal || 0) and the product is 0. 1785 */ 1786 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1787 union_float64 up; 1788 bool prod_sign; 1789 1790 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1791 prod_sign ^= !!(flags & float_muladd_negate_product); 1792 up.s = float64_set_sign(float64_zero, prod_sign); 1793 1794 if (flags & float_muladd_negate_c) { 1795 uc.h = -uc.h; 1796 } 1797 ur.h = up.h + uc.h; 1798 } else { 1799 union_float64 ua_orig = ua; 1800 union_float64 uc_orig = uc; 1801 1802 if (flags & float_muladd_negate_product) { 1803 ua.h = -ua.h; 1804 } 1805 if (flags & float_muladd_negate_c) { 1806 uc.h = -uc.h; 1807 } 1808 1809 ur.h = fma(ua.h, ub.h, uc.h); 1810 1811 if (unlikely(f64_is_inf(ur))) { 1812 float_raise(float_flag_overflow, s); 1813 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1814 ua = ua_orig; 1815 uc = uc_orig; 1816 goto soft; 1817 } 1818 } 1819 if (flags & float_muladd_negate_result) { 1820 return float64_chs(ur.s); 1821 } 1822 return ur.s; 1823 1824 soft: 1825 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1826 } 1827 1828 /* 1829 * Returns the result of multiplying the bfloat16 values `a' 1830 * and `b' then adding 'c', with no intermediate rounding step after the 1831 * multiplication. 
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward. If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat divide for float32: unpack, compute, round and repack. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat divide for float64: unpack, compute, round and repack. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat helpers: rely on the host FPU for the fast path. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/*
 * Pre-condition for the hardfloat path: dividend zero-or-normal and
 * divisor normal, so no exceptional dividend/divisor cases remain.
 */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-condition for the hardfloat path: non-zero result. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format. Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format. Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* !ieee selects the ARM alternative half-precision format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results. */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Subnormal, Inf or NaN: take the full softfloat path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
                                 int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale so that the exponent addition cannot overflow. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            bool one;
            /* all fractional: the result is either 0 or +/-1 */
            float_raise(float_flag_inexact, s);
            switch (rmode) {
            case float_round_nearest_even:
                /* > 0.5 rounds to 1; exactly 0.5 ties to the even value 0 */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* >= 0.5 rounds away from zero, i.e. to 1 */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /*
             * Mixed integer/fraction: frac_lsb is the lowest integral bit
             * of the fraction; everything below rnd_mask is fractional.
             */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            uint64_t rnd_mask = rnd_even_mask >> 1;
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /* An exact tie with an even LSB gets no increment. */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Increment only when the integral LSB is currently even. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            if (a.frac & rnd_mask) {
                float_raise(float_flag_inexact, s);
                if (uadd64_overflow(a.frac, inc, &a.frac)) {
                    /* Carry out of the fraction: renormalize. */
                    a.frac >>= 1;
                    a.frac |= DECOMPOSED_IMPLICIT_BIT;
                    a.exp++;
                }
                /* Clear the now-rounded-away fractional bits. */
                a.frac &= ~rnd_mask;
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}

/* Round each format to an integral value in the current rounding mode. */
float16 float16_round_to_int(float16 a, float_status *s)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float16_round_pack_canonical(&pr, s);
}

float32 float32_round_to_int(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float64_round_to_int(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return float64_round_pack_canonical(&pr, s);
}

/*
 * Rounds the bfloat16 value `a' to an integer, and returns the
 * result as a bfloat16 value.
 */

bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = round_to_int(pa, s->float_rounding_mode, 0, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode.  If `a' is a NaN,
 * the largest positive integer is returned.  Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */

static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    /* Save the flags so a saturating result reports only 'invalid'. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* Too large for any 64-bit result: force saturation below. */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t)min is computed in unsigned arithmetic, so it is
             * well-defined even when min == INT64_MIN. */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/* Float -> signed integer conversions, scaled by 2**scale before rounding. */
int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                        float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Convenience wrappers: convert using the status rounding mode, scale 0. */
int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: convert with round-towards-zero (truncation). */
int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the two's complement integer format.
 */

/* bfloat16 -> signed integer conversions, scaled by 2**scale. */
int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format.  The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode.  If `a' is a NaN,
 * the largest unsigned integer is returned.  Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned.  If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    /* Save the flags so a saturating result reports only 'invalid'. */
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* Negative values that did not round to zero are invalid. */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}

/* Float -> unsigned integer conversions, scaled by 2**scale. */
uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* Convenience wrappers: convert using the status rounding mode, scale 0. */
uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: convert with round-towards-zero (truncation). */
uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

/* bfloat16 -> unsigned integer conversions, scaled by 2**scale. */
uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format.
The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* Unsigned negation is well-defined even for INT64_MIN. */
            f = -f;
            r.sign = true;
        }
        /* Normalize so the MSB sits at the decomposed binary point. */
        shift = clz64(f);
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}

/* Signed integer -> float16, scaled by 2**scale before packing. */
float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

/* Signed integer -> float32. */
float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

/* Signed integer -> float64. */
float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

/* Signed integer -> bfloat16. */
bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format.  The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
{
    /* Unsigned input: the result is never negative. */
    FloatParts64 r = { .sign = false };
    int shift;

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        /* Normalize so the MSB sits at the decomposed binary point. */
        shift = clz64(a);
        r.cls = float_class_normal;
        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = a << shift;
    }

    return r;
}

/* Unsigned integer -> float16, scaled by 2**scale before packing. */
float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float16_scalbn(a, scale, status);
}

float16 uint64_to_float16(uint64_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint32_to_float16(uint32_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint16_to_float16(uint16_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

float16 uint8_to_float16(uint8_t a, float_status *status)
{
    return uint64_to_float16_scalbn(a, 0, status);
}

/* Unsigned integer -> float32. */
float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float32_scalbn(a, scale, status);
}

float32 uint64_to_float32(uint64_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint32_to_float32(uint32_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

float32 uint16_to_float32(uint16_t a, float_status *status)
{
    return uint64_to_float32_scalbn(a, 0, status);
}

/* Unsigned integer -> float64. */
float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_float64_scalbn(a, scale, status);
}

float64 uint64_to_float64(uint64_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint32_to_float64(uint32_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

float64 uint16_to_float64(uint16_t a, float_status *status)
{
    return uint64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the unsigned integer `a' to the
 * bfloat16 format.
 */

/* Unsigned integer -> bfloat16. */
bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
{
    FloatParts64 pa = uint_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
{
    return uint64_to_bfloat16_scalbn(a, 0, status);
}

/* Float Min/Max */
/* min() and max() functions.  These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions.  These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations.  min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and minNumMag() from the IEEE-754 2008.
 */
static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
                                  bool ieee, bool ismag, float_status *s)
{
    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
        if (ieee) {
            /* Takes two floating-point values `a' and `b', one of
             * which is a NaN, and returns the appropriate NaN
             * result. If either `a' or `b' is a signaling NaN,
             * the invalid exception is raised.
             */
            if (is_snan(a.cls) || is_snan(b.cls)) {
                return pick_nan(a, b, s);
            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
                return b;
            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
                return a;
            }
        }
        /* Non-ieee semantics, or both operands NaN: propagate a NaN. */
        return pick_nan(a, b, s);
    } else {
        int a_exp, b_exp;

        /* Map each class onto an exponent so magnitude ordering can be
         * decided on (exp, frac) alone: infinity above every normal,
         * zero below.
         */
        switch (a.cls) {
        case float_class_normal:
            a_exp = a.exp;
            break;
        case float_class_inf:
            a_exp = INT_MAX;
            break;
        case float_class_zero:
            a_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }
        switch (b.cls) {
        case float_class_normal:
            b_exp = b.exp;
            break;
        case float_class_inf:
            b_exp = INT_MAX;
            break;
        case float_class_zero:
            b_exp = INT_MIN;
            break;
        default:
            g_assert_not_reached();
            break;
        }

        /* minnummag/maxnummag: decide on magnitude first; fall through
         * to the sign-aware comparison only when magnitudes are equal.
         */
        if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            return a_less ^ ismin ? b : a;
        }

        if (a.sign == b.sign) {
            bool a_less = a_exp < b_exp;
            if (a_exp == b_exp) {
                a_less = a.frac < b.frac;
            }
            /* For negative values the magnitude ordering is reversed,
             * hence the extra xor with the sign bit.
             */
            return a.sign ^ a_less ^ ismin ? b : a;
        } else {
            /* Signs differ: min is the negative operand, max the
             * positive.  This also orders -0 below +0 as required.
             */
            return a.sign ^ ismin ? b : a;
        }
    }
}

/* Expand the six float<sz> min/max entry points: unpack both operands,
 * run the shared decomposed implementation, and repack.
 */
#define MINMAX(sz, name, ismin, isiee, ismag)                           \
float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
                                     float_status *s)                   \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return float ## sz ## _round_pack_canonical(&pr, s);                \
}

MINMAX(16, min, true, false, false)
MINMAX(16, minnum, true, true, false)
MINMAX(16, minnummag, true, true, true)
MINMAX(16, max, false, false, false)
MINMAX(16, maxnum, false, true, false)
MINMAX(16, maxnummag, false, true, true)

MINMAX(32, min, true, false, false)
MINMAX(32, minnum, true, true, false)
MINMAX(32, minnummag, true, true, true)
MINMAX(32, max, false, false, false)
MINMAX(32, maxnum, false, true, false)
MINMAX(32, maxnummag, false, true, true)

MINMAX(64, min, true, false, false)
MINMAX(64, minnum, true, true, false)
MINMAX(64, minnummag, true, true, true)
MINMAX(64, max, false, false, false)
MINMAX(64, maxnum, false, true, false)
MINMAX(64, maxnummag, false, true, true)

#undef MINMAX

/* As MINMAX, but for the bfloat16 type. */
#define BF16_MINMAX(name, ismin, isiee, ismag)                          \
bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
{                                                                       \
    FloatParts64 pa, pb, pr;                                            \
    bfloat16_unpack_canonical(&pa, a, s);                               \
    bfloat16_unpack_canonical(&pb, b, s);                               \
    pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);                 \
    return bfloat16_round_pack_canonical(&pr, s);                       \
}

BF16_MINMAX(min, true, false, false)
BF16_MINMAX(minnum, true, true, false)
BF16_MINMAX(minnummag, true, true, true)
BF16_MINMAX(max, false, false, false)
BF16_MINMAX(maxnum, false, true, false)
BF16_MINMAX(maxnummag, false, true, true)

#undef BF16_MINMAX

/* Floating point compare
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    /* Unordered if either operand is a NaN.  A signaling NaN always
     * raises invalid; a quiet NaN raises it only for the signaling
     * (non-quiet) compare.
     */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeroes compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    /* Both normal and nonzero: different signs decide immediately. */
    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign: compare exponents, then fractions, with the ordering
     * inverted for negative values.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ? float_relation_greater : float_relation_less;
        }
    }
}

/* Expand the softfloat compare helper for a given width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: use the host's C99 comparison macros when
 * permitted, falling back to softfloat for the unordered case so the
 * correct flags get raised.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always use softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
 */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    /* Zero and infinity are unchanged by scaling. */
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    if (is_nan(a.cls)) {
        return return_nan(a, s);
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative number: invalid operation, default NaN. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast path: defer to the host sqrtf() for non-negative
 * zero/normal inputs, otherwise fall back to softfloat.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    /* The canonical fraction sits at the decomposed binary point;
     * shift it down into the format's own fraction field.
     */
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    /* 128-bit fraction lives in two words; use the frac_* helpers. */
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3858 *----------------------------------------------------------------------------*/ 3859 3860 float16 float16_silence_nan(float16 a, float_status *status) 3861 { 3862 FloatParts64 p; 3863 3864 float16_unpack_raw(&p, a); 3865 p.frac <<= float16_params.frac_shift; 3866 parts_silence_nan(&p, status); 3867 p.frac >>= float16_params.frac_shift; 3868 return float16_pack_raw(&p); 3869 } 3870 3871 float32 float32_silence_nan(float32 a, float_status *status) 3872 { 3873 FloatParts64 p; 3874 3875 float32_unpack_raw(&p, a); 3876 p.frac <<= float32_params.frac_shift; 3877 parts_silence_nan(&p, status); 3878 p.frac >>= float32_params.frac_shift; 3879 return float32_pack_raw(&p); 3880 } 3881 3882 float64 float64_silence_nan(float64 a, float_status *status) 3883 { 3884 FloatParts64 p; 3885 3886 float64_unpack_raw(&p, a); 3887 p.frac <<= float64_params.frac_shift; 3888 parts_silence_nan(&p, status); 3889 p.frac >>= float64_params.frac_shift; 3890 return float64_pack_raw(&p); 3891 } 3892 3893 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3894 { 3895 FloatParts64 p; 3896 3897 bfloat16_unpack_raw(&p, a); 3898 p.frac <<= bfloat16_params.frac_shift; 3899 parts_silence_nan(&p, status); 3900 p.frac >>= bfloat16_params.frac_shift; 3901 return bfloat16_pack_raw(&p); 3902 } 3903 3904 float128 float128_silence_nan(float128 a, float_status *status) 3905 { 3906 FloatParts128 p; 3907 3908 float128_unpack_raw(&p, a); 3909 frac_shl(&p, float128_params.frac_shift); 3910 parts_silence_nan(&p, status); 3911 frac_shr(&p, float128_params.frac_shift); 3912 return float128_pack_raw(&p); 3913 } 3914 3915 /*---------------------------------------------------------------------------- 3916 | If `a' is denormal and we are in flush-to-zero mode then set the 3917 | input-denormal exception and return zero. Otherwise just return the value. 
3918 *----------------------------------------------------------------------------*/ 3919 3920 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3921 { 3922 if (p.exp == 0 && p.frac != 0) { 3923 float_raise(float_flag_input_denormal, status); 3924 return true; 3925 } 3926 3927 return false; 3928 } 3929 3930 float16 float16_squash_input_denormal(float16 a, float_status *status) 3931 { 3932 if (status->flush_inputs_to_zero) { 3933 FloatParts64 p; 3934 3935 float16_unpack_raw(&p, a); 3936 if (parts_squash_denormal(p, status)) { 3937 return float16_set_sign(float16_zero, p.sign); 3938 } 3939 } 3940 return a; 3941 } 3942 3943 float32 float32_squash_input_denormal(float32 a, float_status *status) 3944 { 3945 if (status->flush_inputs_to_zero) { 3946 FloatParts64 p; 3947 3948 float32_unpack_raw(&p, a); 3949 if (parts_squash_denormal(p, status)) { 3950 return float32_set_sign(float32_zero, p.sign); 3951 } 3952 } 3953 return a; 3954 } 3955 3956 float64 float64_squash_input_denormal(float64 a, float_status *status) 3957 { 3958 if (status->flush_inputs_to_zero) { 3959 FloatParts64 p; 3960 3961 float64_unpack_raw(&p, a); 3962 if (parts_squash_denormal(p, status)) { 3963 return float64_set_sign(float64_zero, p.sign); 3964 } 3965 } 3966 return a; 3967 } 3968 3969 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3970 { 3971 if (status->flush_inputs_to_zero) { 3972 FloatParts64 p; 3973 3974 bfloat16_unpack_raw(&p, a); 3975 if (parts_squash_denormal(p, status)) { 3976 return bfloat16_set_sign(bfloat16_zero, p.sign); 3977 } 3978 } 3979 return a; 3980 } 3981 3982 /*---------------------------------------------------------------------------- 3983 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3984 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3985 | input. If `zSign' is 1, the input is negated before being converted to an 3986 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose the value added below the result's low bit (bit 7). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round away only when the result bit would otherwise be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Ties-to-even: clear the low bit when the discarded bits were
     * exactly one half.
     */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if high bits remain or the sign of z disagrees with
     * the requested sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* `increment' is whether the fraction word rounds absZ0 up. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of absZ1 set, i.e. fraction >= 1/2. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        if ( absZ0 == 0 ) goto overflow;
        /* Ties-to-even: an exact half rounds to the even neighbour. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrap-around of the 64-bit word means unsigned overflow. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A nonzero negative value cannot be represented as unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* 8 = 31 - 23: bring the most significant set bit up to bit 23. */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller.
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Value added below the result's low bit (bit 7) for each mode. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned compare catches both overflow (zExp >= 0xFD) and
     * underflow (negative zExp wraps to a large unsigned value).
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd and the modes that truncate toward the
             * result's sign overflow to the largest finite value
             * rather than infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Ties-to-even: clear the low bit on an exact half. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
 normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Leave one bit of headroom below bit 31 for rounding. */
    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* 11 = 63 - 52: bring the most significant set bit up to bit 52. */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Choose the value added below bit 10 (the eventual LSB): 0x200 (half
     * an ulp) for the to-nearest modes, 0x3ff (just under one ulp) to round
     * away from zero in the matching directed mode, 0 to truncate.  For
     * round-to-odd, increment only when the result's LSB (bit 10) is
     * currently clear. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    /* The 10 bits that will be shifted out; nonzero means inexact. */
    roundBits = zSig & 0x3FF;
    /* Unsigned compare catches both ends at once: a large zExp (overflow)
     * and a negative zExp, which wraps to a huge uint16_t (underflow). */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* -(!overflow_to_inf) is 0 (giving exp 0x7FF, sig 0 = infinity)
             * or all-ones; because packFloat64 adds its fields, the
             * all-ones significand turns 0x7FF into the largest finite
             * value 0x7FEFFFFFFFFFFFFF. */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: before rounding it is simply zExp < 0; after
             * rounding, also require that rounding does not carry the
             * significand up into bit 63 (which would make it normal). */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    /* Apply the increment and drop the 10 guard bits. */
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie (roundBits == 0x200) under nearest-even: clear the LSB. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Left-justify the leading 1 into bit 62, as roundAndPackFloat64
     * expects; compensate the exponent for the shift. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4462 *----------------------------------------------------------------------------*/ 4463 4464 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4465 uint64_t *zSigPtr) 4466 { 4467 int8_t shiftCount; 4468 4469 shiftCount = clz64(aSig); 4470 *zSigPtr = aSig<<shiftCount; 4471 *zExpPtr = 1 - shiftCount; 4472 } 4473 4474 /*---------------------------------------------------------------------------- 4475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4476 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4477 | and returns the proper extended double-precision floating-point value 4478 | corresponding to the abstract input. Ordinarily, the abstract value is 4479 | rounded and packed into the extended double-precision format, with the 4480 | inexact exception raised if the abstract input cannot be represented 4481 | exactly. However, if the abstract value is too large, the overflow and 4482 | inexact exceptions are raised and an infinity or maximal finite value is 4483 | returned. If the abstract value is too small, the input value is rounded to 4484 | a subnormal number, and the underflow and inexact exceptions are raised if 4485 | the abstract input cannot be represented exactly as a subnormal extended 4486 | double-precision floating-point number. 4487 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4488 | number of bits as single or double precision, respectively. Otherwise, the 4489 | result is rounded to the full precision of the extended double-precision 4490 | format. 4491 | The input significand must be normalized or smaller. If the input 4492 | significand is not normalized, `zExp' must be 0; in that case, the result 4493 | returned is a subnormal number, and it must not require rounding. The 4494 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4495 | Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Full 80-bit precision uses a separate code path below; for 64- and
     * 32-bit precision, set the half-ulp increment and the mask of bits
     * that fall below the reduced-precision LSB of zSig0. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Jam any low-half bits into bit 0 so they still affect rounding. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* Unsigned compare on zExp - 1 catches both zExp too large (overflow)
     * and zExp <= 0 (underflow) in one test. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* zSig0 + roundIncrement < zSig0 detects carry-out of the add. */
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* After-rounding tininess: tiny unless the increment carries
             * out of zSig0 (zSig0 <= zSig0 + roundIncrement is true when
             * no wrap occurred). */
            isTiny = status->tininess_before_rounding
                || (zExp < 0 )
                || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* If rounding carried into bit 63, the value became normal. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    /* Carry-out of the significand: renormalize to 1.0 x 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the LSB; decide whether
     * to increment zSig0 by one ulp. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* roundMask = 0 makes ~roundMask below the all-ones (largest
             * finite) significand; the reduced-precision paths jump here
             * with their own roundMask so the max-finite value matches
             * their precision. */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /* Tiny unless the increment would propagate all the way
             * through an all-ones zSig0 (making the value normal). */
            isTiny = status->tininess_before_rounding
                || (zExp < 0)
                || !increment
                || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the increment: zSig1 changed in the shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie (only bit 63 of zSig1 set): round to even. */
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry-out: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high half is empty, promote the low half (worth 2^-64). */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4730 *----------------------------------------------------------------------------*/ 4731 4732 static inline uint64_t extractFloat128Frac0( float128 a ) 4733 { 4734 4735 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4736 4737 } 4738 4739 /*---------------------------------------------------------------------------- 4740 | Returns the exponent bits of the quadruple-precision floating-point value 4741 | `a'. 4742 *----------------------------------------------------------------------------*/ 4743 4744 static inline int32_t extractFloat128Exp( float128 a ) 4745 { 4746 4747 return ( a.high>>48 ) & 0x7FFF; 4748 4749 } 4750 4751 /*---------------------------------------------------------------------------- 4752 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4753 *----------------------------------------------------------------------------*/ 4754 4755 static inline bool extractFloat128Sign(float128 a) 4756 { 4757 return a.high >> 63; 4758 } 4759 4760 /*---------------------------------------------------------------------------- 4761 | Normalizes the subnormal quadruple-precision floating-point value 4762 | represented by the denormalized significand formed by the concatenation of 4763 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4764 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4765 | significand are stored at the location pointed to by `zSig0Ptr', and the 4766 | least significant 64 bits of the normalized significand are stored at the 4767 | location pointed to by `zSig1Ptr'. 
 *----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if ( aSig0 == 0 ) {
        /* High half empty: the leading 1 is somewhere in aSig1.  Aim it
         * at bit 48 of the high word (shiftCount relative to aSig1). */
        shiftCount = clz64(aSig1) - 15;
        if ( shiftCount < 0 ) {
            /* Leading 1 is above bit 48 within aSig1: split aSig1 across
             * both halves.  `shiftCount & 63' is 64 + shiftCount here,
             * giving the left-shift that keeps the bits below the split
             * (and avoids an out-of-range shift count). */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        /* Exponent accounts for the 64-bit half promotion as well. */
        *zExpPtr = - shiftCount - 63;
    }
    else {
        /* Shift the leading 1 of the 128-bit significand up to bit 48 of
         * the high word, starting from the minimum exponent 1. */
        shiftCount = clz64(aSig0) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', the exponent `zExp', and the significand formed
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
| floating-point value, returning the result.  After being shifted into the
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
| added together to form the most significant 32 bits of the result.  This
| means that any integer portion of `zSig0' will be added into the exponent.
| Since a properly normalized significand will have an integer portion equal
| to 1, the `zExp' input should be 1 less than the desired result exponent
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
| significand.
4812 *----------------------------------------------------------------------------*/ 4813 4814 static inline float128 4815 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4816 { 4817 float128 z; 4818 4819 z.low = zSig1; 4820 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4821 return z; 4822 } 4823 4824 /*---------------------------------------------------------------------------- 4825 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4826 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4827 | and `zSig2', and returns the proper quadruple-precision floating-point value 4828 | corresponding to the abstract input. Ordinarily, the abstract value is 4829 | simply rounded and packed into the quadruple-precision format, with the 4830 | inexact exception raised if the abstract input cannot be represented 4831 | exactly. However, if the abstract value is too large, the overflow and 4832 | inexact exceptions are raised and an infinity or maximal finite value is 4833 | returned. If the abstract value is too small, the input value is rounded to 4834 | a subnormal number, and the underflow and inexact exceptions are raised if 4835 | the abstract input cannot be represented exactly as a subnormal quadruple- 4836 | precision floating-point number. 4837 | The input significand must be normalized or smaller. If the input 4838 | significand is not normalized, `zExp' must be 0; in that case, the result 4839 | returned is a subnormal number, and it must not require rounding. In the 4840 | usual case that the input significand is normalized, `zExp' must be 1 less 4841 | than the ``true'' floating-point exponent. The handling of underflow and 4842 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the 128-bit significand's LSB; decide
     * whether to add one ulp to zSig0:zSig1.  For nearest modes the test
     * is on zSig2's top bit (>= half an ulp); round-to-odd increments
     * only when zSig1's LSB is clear. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare catches both overflow (large zExp) and underflow
     * (negative zExp wraps to a huge uint32_t). */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow if the exponent is too large, or maximal and the
         * all-ones significand is about to be incremented. */
        if ( ( 0x7FFD < zExp )
             || ( ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that must not produce infinity return the largest
             * finite quad value instead. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* After-rounding tininess: tiny unless an increment of the
             * all-ones significand would carry up to a normal value. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the increment: zSig1/zSig2 changed just above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (zSig2 == 0x80...0, so doubling it is 0) under
         * nearest-even: clear the result's LSB. */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 4961 | to the abstract input. This routine is just like `roundAndPackFloat128' 4962 | except that the input significand has fewer bits and does not have to be 4963 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4964 | point exponent. 4965 *----------------------------------------------------------------------------*/ 4966 4967 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4968 uint64_t zSig0, uint64_t zSig1, 4969 float_status *status) 4970 { 4971 int8_t shiftCount; 4972 uint64_t zSig2; 4973 4974 if ( zSig0 == 0 ) { 4975 zSig0 = zSig1; 4976 zSig1 = 0; 4977 zExp -= 64; 4978 } 4979 shiftCount = clz64(zSig0) - 15; 4980 if ( 0 <= shiftCount ) { 4981 zSig2 = 0; 4982 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4983 } 4984 else { 4985 shift128ExtraRightJamming( 4986 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4987 } 4988 zExp -= shiftCount; 4989 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4990 4991 } 4992 4993 4994 /*---------------------------------------------------------------------------- 4995 | Returns the result of converting the 32-bit two's complement integer `a' 4996 | to the extended double-precision floating-point format. The conversion 4997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4998 | Arithmetic. 4999 *----------------------------------------------------------------------------*/ 5000 5001 floatx80 int32_to_floatx80(int32_t a, float_status *status) 5002 { 5003 bool zSign; 5004 uint32_t absA; 5005 int8_t shiftCount; 5006 uint64_t zSig; 5007 5008 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5009 zSign = ( a < 0 ); 5010 absA = zSign ? 
- a : a; 5011 shiftCount = clz32(absA) + 32; 5012 zSig = absA; 5013 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 5014 5015 } 5016 5017 /*---------------------------------------------------------------------------- 5018 | Returns the result of converting the 32-bit two's complement integer `a' to 5019 | the quadruple-precision floating-point format. The conversion is performed 5020 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5021 *----------------------------------------------------------------------------*/ 5022 5023 float128 int32_to_float128(int32_t a, float_status *status) 5024 { 5025 bool zSign; 5026 uint32_t absA; 5027 int8_t shiftCount; 5028 uint64_t zSig0; 5029 5030 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5031 zSign = ( a < 0 ); 5032 absA = zSign ? - a : a; 5033 shiftCount = clz32(absA) + 17; 5034 zSig0 = absA; 5035 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5036 5037 } 5038 5039 /*---------------------------------------------------------------------------- 5040 | Returns the result of converting the 64-bit two's complement integer `a' 5041 | to the extended double-precision floating-point format. The conversion 5042 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5043 | Arithmetic. 5044 *----------------------------------------------------------------------------*/ 5045 5046 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5047 { 5048 bool zSign; 5049 uint64_t absA; 5050 int8_t shiftCount; 5051 5052 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5053 zSign = ( a < 0 ); 5054 absA = zSign ? 
- a : a; 5055 shiftCount = clz64(absA); 5056 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5057 5058 } 5059 5060 /*---------------------------------------------------------------------------- 5061 | Returns the result of converting the 64-bit two's complement integer `a' to 5062 | the quadruple-precision floating-point format. The conversion is performed 5063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5064 *----------------------------------------------------------------------------*/ 5065 5066 float128 int64_to_float128(int64_t a, float_status *status) 5067 { 5068 bool zSign; 5069 uint64_t absA; 5070 int8_t shiftCount; 5071 int32_t zExp; 5072 uint64_t zSig0, zSig1; 5073 5074 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5075 zSign = ( a < 0 ); 5076 absA = zSign ? - a : a; 5077 shiftCount = clz64(absA) + 49; 5078 zExp = 0x406E - shiftCount; 5079 if ( 64 <= shiftCount ) { 5080 zSig1 = 0; 5081 zSig0 = absA; 5082 shiftCount -= 64; 5083 } 5084 else { 5085 zSig1 = absA; 5086 zSig0 = 0; 5087 } 5088 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5089 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5090 5091 } 5092 5093 /*---------------------------------------------------------------------------- 5094 | Returns the result of converting the 64-bit unsigned integer `a' 5095 | to the quadruple-precision floating-point format. The conversion is performed 5096 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* 0x406E is the biased exponent matching a significand whose low
     * 64-bit half holds the integer; normalization and (trivially exact)
     * rounding are delegated. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        /* NaN: convert through the common-NaN form and silence any
         * signaling NaN; otherwise infinity. */
        if (aSig) {
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        /* Zero or subnormal; subnormals are normalized so the implicit
         * bit can be made explicit below. */
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the implicit integer bit explicit (bit 23), rebias from single
     * (127) to x80 (16383) via +0x3F80, and left-justify into 64 bits. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5148 *----------------------------------------------------------------------------*/ 5149 5150 float128 float32_to_float128(float32 a, float_status *status) 5151 { 5152 bool aSign; 5153 int aExp; 5154 uint32_t aSig; 5155 5156 a = float32_squash_input_denormal(a, status); 5157 aSig = extractFloat32Frac( a ); 5158 aExp = extractFloat32Exp( a ); 5159 aSign = extractFloat32Sign( a ); 5160 if ( aExp == 0xFF ) { 5161 if (aSig) { 5162 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 5163 } 5164 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5165 } 5166 if ( aExp == 0 ) { 5167 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5168 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5169 --aExp; 5170 } 5171 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 5172 5173 } 5174 5175 /*---------------------------------------------------------------------------- 5176 | Returns the remainder of the single-precision floating-point value `a' 5177 | with respect to the corresponding value `b'. The operation is performed 5178 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* Special cases: NaN operands propagate; rem(inf, x) and rem(x, 0)
     * are invalid and return the default NaN; rem(x, inf) is x. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bits explicit. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one 32-bit division step suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: a is already the remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: peel off 62 quotient bits per iteration
         * using the 128/64 division estimate (which may overestimate by
         * up to 2, hence the q64 - 2 correction). */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* The quotient estimate may still be slightly low: step q up until
     * the partial remainder goes negative. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Choose between the last non-negative remainder and the negative
     * one so the final remainder is the one nearest zero, with ties going
     * to even q (IEEE remainder rounds the quotient to nearest even). */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor coefficients 1/n! for n = 1..15, as float64 constants. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    /* 2^NaN propagates; 2^-Inf = 0, 2^+Inf = +Inf. */
    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return (aSign) ? float32_zero : a;
    }
    /* 2^0 = 1 exactly (no inexact flag). */
    if (aExp == 0) {
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    /* 2^a = e^(a*ln2); evaluate the exponential series in double
     * precision so the final float32 result is accurate. */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);

    /* r accumulates 1 + sum(x^n / n!); xn tracks x^n. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    /* log2(+0) = -Inf; normalize subnormals before the sign check so
     * that -0 also takes the -Inf path rather than the invalid path. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* log2 of a negative number is invalid. */
    if ( aSign ) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* log2(NaN) propagates; log2(+Inf) = +Inf. */
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;
    }

    /* Integer part of the log is the unbiased exponent; the loop below
     * extracts fraction bits one at a time by repeated squaring. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format. The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* The conversion is exact, so silence a signaling NaN. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias the exponent (0x3FFF - 0x3FF = 0x3C00) and left-justify the
     * 53-bit significand (with explicit integer bit) into 64 bits. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format. The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        /* normalizeFloat64Subnormal leaves the integer bit in place;
         * compensate for the extra implicit-bit position. */
        --aExp;
    }
    /* Spread the 52-bit fraction across the 112-bit float128 fraction
     * (the conversion is always exact). */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}


/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5486 *----------------------------------------------------------------------------*/ 5487 5488 float64 float64_rem(float64 a, float64 b, float_status *status) 5489 { 5490 bool aSign, zSign; 5491 int aExp, bExp, expDiff; 5492 uint64_t aSig, bSig; 5493 uint64_t q, alternateASig; 5494 int64_t sigMean; 5495 5496 a = float64_squash_input_denormal(a, status); 5497 b = float64_squash_input_denormal(b, status); 5498 aSig = extractFloat64Frac( a ); 5499 aExp = extractFloat64Exp( a ); 5500 aSign = extractFloat64Sign( a ); 5501 bSig = extractFloat64Frac( b ); 5502 bExp = extractFloat64Exp( b ); 5503 if ( aExp == 0x7FF ) { 5504 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 5505 return propagateFloat64NaN(a, b, status); 5506 } 5507 float_raise(float_flag_invalid, status); 5508 return float64_default_nan(status); 5509 } 5510 if ( bExp == 0x7FF ) { 5511 if (bSig) { 5512 return propagateFloat64NaN(a, b, status); 5513 } 5514 return a; 5515 } 5516 if ( bExp == 0 ) { 5517 if ( bSig == 0 ) { 5518 float_raise(float_flag_invalid, status); 5519 return float64_default_nan(status); 5520 } 5521 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 5522 } 5523 if ( aExp == 0 ) { 5524 if ( aSig == 0 ) return a; 5525 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5526 } 5527 expDiff = aExp - bExp; 5528 aSig = (aSig | UINT64_C(0x0010000000000000)) << 11; 5529 bSig = (bSig | UINT64_C(0x0010000000000000)) << 11; 5530 if ( expDiff < 0 ) { 5531 if ( expDiff < -1 ) return a; 5532 aSig >>= 1; 5533 } 5534 q = ( bSig <= aSig ); 5535 if ( q ) aSig -= bSig; 5536 expDiff -= 64; 5537 while ( 0 < expDiff ) { 5538 q = estimateDiv128To64( aSig, 0, bSig ); 5539 q = ( 2 < q ) ? q - 2 : 0; 5540 aSig = - ( ( bSig>>2 ) * q ); 5541 expDiff -= 62; 5542 } 5543 expDiff += 64; 5544 if ( 0 < expDiff ) { 5545 q = estimateDiv128To64( aSig, 0, bSig ); 5546 q = ( 2 < q ) ? 
q - 2 : 0; 5547 q >>= 64 - expDiff; 5548 bSig >>= 2; 5549 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 5550 } 5551 else { 5552 aSig >>= 2; 5553 bSig >>= 2; 5554 } 5555 do { 5556 alternateASig = aSig; 5557 ++q; 5558 aSig -= bSig; 5559 } while ( 0 <= (int64_t) aSig ); 5560 sigMean = aSig + alternateASig; 5561 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 5562 aSig = alternateASig; 5563 } 5564 zSign = ( (int64_t) aSig < 0 ); 5565 if ( zSign ) aSig = - aSig; 5566 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 5567 5568 } 5569 5570 /*---------------------------------------------------------------------------- 5571 | Returns the binary log of the double-precision floating-point value `a'. 5572 | The operation is performed according to the IEC/IEEE Standard for Binary 5573 | Floating-Point Arithmetic. 5574 *----------------------------------------------------------------------------*/ 5575 float64 float64_log2(float64 a, float_status *status) 5576 { 5577 bool aSign, zSign; 5578 int aExp; 5579 uint64_t aSig, aSig0, aSig1, zSig, i; 5580 a = float64_squash_input_denormal(a, status); 5581 5582 aSig = extractFloat64Frac( a ); 5583 aExp = extractFloat64Exp( a ); 5584 aSign = extractFloat64Sign( a ); 5585 5586 if ( aExp == 0 ) { 5587 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 5588 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5589 } 5590 if ( aSign ) { 5591 float_raise(float_flag_invalid, status); 5592 return float64_default_nan(status); 5593 } 5594 if ( aExp == 0x7FF ) { 5595 if (aSig) { 5596 return propagateFloat64NaN(a, float64_zero, status); 5597 } 5598 return a; 5599 } 5600 5601 aExp -= 0x3FF; 5602 aSig |= UINT64_C(0x0010000000000000); 5603 zSign = aExp < 0; 5604 zSig = (uint64_t)aExp << 52; 5605 for (i = 1LL << 51; i > 0; i >>= 1) { 5606 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 5607 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 5608 if ( aSig & UINT64_C(0x0020000000000000) ) { 5609 aSig >>= 1; 5610 zSig |= i; 5611 } 
5612 } 5613 5614 if ( zSign ) 5615 zSig = -zSig; 5616 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5617 } 5618 5619 /*---------------------------------------------------------------------------- 5620 | Returns the result of converting the extended double-precision floating- 5621 | point value `a' to the 32-bit two's complement integer format. The 5622 | conversion is performed according to the IEC/IEEE Standard for Binary 5623 | Floating-Point Arithmetic---which means in particular that the conversion 5624 | is rounded according to the current rounding mode. If `a' is a NaN, the 5625 | largest positive integer is returned. Otherwise, if the conversion 5626 | overflows, the largest integer with the same sign as `a' is returned. 5627 *----------------------------------------------------------------------------*/ 5628 5629 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5630 { 5631 bool aSign; 5632 int32_t aExp, shiftCount; 5633 uint64_t aSig; 5634 5635 if (floatx80_invalid_encoding(a)) { 5636 float_raise(float_flag_invalid, status); 5637 return 1 << 31; 5638 } 5639 aSig = extractFloatx80Frac( a ); 5640 aExp = extractFloatx80Exp( a ); 5641 aSign = extractFloatx80Sign( a ); 5642 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5643 shiftCount = 0x4037 - aExp; 5644 if ( shiftCount <= 0 ) shiftCount = 1; 5645 shift64RightJamming( aSig, shiftCount, &aSig ); 5646 return roundAndPackInt32(aSign, aSig, status); 5647 5648 } 5649 5650 /*---------------------------------------------------------------------------- 5651 | Returns the result of converting the extended double-precision floating- 5652 | point value `a' to the 32-bit two's complement integer format. The 5653 | conversion is performed according to the IEC/IEEE Standard for Binary 5654 | Floating-Point Arithmetic, except that the conversion is always rounded 5655 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5656 | Otherwise, if the conversion overflows, the largest integer with the same 5657 | sign as `a' is returned. 5658 *----------------------------------------------------------------------------*/ 5659 5660 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5661 { 5662 bool aSign; 5663 int32_t aExp, shiftCount; 5664 uint64_t aSig, savedASig; 5665 int32_t z; 5666 5667 if (floatx80_invalid_encoding(a)) { 5668 float_raise(float_flag_invalid, status); 5669 return 1 << 31; 5670 } 5671 aSig = extractFloatx80Frac( a ); 5672 aExp = extractFloatx80Exp( a ); 5673 aSign = extractFloatx80Sign( a ); 5674 if ( 0x401E < aExp ) { 5675 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5676 goto invalid; 5677 } 5678 else if ( aExp < 0x3FFF ) { 5679 if (aExp || aSig) { 5680 float_raise(float_flag_inexact, status); 5681 } 5682 return 0; 5683 } 5684 shiftCount = 0x403E - aExp; 5685 savedASig = aSig; 5686 aSig >>= shiftCount; 5687 z = aSig; 5688 if ( aSign ) z = - z; 5689 if ( ( z < 0 ) ^ aSign ) { 5690 invalid: 5691 float_raise(float_flag_invalid, status); 5692 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5693 } 5694 if ( ( aSig<<shiftCount ) != savedASig ) { 5695 float_raise(float_flag_inexact, status); 5696 } 5697 return z; 5698 5699 } 5700 5701 /*---------------------------------------------------------------------------- 5702 | Returns the result of converting the extended double-precision floating- 5703 | point value `a' to the 64-bit two's complement integer format. The 5704 | conversion is performed according to the IEC/IEEE Standard for Binary 5705 | Floating-Point Arithmetic---which means in particular that the conversion 5706 | is rounded according to the current rounding mode. If `a' is a NaN, 5707 | the largest positive integer is returned. Otherwise, if the conversion 5708 | overflows, the largest integer with the same sign as `a' is returned. 
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig, aSigExtra;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount <= 0 ) {
        /* Exponent at or above 2^63: overflow unless the value is
         * exactly -2^63 (shiftCount == 0 with the sign handled by
         * roundAndPackInt64 below). */
        if ( shiftCount ) {
            float_raise(float_flag_invalid, status);
            if (!aSign || floatx80_is_any_nan(a)) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        aSigExtra = 0;
    }
    else {
        /* Keep the shifted-out bits in aSigExtra so the rounding step
         * can honour the current rounding mode. */
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 64-bit two's complement integer format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero. If `a' is a NaN, the largest positive integer is returned.
| Otherwise, if the conversion overflows, the largest integer with the same
| sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;
    int64_t z;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return 1ULL << 63;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = aExp - 0x403E;
    if ( 0 <= shiftCount ) {
        /* Magnitude is at least 2^63: only -2^63 itself (sign set,
         * exponent 0x403E, fraction 0) converts without overflow. */
        aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
        if ( ( a.high != 0xC03E ) || aSig ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
                return INT64_MAX;
            }
        }
        return INT64_MIN;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; flag inexact unless a is exactly 0. */
        if (aExp | aSig) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    z = aSig>>( - shiftCount );
    /* Any bits shifted out mean the truncation was inexact. */
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
        float_raise(float_flag_inexact, status);
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the single-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN payload survives the narrowing; silence any sNaN. */
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit significand to float32 width, jamming the
     * discarded bits so rounding still sees them. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN payload survives the narrowing; silence any sNaN. */
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Drop the explicit integer bit, jamming it into the sticky bit
     * for rounding. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format. The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128(floatx80 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
        /* The widening conversion is exact, so silence any sNaN. */
        float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
                                           status);
        return float128_silence_nan(res, status);
    }
    /* Drop the explicit integer bit and spread the fraction across the
     * 112-bit float128 fraction (always exact). */
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a'
| to the precision provided by floatx80_rounding_precision and returns the
| result as an extended double-precision floating-point value.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_round(floatx80 a, float_status *status)
{
    /* Re-round in place: unpack and repack through the standard
     * rounding path at the currently selected working precision. */
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                extractFloatx80Sign(a),
                                extractFloatx80Exp(a),
                                extractFloatx80Frac(a), 0, status);
}

/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
5911 *----------------------------------------------------------------------------*/ 5912 5913 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5914 { 5915 bool aSign; 5916 int32_t aExp; 5917 uint64_t lastBitMask, roundBitsMask; 5918 floatx80 z; 5919 5920 if (floatx80_invalid_encoding(a)) { 5921 float_raise(float_flag_invalid, status); 5922 return floatx80_default_nan(status); 5923 } 5924 aExp = extractFloatx80Exp( a ); 5925 if ( 0x403E <= aExp ) { 5926 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5927 return propagateFloatx80NaN(a, a, status); 5928 } 5929 return a; 5930 } 5931 if ( aExp < 0x3FFF ) { 5932 if ( ( aExp == 0 ) 5933 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5934 return a; 5935 } 5936 float_raise(float_flag_inexact, status); 5937 aSign = extractFloatx80Sign( a ); 5938 switch (status->float_rounding_mode) { 5939 case float_round_nearest_even: 5940 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5941 ) { 5942 return 5943 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5944 } 5945 break; 5946 case float_round_ties_away: 5947 if (aExp == 0x3FFE) { 5948 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5949 } 5950 break; 5951 case float_round_down: 5952 return 5953 aSign ? 5954 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5955 : packFloatx80( 0, 0, 0 ); 5956 case float_round_up: 5957 return 5958 aSign ? 
packFloatx80( 1, 0, 0 ) 5959 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5960 5961 case float_round_to_zero: 5962 break; 5963 default: 5964 g_assert_not_reached(); 5965 } 5966 return packFloatx80( aSign, 0, 0 ); 5967 } 5968 lastBitMask = 1; 5969 lastBitMask <<= 0x403E - aExp; 5970 roundBitsMask = lastBitMask - 1; 5971 z = a; 5972 switch (status->float_rounding_mode) { 5973 case float_round_nearest_even: 5974 z.low += lastBitMask>>1; 5975 if ((z.low & roundBitsMask) == 0) { 5976 z.low &= ~lastBitMask; 5977 } 5978 break; 5979 case float_round_ties_away: 5980 z.low += lastBitMask >> 1; 5981 break; 5982 case float_round_to_zero: 5983 break; 5984 case float_round_up: 5985 if (!extractFloatx80Sign(z)) { 5986 z.low += roundBitsMask; 5987 } 5988 break; 5989 case float_round_down: 5990 if (extractFloatx80Sign(z)) { 5991 z.low += roundBitsMask; 5992 } 5993 break; 5994 default: 5995 abort(); 5996 } 5997 z.low &= ~ roundBitsMask; 5998 if ( z.low == 0 ) { 5999 ++z.high; 6000 z.low = UINT64_C(0x8000000000000000); 6001 } 6002 if (z.low != a.low) { 6003 float_raise(float_flag_inexact, status); 6004 } 6005 return z; 6006 6007 } 6008 6009 /*---------------------------------------------------------------------------- 6010 | Returns the result of adding the absolute values of the extended double- 6011 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 6012 | negated before being returned. `zSign' is ignored if the result is a NaN. 6013 | The addition is performed according to the IEC/IEEE Standard for Binary 6014 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        /* Subnormal b has an effective exponent of 1, not 0. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum always carries unless subnormal. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* If the top bit is set there was no carry; go round directly. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the addition: shift right one (jamming the lost
     * bit) and restore the explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents: Inf - Inf is invalid; otherwise compare the
     * significands to find which operand is larger in magnitude. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    /* Treat subnormals as having an effective exponent of 1. */
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero's sign depends on the rounding mode
     * (negative zero only when rounding toward minus infinity). */
    return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80(zSign ^ 1, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* The larger magnitude was b, so the result's sign flips. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         zSign, zExp, zSig0, zSig1, status);
}

/*---------------------------------------------------------------------------- 6159 | Returns the result of adding the extended double-precision floating-point 6160 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6161 | Standard for Binary Floating-Point Arithmetic. 6162 *----------------------------------------------------------------------------*/ 6163 6164 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6165 { 6166 bool aSign, bSign; 6167 6168 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6169 float_raise(float_flag_invalid, status); 6170 return floatx80_default_nan(status); 6171 } 6172 aSign = extractFloatx80Sign( a ); 6173 bSign = extractFloatx80Sign( b ); 6174 if ( aSign == bSign ) { 6175 return addFloatx80Sigs(a, b, aSign, status); 6176 } 6177 else { 6178 return subFloatx80Sigs(a, b, aSign, status); 6179 } 6180 6181 } 6182 6183 /*---------------------------------------------------------------------------- 6184 | Returns the result of subtracting the extended double-precision floating- 6185 | point values `a' and `b'. The operation is performed according to the 6186 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Same signs: effective subtraction of magnitudes; opposite signs:
     * effective addition of magnitudes. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'. The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid; Inf * finite is Inf. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * Inf is invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /* 64x64 -> 128-bit significand product; renormalize by one bit if
     * the product of two [1,2) significands fell in [1,2) not [2,4). */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* a is NaN or Inf: Inf/Inf is invalid; Inf/x is Inf. */
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* b is NaN or Inf: finite/Inf is zero. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80( zSign, 0, 0 );
    }
    /* b zero or subnormal: 0/0 is invalid, x/0 raises divide-by-zero. */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    /* Pre-scale the dividend so the estimated quotient cannot overflow. */
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct downward while the
       remainder is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word; the exact correction is only needed when the
       estimate is close enough to affect rounding. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Jam a sticky bit if any remainder is left. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    /* aExpOrig keeps the raw exponent so pseudo-denormals can be detected
       after aExp is adjusted below. */
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    /* a is NaN or Inf: NaNs propagate; rem(Inf, x) is invalid. */
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    /* b is NaN or Inf: rem(x, Inf) is x. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    /* rem(x, 0) is invalid; subnormal divisors are normalized first. */
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder (mod always; rem when
           the quotient would round to 0). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    /* Produce quotient bits 62 at a time until fewer than 64 remain.
       The estimate is lowered by 2 so it can never exceed the true
       quotient; the final correction happens in the block below. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    /* Last partial step: correct the estimate upward until the remainder
       is smaller than the (shifted) divisor. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: round the quotient to nearest-even by picking
           the smaller of remainder and divisor-minus-remainder (ties go
           to the even quotient). */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6490 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6491 *----------------------------------------------------------------------------*/ 6492 6493 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6494 { 6495 uint64_t quotient; 6496 return floatx80_modrem(a, b, false, "ient, status); 6497 } 6498 6499 /*---------------------------------------------------------------------------- 6500 | Returns the remainder of the extended double-precision floating-point value 6501 | `a' with respect to the corresponding value `b', with the quotient truncated 6502 | toward zero. 6503 *----------------------------------------------------------------------------*/ 6504 6505 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6506 { 6507 uint64_t quotient; 6508 return floatx80_modrem(a, b, true, "ient, status); 6509 } 6510 6511 /*---------------------------------------------------------------------------- 6512 | Returns the square root of the extended double-precision floating-point 6513 | value `a'. The operation is performed according to the IEC/IEEE Standard 6514 | for Binary Floating-Point Arithmetic. 6515 *----------------------------------------------------------------------------*/ 6516 6517 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6518 { 6519 bool aSign; 6520 int32_t aExp, zExp; 6521 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6522 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6523 6524 if (floatx80_invalid_encoding(a)) { 6525 float_raise(float_flag_invalid, status); 6526 return floatx80_default_nan(status); 6527 } 6528 aSig0 = extractFloatx80Frac( a ); 6529 aExp = extractFloatx80Exp( a ); 6530 aSign = extractFloatx80Sign( a ); 6531 if ( aExp == 0x7FFF ) { 6532 if ((uint64_t)(aSig0 << 1)) { 6533 return propagateFloatx80NaN(a, a, status); 6534 } 6535 if ( ! 
aSign ) return a; 6536 goto invalid; 6537 } 6538 if ( aSign ) { 6539 if ( ( aExp | aSig0 ) == 0 ) return a; 6540 invalid: 6541 float_raise(float_flag_invalid, status); 6542 return floatx80_default_nan(status); 6543 } 6544 if ( aExp == 0 ) { 6545 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6546 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6547 } 6548 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6549 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6550 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6551 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6552 doubleZSig0 = zSig0<<1; 6553 mul64To128( zSig0, zSig0, &term0, &term1 ); 6554 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6555 while ( (int64_t) rem0 < 0 ) { 6556 --zSig0; 6557 doubleZSig0 -= 2; 6558 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6559 } 6560 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6561 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6562 if ( zSig1 == 0 ) zSig1 = 1; 6563 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6564 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6565 mul64To128( zSig1, zSig1, &term2, &term3 ); 6566 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6567 while ( (int64_t) rem1 < 0 ) { 6568 --zSig1; 6569 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6570 term3 |= 1; 6571 term2 |= doubleZSig0; 6572 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6573 } 6574 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6575 } 6576 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6577 zSig0 |= doubleZSig0; 6578 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6579 0, zExp, zSig0, zSig1, status); 6580 } 6581 6582 /*---------------------------------------------------------------------------- 6583 | Returns the result of converting the quadruple-precision floating-point 6584 | value `a' to the 32-bit two's complement integer format. 
  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* NaNs are treated as positive so the largest positive int results. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Low fraction word only matters as a sticky (inexact) bit. */
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31: overflow (NaN counts as positive). */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* |a| < 1 truncates to 0; nonzero values are inexact. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* Sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out make the conversion inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Make the implicit integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        /* Magnitude >= 2^63: overflow, except exactly INT64_MIN. */
        if ( 0x403E < aExp ) {
            float_raise(float_flag_invalid, status);
            if ( ! aSign
                 || ( ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        /* Shift right, keeping extra bits for rounding. */
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Exactly -2^63 (possibly inexact) is representable ... */
            if ( ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            /* ... anything else of this magnitude overflows. */
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1 truncates to 0; nonzero values are inexact. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if ( aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Negative values <= -1 (and negative NaNs) cannot convert. */
    if (aSign && (aExp > 0x3FFE)) {
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        /* Magnitude >= 2^64: unsigned overflow. */
        if (0x403E < aExp) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

/* Toward-zero variant: temporarily forces the rounding mode, then
   restores the caller's mode. */
uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6827 | Returns the result of converting the quadruple-precision floating-point 6828 | value `a' to the 32-bit unsigned integer format. The conversion 6829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6830 | Arithmetic except that the conversion is always rounded toward zero. 6831 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6832 | if the conversion overflows, the largest unsigned integer is returned. 6833 | If 'a' is negative, the value is rounded and zero is returned; negative 6834 | values that do not round to zero will raise the inexact exception. 6835 *----------------------------------------------------------------------------*/ 6836 6837 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6838 { 6839 uint64_t v; 6840 uint32_t res; 6841 int old_exc_flags = get_float_exception_flags(status); 6842 6843 v = float128_to_uint64_round_to_zero(a, status); 6844 if (v > 0xffffffff) { 6845 res = 0xffffffff; 6846 } else { 6847 return v; 6848 } 6849 set_float_exception_flags(old_exc_flags, status); 6850 float_raise(float_flag_invalid, status); 6851 return res; 6852 } 6853 6854 /*---------------------------------------------------------------------------- 6855 | Returns the result of converting the quadruple-precision floating-point value 6856 | `a' to the 32-bit unsigned integer format. The conversion is 6857 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6858 | Arithmetic---which means in particular that the conversion is rounded 6859 | according to the current rounding mode. If `a' is a NaN, the largest 6860 | positive integer is returned. If the conversion overflows, the 6861 | largest unsigned integer is returned. If 'a' is negative, the value is 6862 | rounded and zero is returned; negative values that do not round to zero 6863 | will raise the inexact exception. 
*----------------------------------------------------------------------------*/

uint32_t float128_to_uint32(float128 a, float_status *status)
{
    uint64_t v;
    uint32_t res;
    int old_exc_flags = get_float_exception_flags(status);

    /* Convert via the 64-bit path, then range-check the result. */
    v = float128_to_uint64(a, status);
    if (v > 0xffffffff) {
        res = 0xffffffff;
    } else {
        return v;
    }
    /* Out of 32-bit range: replace the wide conversion's flags with
       just the invalid-operation flag. */
    set_float_exception_flags(old_exc_flags, status);
    float_raise(float_flag_invalid, status);
    return res;
}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the single-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float32 float128_to_float32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;
    uint32_t zSig;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Collapse the low fraction word to a sticky bit, then narrow the
       significand to single-precision width with jamming. */
    aSig0 |= ( aSig1 != 0 );
    shift64RightJamming( aSig0, 18, &aSig0 );
    zSig = aSig0;
    if ( aExp || zSig ) {
        /* Restore the leading significand bit and rebias the exponent. */
        zSig |= 0x40000000;
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the double-precision floating-point format.
  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the significand for double precision; the remaining low
       bits are collapsed to a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        /* Restore the leading significand bit and rebias the exponent. */
        aSig0 |= UINT64_C(0x4000000000000000);
        aExp -= 0x3C01;
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Narrowing a signaling NaN produces a quiet NaN. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the implicit integer bit explicit (floatx80 stores it). */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* All fraction bits subject to rounding live in the low word. */
        if ( 0x406F <= aExp ) {
            /* Already integral (or Inf/NaN): only NaNs need handling. */
            if ( ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the lowest integer-valued bit; the bits
           below it (roundBitsMask) are the fraction to be rounded away. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* lastBitMask == 0: the unit bit is bit 0 of z.high. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: result is 0 or +/-1 depending on rounding mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                /* Exactly 0.5 rounds to 0; above 0.5 rounds to 1. */
                if ( ( aExp == 0x3FFE )
                     && ( extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                    aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                    aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Fraction bits span the high word; low word is discarded. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high & lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Inexact iff any bit actually changed. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a's. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Subnormal b: effective exponent is 1, not 0. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b's. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both subnormal (or zero): the sum is exact. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    /* Carry out of the significand: shift right one, jamming into zSig2. */
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left by 14 for extra precision; the final
       exponent is compensated by the `zExp - 14' below. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is an invalid operation. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Treat subnormals as having exponent 1 (no implicit bit). */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero is negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        /* Implicit integer bit, at bit 62 after the 14-bit pre-shift. */
        aSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The `- 14' undoes the pre-shift applied on entry. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_add(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign == bSign ) {
        /* Same sign: magnitudes add. */
        return addFloat128Sigs(a, b, aSign, status);
    }
    else {
        /* Opposite signs: magnitudes subtract. */
        return subFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign == bSign ) {
        /* Same sign: magnitudes subtract. */
        return subFloat128Sigs(a, b, aSign, status);
    }
    else {
        /* Opposite signs: magnitudes add. */
        return addFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf * 0 is an invalid operation. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* b's significand is shifted left 16 WITHOUT its implicit integer bit;
       the add128 after the multiply supplies the missing aSig * 1 term. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Jam the lowest 64 product bits into the sticky word. */
    zSig2 |= ( zSig3 != 0 );
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product in [2,4): shift right one and bump the exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf is an invalid operation. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf yields a signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* 0 / 0 is invalid; any other x / 0 is a division by zero. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Place both significands (with implicit bit) at the top of 128 bits. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Keep the quotient below 1 so the estimate cannot overflow. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Estimate the high 64 quotient bits, then correct by checking the
       remainder sign (the estimate can only be too high). */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /* Only refine the low quotient bits (and compute the sticky bit) when
       the estimate is close enough to a rounding boundary to matter. */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; rem(Inf, x) is invalid. */
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* rem(a, Inf) is a, exactly. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(a, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| < |b|/2: the remainder is a itself. */
    if ( expDiff < -1 ) return a;
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit, then peel off 61 quotient bits per iteration
       until fewer than 64 exponent-difference bits remain. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        /* Bias the estimate low so the partial remainder stays positive. */
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, keeping the last
       non-negative value as the alternate candidate. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* The sum of the two candidates decides which is nearer; an exact tie
       (sum == 0) goes to the even quotient, per round-to-nearest-even. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        /* sqrt(+Inf) = +Inf; sqrt(-Inf) is invalid. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) = -0; sqrt of any other negative value is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial 32-bit root estimate, refined to 64 bits by one Newton-like
       division step; the odd/even exponent adjusts the operand alignment. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word by checking the remainder a - z*z. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Refine the low root word (and sticky bit) only near a boundary. */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b'.  If `is_quiet'
| is true, only signaling NaNs (and invalid encodings) raise the invalid
| flag; otherwise any NaN does.
*----------------------------------------------------------------------------*/

static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* Either operand a NaN (max exponent, non-zero fraction ignoring the
       explicit integer bit)? */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        /* Both zero (ignoring the sign bits) compare equal. */
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Negative a is less (-1), positive a is greater (+1). */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals (exponent 0 but explicit integer bit
           set) before comparison by bumping the exponent to 1.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* For same-sign values, a magnitude compare decides; the XOR
               with the sign flips the ordering for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    /* Signaling compare: any NaN raises invalid. */
    return floatx80_compare_internal(a, b, 0, status);
}

FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    /* Quiet compare: only signaling NaNs raise invalid. */
    return floatx80_compare_internal(a, b, 1, status);
}

/*----------------------------------------------------------------------------
| Compares the quadruple-precision values `a' and `b'; see the extended
| double-precision variant above for the `is_quiet' semantics.
*----------------------------------------------------------------------------*/

static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        /* +0 and -0 compare equal despite differing signs. */
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

FloatRelation
float128_compare_quiet(float128 a, float128 b,
                       float_status *status)
{
    return float128_compare_internal(a, b, 1, status);
}

/*----------------------------------------------------------------------------
| Returns `a' multiplied by 2 raised to the power `n', for extended
| double-precision values.  NaNs and infinities are returned unchanged
| (after NaN propagation); invalid encodings yield the default NaN.
*----------------------------------------------------------------------------*/

floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );

    if ( aExp == 0x7FFF ) {
        /* NaN (non-zero fraction after dropping the integer bit) or Inf. */
        if ( aSig<<1 ) {
            return propagateFloatx80NaN(a, a, status);
        }
        return a;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        /* Subnormal: effective exponent is 1; the normalize-and-pack below
           renormalizes the significand. */
        aExp++;
    }

    /* Clamp n so aExp cannot overflow; any |n| this large saturates to
       0 or Inf during rounding anyway. */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    aExp += n;
    return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
                                         aSign, aExp, aSig, 0, status);
}

/*----------------------------------------------------------------------------
| Returns `a' multiplied by 2 raised to the power `n', for quadruple-
| precision values.
*----------------------------------------------------------------------------*/

float128 float128_scalbn(float128 a, int n, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return propagateFloat128NaN(a, a, status);
        }
        return a;
    }
    if (aExp != 0) {
        /* Make the implicit integer bit explicit for renormalization. */
        aSig0 |= UINT64_C(0x0001000000000000);
    } else if (aSig0 == 0 && aSig1 == 0) {
        return a;
    } else {
        /* Subnormal: effective exponent is 1. */
        aExp++;
    }

    /* Clamp n so aExp cannot overflow (see floatx80_scalbn). */
    if (n > 0x10000) {
        n = 0x10000;
    } else if (n < -0x10000) {
        n = -0x10000;
    }

    /* The `- 1' matches the exponent convention expected by
       normalizeRoundAndPackFloat128 (cf. the `--zExp' in subFloat128Sigs). */
    aExp += n - 1;
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
                                          , status);

}

/* Runs before main(): decides whether the host's fma() can be trusted for
 * the hardfloat fast path, forcing softfloat FMA otherwise. */
static void __attribute__((constructor)) softfloat_init(void)
{
    union_float64 ua, ub, uc, ur;

    if (QEMU_NO_HARDFLOAT) {
        return;
    }
    /*
     * Test that the host's FMA is not obviously broken. For example,
     * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
     * https://sourceware.org/bugzilla/show_bug.cgi?id=13304
     */
    ua.s = 0x0020000000000001ULL;
    ub.s = 0x3ca0000000000000ULL;
    uc.s = 0x0020000000000000ULL;
    ur.h = fma(ua.h, ub.h, uc.h);
    if (ur.s != 0x0020000000000001ULL) {
        force_soft_fma = true;
    }
}