1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */

/*
 * Input-denormal flushing helpers.
 *
 * The __nocheck variant unconditionally replaces a denormal input with a
 * correctly-signed zero and raises float_flag_input_denormal; the FLUSH1/2/3
 * variants first test s->flush_inputs_to_zero and then flush 1, 2 or 3
 * operands respectively.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                                  \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* Flush two input operands, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* Flush three input operands, honouring s->flush_inputs_to_zero. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Hardfloat is usable only when the accumulated exception flags already
 * include "inexact" (so we will not need to recompute it) and rounding is
 * round-to-nearest-even (the host FPU's default mode).
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* Alias a softfloat value (s) with the host's native representation (h). */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

/* Is-infinity, via isinf() or the softfloat primitive per the config above. */
static inline bool f32_is_inf(union_float32 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input hardfloat dispatcher for float32.
 * Runs the host-FPU implementation (hard) when the status and the operands
 * (checked via pre) permit it; falls back to the softfloat implementation
 * (soft) otherwise.  After the hard op: an infinite result raises overflow;
 * a result with magnitude <= FLT_MIN for which post() holds is recomputed
 * in softfloat, since underflow/inexact detection might be needed there.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/* As float32_gen2, for float64 (DBL_MIN bound, fabs). */
static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Bit mask for a single FloatClass value, for set-membership tests. */
#define float_cmask(bit) (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation. In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT 63
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                           \
    .exp_size       = E,                             \
    .exp_bias       = ((1 << E) - 1) >> 1,           \
    .exp_max        = (1 << E) - 1,                  \
    .frac_size      = F,                             \
    .frac_shift     = (-F - 1) & 63,                 \
    .frac_lsb       = 1ull << ((-F - 1) & 63),       \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),       \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1, \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize. */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

/* As unpack_raw64, for the 128-bit format: the upper fraction word and the
 * sign/exponent live in f.high, the lower fraction word in f.low. */
static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize. */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

/* Inverse of float128_unpack_raw. */
static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/*
 * Dispatch to the parts64_ or parts128_ variant of NAME depending on
 * whether P is a FloatParts128 pointer.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)

/*
 * Helper functions for softfloat-parts.c.inc, per-size operations.
 */

/* 128-bit fraction left shift by c bits. */
static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  frac128_shl(A, C)

/* 128-bit fraction right shift by c bits. */
static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  frac128_shr(A, C)

/* Canonicalize EXP and FRAC, setting CLS.
 * On entry exp/frac are the raw field values from the format described by
 * parm; on return exp is unbiased and frac is normalized with the implicit
 * bit at DECOMPOSED_BINARY_POINT (for normals), and cls is set.  Denormal
 * inputs are either flushed to zero (raising input_denormal) or normalized,
 * depending on status->flush_inputs_to_zero.
 */
static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
                                    float_status *status)
{
    if (part.exp == parm->exp_max && !parm->arm_althp) {
        if (part.frac == 0) {
            part.cls = float_class_inf;
        } else {
            part.frac <<= parm->frac_shift;
            part.cls = (parts_is_snan_frac(part.frac, status)
                        ? float_class_snan : float_class_qnan);
        }
    } else if (part.exp == 0) {
        if (likely(part.frac == 0)) {
            part.cls = float_class_zero;
        } else if (status->flush_inputs_to_zero) {
            float_raise(float_flag_input_denormal, status);
            part.cls = float_class_zero;
            part.frac = 0;
        } else {
            int shift = clz64(part.frac);
            part.cls = float_class_normal;
            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
            part.frac <<= shift;
        }
    } else {
        part.cls = float_class_normal;
        part.exp -= parm->exp_bias;
        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
    }
    return part;
}

/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 */

static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
                                    const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /* Select the rounding increment and whether an overflow should be
         * returned as the maximum finite number rather than infinity. */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                /* A carry out of the top bit renormalizes: shift back and
                 * bump the exponent. */
                if (uadd64_overflow(frac, inc, &frac)) {
                    frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent. */
                if (unlikely(exp > exp_max)) {
                    /* Overflow. Return the maximum normal. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /* Result is subnormal (exp <= 0): denormalize and re-round. */
            bool is_tiny = s->tininess_before_rounding || (exp < 0);

            if (!is_tiny) {
                uint64_t discard;
                is_tiny = !uadd64_overflow(frac, inc, &discard);
            }

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even. */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                default:
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}

/* Select the NaN result of a two-operand operation, raising invalid for any
 * signaling NaN input; NaN selection is delegated to the target-specific
 * pickNaN() unless default_nan_mode is set. */
static FloatParts64 pick_nan(FloatParts64 a, FloatParts64 b, float_status *s)
{
    if (is_snan(a.cls) || is_snan(b.cls)) {
        float_raise(float_flag_invalid, s);
    }

    if (s->default_nan_mode) {
        parts_default_nan(&a, s);
    } else {
        if (pickNaN(a.cls, b.cls,
                    a.frac > b.frac ||
                    (a.frac == b.frac && a.sign < b.sign), s)) {
            a = b;
        }
        if (is_snan(a.cls)) {
            parts_silence_nan(&a, s);
        }
    }
    return a;
}

/* As pick_nan, for the three operands of a fused multiply-add; inf_zero
 * flags the invalid Inf * 0 product case for pickNaNMulAdd(). */
static FloatParts64 pick_nan_muladd(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                    bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        float_raise(float_flag_invalid, s);
    }

    which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);

    if (s->default_nan_mode) {
        /* Note that this check is after pickNaNMulAdd so that function
         * has an opportunity to set the Invalid flag.
         */
        which = 3;
    }

    switch (which) {
    case 0:
        break;
    case 1:
        a = b;
        break;
    case 2:
        a = c;
        break;
    case 3:
        parts_default_nan(&a, s);
        break;
    default:
        g_assert_not_reached();
    }

    if (is_snan(a.cls)) {
        parts_silence_nan(&a, s);
    }
    return a;
}

/* Instantiate the size-generic parts code for 64-bit ... */
#define partsN(NAME)   parts64_##NAME
#define FloatPartsN    FloatParts64

#include "softfloat-parts.c.inc"

/* ... and for 128-bit. */
#undef  partsN
#undef  FloatPartsN
#define partsN(NAME)   parts128_##NAME
#define FloatPartsN    FloatParts128

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, params, s);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &bfloat16_params, s);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    *p = round_canonical(*p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    *p = round_canonical(*p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float32_params, s);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    *p = sf_canonicalize(*p, &float64_params, s);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    *p = round_canonical(*p, s, &float64_params);
    return float64_pack_raw(p);
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    bool a_sign = a.sign;
    /* Folding the subtract flag into b's sign reduces a - b to a + (-b). */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the smaller operand and subtract it from the larger,
             * flipping the result sign if |b| > |a|. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                /* Exact zero result is -0 only in round-down mode. */
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1165 */ 1166 1167 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) 1168 { 1169 FloatParts64 pa, pb, pr; 1170 1171 float16_unpack_canonical(&pa, a, status); 1172 float16_unpack_canonical(&pb, b, status); 1173 pr = addsub_floats(pa, pb, false, status); 1174 1175 return float16_round_pack_canonical(&pr, status); 1176 } 1177 1178 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) 1179 { 1180 FloatParts64 pa, pb, pr; 1181 1182 float16_unpack_canonical(&pa, a, status); 1183 float16_unpack_canonical(&pb, b, status); 1184 pr = addsub_floats(pa, pb, true, status); 1185 1186 return float16_round_pack_canonical(&pr, status); 1187 } 1188 1189 static float32 QEMU_SOFTFLOAT_ATTR 1190 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) 1191 { 1192 FloatParts64 pa, pb, pr; 1193 1194 float32_unpack_canonical(&pa, a, status); 1195 float32_unpack_canonical(&pb, b, status); 1196 pr = addsub_floats(pa, pb, subtract, status); 1197 1198 return float32_round_pack_canonical(&pr, status); 1199 } 1200 1201 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) 1202 { 1203 return soft_f32_addsub(a, b, false, status); 1204 } 1205 1206 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) 1207 { 1208 return soft_f32_addsub(a, b, true, status); 1209 } 1210 1211 static float64 QEMU_SOFTFLOAT_ATTR 1212 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status) 1213 { 1214 FloatParts64 pa, pb, pr; 1215 1216 float64_unpack_canonical(&pa, a, status); 1217 float64_unpack_canonical(&pb, b, status); 1218 pr = addsub_floats(pa, pb, subtract, status); 1219 1220 return float64_round_pack_canonical(&pr, status); 1221 } 1222 1223 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) 1224 { 1225 return soft_f64_addsub(a, b, false, status); 1226 } 1227 1228 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) 1229 { 1230 
return soft_f64_addsub(a, b, true, status); 1231 } 1232 1233 static float hard_f32_add(float a, float b) 1234 { 1235 return a + b; 1236 } 1237 1238 static float hard_f32_sub(float a, float b) 1239 { 1240 return a - b; 1241 } 1242 1243 static double hard_f64_add(double a, double b) 1244 { 1245 return a + b; 1246 } 1247 1248 static double hard_f64_sub(double a, double b) 1249 { 1250 return a - b; 1251 } 1252 1253 static bool f32_addsubmul_post(union_float32 a, union_float32 b) 1254 { 1255 if (QEMU_HARDFLOAT_2F32_USE_FP) { 1256 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1257 } 1258 return !(float32_is_zero(a.s) && float32_is_zero(b.s)); 1259 } 1260 1261 static bool f64_addsubmul_post(union_float64 a, union_float64 b) 1262 { 1263 if (QEMU_HARDFLOAT_2F64_USE_FP) { 1264 return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); 1265 } else { 1266 return !(float64_is_zero(a.s) && float64_is_zero(b.s)); 1267 } 1268 } 1269 1270 static float32 float32_addsub(float32 a, float32 b, float_status *s, 1271 hard_f32_op2_fn hard, soft_f32_op2_fn soft) 1272 { 1273 return float32_gen2(a, b, s, hard, soft, 1274 f32_is_zon2, f32_addsubmul_post); 1275 } 1276 1277 static float64 float64_addsub(float64 a, float64 b, float_status *s, 1278 hard_f64_op2_fn hard, soft_f64_op2_fn soft) 1279 { 1280 return float64_gen2(a, b, s, hard, soft, 1281 f64_is_zon2, f64_addsubmul_post); 1282 } 1283 1284 float32 QEMU_FLATTEN 1285 float32_add(float32 a, float32 b, float_status *s) 1286 { 1287 return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); 1288 } 1289 1290 float32 QEMU_FLATTEN 1291 float32_sub(float32 a, float32 b, float_status *s) 1292 { 1293 return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); 1294 } 1295 1296 float64 QEMU_FLATTEN 1297 float64_add(float64 a, float64 b, float_status *s) 1298 { 1299 return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); 1300 } 1301 1302 float64 QEMU_FLATTEN 1303 float64_sub(float64 a, float64 b, float_status *s) 
1304 { 1305 return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); 1306 } 1307 1308 /* 1309 * Returns the result of adding or subtracting the bfloat16 1310 * values `a' and `b'. 1311 */ 1312 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status) 1313 { 1314 FloatParts64 pa, pb, pr; 1315 1316 bfloat16_unpack_canonical(&pa, a, status); 1317 bfloat16_unpack_canonical(&pb, b, status); 1318 pr = addsub_floats(pa, pb, false, status); 1319 1320 return bfloat16_round_pack_canonical(&pr, status); 1321 } 1322 1323 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status) 1324 { 1325 FloatParts64 pa, pb, pr; 1326 1327 bfloat16_unpack_canonical(&pa, a, status); 1328 bfloat16_unpack_canonical(&pb, b, status); 1329 pr = addsub_floats(pa, pb, true, status); 1330 1331 return bfloat16_round_pack_canonical(&pr, status); 1332 } 1333 1334 /* 1335 * Returns the result of multiplying the floating-point values `a' and 1336 * `b'. The operation is performed according to the IEC/IEEE Standard 1337 * for Binary Floating-Point Arithmetic. 
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        mul64To128(a.frac, b.frac, &hi, &lo);
        /* The product of two 1.x significands is in [1, 4): either it
         * already has the implicit bit set, or needs one left shift. */
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low 64 bits into the sticky bit. */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 multiply. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 multiply. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU fast paths. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 */

static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* Inf * 0 + (non-NaN) is Invalid and yields the default NaN. */
    if (inf_zero) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a*b before looking at the addend. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        /* Inf - Inf (signs differ) is Invalid; otherwise Inf dominates. */
        if (p_class == float_class_inf && p_sign != c.sign) {
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0 with differing signs: sign depends on rounding mode. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger;
                 * flip the result sign when the addend wins. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: -0 only in round-down mode. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent. However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it. Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits. */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold remaining low bits into the sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 fused multiply-add. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 fused multiply-add. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}

/* NOTE(review): never written in this chunk; presumably a test hook that
 * forces the softfloat muladd path when set -- confirm where it is set. */
static bool force_soft_fma;

float32 QEMU_FLATTEN
float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
{
    union_float32 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
        union_float32 up;
        bool prod_sign;

        /* The product is a signed zero; only its sign matters. */
        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float32_set_sign(float32_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float32 ua_orig = ua;
        union_float32 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fmaf(ua.h, ub.h, uc.h);

        if (unlikely(f32_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
            /* Result is tiny: redo in softfloat with the original
             * operands so underflow/inexact are accounted correctly. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float32_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
}

float64 QEMU_FLATTEN
float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
{
    union_float64 ua, ub, uc, ur;

    ua.s = xa;
    ub.s = xb;
    uc.s = xc;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }
    if (unlikely(flags & float_muladd_halve_result)) {
        goto soft;
    }

    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
        goto soft;
    }

    if (unlikely(force_soft_fma)) {
        goto soft;
    }

    /*
     * When (a || b) == 0, there's no need to check for under/over flow,
     * since we know the addend is (normal || 0) and the product is 0.
     */
    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
        union_float64 up;
        bool prod_sign;

        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
        prod_sign ^= !!(flags & float_muladd_negate_product);
        up.s = float64_set_sign(float64_zero, prod_sign);

        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }
        ur.h = up.h + uc.h;
    } else {
        union_float64 ua_orig = ua;
        union_float64 uc_orig = uc;

        if (flags & float_muladd_negate_product) {
            ua.h = -ua.h;
        }
        if (flags & float_muladd_negate_c) {
            uc.h = -uc.h;
        }

        ur.h = fma(ua.h, ub.h, uc.h);

        if (unlikely(f64_is_inf(ur))) {
            float_raise(float_flag_overflow, s);
        } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
            /* NOTE(review): bound is FLT_MIN, not DBL_MIN -- a (much
             * larger) conservative threshold that sends any possibly
             * tiny result back to softfloat; confirm this is intended
             * rather than a copy from the f32 variant. */
            ua = ua_orig;
            uc = uc_orig;
            goto soft;
        }
    }
    if (flags & float_muladd_negate_result) {
        return float64_chs(ur.s);
    }
    return ur.s;

 soft:
    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
}

/*
 * Returns the result of multiplying the bfloat16 values `a'
 * and `b' then adding 'c', with no intermediate rounding step after the
 * multiplication.
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward. If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return pick_nan(a, b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float32 divide. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

/* Softfloat fallback for float64 divide. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Host-FPU fast paths. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/* Pre-check: hardfloat divide is usable when the dividend is zero or
 * normal and the divisor is normal. */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-check: a zero result is handed back to softfloat (signed-zero
 * rules); note `a' here is the hardfloat quotient. */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format. Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format. Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* `ieee' selects IEEE half-precision vs ARM alternative format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Softfloat fallback for float32 -> float64 widening. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results. */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* Subnormal, Inf or NaN: take the slow path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
2193 */ 2194 2195 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode, 2196 int scale, float_status *s) 2197 { 2198 switch (a.cls) { 2199 case float_class_qnan: 2200 case float_class_snan: 2201 parts_return_nan(&a, s); 2202 break; 2203 2204 case float_class_zero: 2205 case float_class_inf: 2206 /* already "integral" */ 2207 break; 2208 2209 case float_class_normal: 2210 scale = MIN(MAX(scale, -0x10000), 0x10000); 2211 a.exp += scale; 2212 2213 if (a.exp >= DECOMPOSED_BINARY_POINT) { 2214 /* already integral */ 2215 break; 2216 } 2217 if (a.exp < 0) { 2218 bool one; 2219 /* all fractional */ 2220 float_raise(float_flag_inexact, s); 2221 switch (rmode) { 2222 case float_round_nearest_even: 2223 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 2224 break; 2225 case float_round_ties_away: 2226 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 2227 break; 2228 case float_round_to_zero: 2229 one = false; 2230 break; 2231 case float_round_up: 2232 one = !a.sign; 2233 break; 2234 case float_round_down: 2235 one = a.sign; 2236 break; 2237 case float_round_to_odd: 2238 one = true; 2239 break; 2240 default: 2241 g_assert_not_reached(); 2242 } 2243 2244 if (one) { 2245 a.frac = DECOMPOSED_IMPLICIT_BIT; 2246 a.exp = 0; 2247 } else { 2248 a.cls = float_class_zero; 2249 } 2250 } else { 2251 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2252 uint64_t frac_lsbm1 = frac_lsb >> 1; 2253 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2254 uint64_t rnd_mask = rnd_even_mask >> 1; 2255 uint64_t inc; 2256 2257 switch (rmode) { 2258 case float_round_nearest_even: 2259 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2260 break; 2261 case float_round_ties_away: 2262 inc = frac_lsbm1; 2263 break; 2264 case float_round_to_zero: 2265 inc = 0; 2266 break; 2267 case float_round_up: 2268 inc = a.sign ? 0 : rnd_mask; 2269 break; 2270 case float_round_down: 2271 inc = a.sign ? 
rnd_mask : 0; 2272 break; 2273 case float_round_to_odd: 2274 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2275 break; 2276 default: 2277 g_assert_not_reached(); 2278 } 2279 2280 if (a.frac & rnd_mask) { 2281 float_raise(float_flag_inexact, s); 2282 if (uadd64_overflow(a.frac, inc, &a.frac)) { 2283 a.frac >>= 1; 2284 a.frac |= DECOMPOSED_IMPLICIT_BIT; 2285 a.exp++; 2286 } 2287 a.frac &= ~rnd_mask; 2288 } 2289 } 2290 break; 2291 default: 2292 g_assert_not_reached(); 2293 } 2294 return a; 2295 } 2296 2297 float16 float16_round_to_int(float16 a, float_status *s) 2298 { 2299 FloatParts64 pa, pr; 2300 2301 float16_unpack_canonical(&pa, a, s); 2302 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2303 return float16_round_pack_canonical(&pr, s); 2304 } 2305 2306 float32 float32_round_to_int(float32 a, float_status *s) 2307 { 2308 FloatParts64 pa, pr; 2309 2310 float32_unpack_canonical(&pa, a, s); 2311 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2312 return float32_round_pack_canonical(&pr, s); 2313 } 2314 2315 float64 float64_round_to_int(float64 a, float_status *s) 2316 { 2317 FloatParts64 pa, pr; 2318 2319 float64_unpack_canonical(&pa, a, s); 2320 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2321 return float64_round_pack_canonical(&pr, s); 2322 } 2323 2324 /* 2325 * Rounds the bfloat16 value `a' to an integer, and returns the 2326 * result as a bfloat16 value. 2327 */ 2328 2329 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2330 { 2331 FloatParts64 pa, pr; 2332 2333 bfloat16_unpack_canonical(&pa, a, s); 2334 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2335 return bfloat16_round_pack_canonical(&pr, s); 2336 } 2337 2338 /* 2339 * Returns the result of converting the floating-point value `a' to 2340 * the two's complement integer format. 
The conversion is performed 2341 * according to the IEC/IEEE Standard for Binary Floating-Point 2342 * Arithmetic---which means in particular that the conversion is 2343 * rounded according to the current rounding mode. If `a' is a NaN, 2344 * the largest positive integer is returned. Otherwise, if the 2345 * conversion overflows, the largest integer with the same sign as `a' 2346 * is returned. 2347 */ 2348 2349 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode, 2350 int scale, int64_t min, int64_t max, 2351 float_status *s) 2352 { 2353 uint64_t r; 2354 int orig_flags = get_float_exception_flags(s); 2355 FloatParts64 p = round_to_int(in, rmode, scale, s); 2356 2357 switch (p.cls) { 2358 case float_class_snan: 2359 case float_class_qnan: 2360 s->float_exception_flags = orig_flags | float_flag_invalid; 2361 return max; 2362 case float_class_inf: 2363 s->float_exception_flags = orig_flags | float_flag_invalid; 2364 return p.sign ? min : max; 2365 case float_class_zero: 2366 return 0; 2367 case float_class_normal: 2368 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2369 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2370 } else { 2371 r = UINT64_MAX; 2372 } 2373 if (p.sign) { 2374 if (r <= -(uint64_t) min) { 2375 return -r; 2376 } else { 2377 s->float_exception_flags = orig_flags | float_flag_invalid; 2378 return min; 2379 } 2380 } else { 2381 if (r <= max) { 2382 return r; 2383 } else { 2384 s->float_exception_flags = orig_flags | float_flag_invalid; 2385 return max; 2386 } 2387 } 2388 default: 2389 g_assert_not_reached(); 2390 } 2391 } 2392 2393 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2394 float_status *s) 2395 { 2396 FloatParts64 p; 2397 2398 float16_unpack_canonical(&p, a, s); 2399 return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s); 2400 } 2401 2402 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2403 float_status *s) 2404 { 2405 FloatParts64 p; 2406 2407 
float16_unpack_canonical(&p, a, s); 2408 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2409 } 2410 2411 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2412 float_status *s) 2413 { 2414 FloatParts64 p; 2415 2416 float16_unpack_canonical(&p, a, s); 2417 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2418 } 2419 2420 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2421 float_status *s) 2422 { 2423 FloatParts64 p; 2424 2425 float16_unpack_canonical(&p, a, s); 2426 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2427 } 2428 2429 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2430 float_status *s) 2431 { 2432 FloatParts64 p; 2433 2434 float32_unpack_canonical(&p, a, s); 2435 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2436 } 2437 2438 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2439 float_status *s) 2440 { 2441 FloatParts64 p; 2442 2443 float32_unpack_canonical(&p, a, s); 2444 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2445 } 2446 2447 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2448 float_status *s) 2449 { 2450 FloatParts64 p; 2451 2452 float32_unpack_canonical(&p, a, s); 2453 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2454 } 2455 2456 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2457 float_status *s) 2458 { 2459 FloatParts64 p; 2460 2461 float64_unpack_canonical(&p, a, s); 2462 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2463 } 2464 2465 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2466 float_status *s) 2467 { 2468 FloatParts64 p; 2469 2470 float64_unpack_canonical(&p, a, s); 2471 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2472 } 2473 2474 int64_t 
float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2475 float_status *s) 2476 { 2477 FloatParts64 p; 2478 2479 float64_unpack_canonical(&p, a, s); 2480 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2481 } 2482 2483 int8_t float16_to_int8(float16 a, float_status *s) 2484 { 2485 return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s); 2486 } 2487 2488 int16_t float16_to_int16(float16 a, float_status *s) 2489 { 2490 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2491 } 2492 2493 int32_t float16_to_int32(float16 a, float_status *s) 2494 { 2495 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2496 } 2497 2498 int64_t float16_to_int64(float16 a, float_status *s) 2499 { 2500 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2501 } 2502 2503 int16_t float32_to_int16(float32 a, float_status *s) 2504 { 2505 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2506 } 2507 2508 int32_t float32_to_int32(float32 a, float_status *s) 2509 { 2510 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2511 } 2512 2513 int64_t float32_to_int64(float32 a, float_status *s) 2514 { 2515 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2516 } 2517 2518 int16_t float64_to_int16(float64 a, float_status *s) 2519 { 2520 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2521 } 2522 2523 int32_t float64_to_int32(float64 a, float_status *s) 2524 { 2525 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2526 } 2527 2528 int64_t float64_to_int64(float64 a, float_status *s) 2529 { 2530 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2531 } 2532 2533 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 2534 { 2535 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2536 } 2537 2538 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 2539 { 2540 return float16_to_int32_scalbn(a, 
float_round_to_zero, 0, s); 2541 } 2542 2543 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 2544 { 2545 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2546 } 2547 2548 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 2549 { 2550 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 2551 } 2552 2553 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 2554 { 2555 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 2556 } 2557 2558 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 2559 { 2560 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 2561 } 2562 2563 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s) 2564 { 2565 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 2566 } 2567 2568 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 2569 { 2570 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 2571 } 2572 2573 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 2574 { 2575 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 2576 } 2577 2578 /* 2579 * Returns the result of converting the floating-point value `a' to 2580 * the two's complement integer format. 
2581 */ 2582 2583 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2584 float_status *s) 2585 { 2586 FloatParts64 p; 2587 2588 bfloat16_unpack_canonical(&p, a, s); 2589 return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s); 2590 } 2591 2592 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2593 float_status *s) 2594 { 2595 FloatParts64 p; 2596 2597 bfloat16_unpack_canonical(&p, a, s); 2598 return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s); 2599 } 2600 2601 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale, 2602 float_status *s) 2603 { 2604 FloatParts64 p; 2605 2606 bfloat16_unpack_canonical(&p, a, s); 2607 return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s); 2608 } 2609 2610 int16_t bfloat16_to_int16(bfloat16 a, float_status *s) 2611 { 2612 return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 2613 } 2614 2615 int32_t bfloat16_to_int32(bfloat16 a, float_status *s) 2616 { 2617 return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 2618 } 2619 2620 int64_t bfloat16_to_int64(bfloat16 a, float_status *s) 2621 { 2622 return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 2623 } 2624 2625 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s) 2626 { 2627 return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s); 2628 } 2629 2630 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s) 2631 { 2632 return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s); 2633 } 2634 2635 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s) 2636 { 2637 return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s); 2638 } 2639 2640 /* 2641 * Returns the result of converting the floating-point value `a' to 2642 * the unsigned integer format. 
The conversion is performed according 2643 * to the IEC/IEEE Standard for Binary Floating-Point 2644 * Arithmetic---which means in particular that the conversion is 2645 * rounded according to the current rounding mode. If `a' is a NaN, 2646 * the largest unsigned integer is returned. Otherwise, if the 2647 * conversion overflows, the largest unsigned integer is returned. If 2648 * the 'a' is negative, the result is rounded and zero is returned; 2649 * values that do not round to zero will raise the inexact exception 2650 * flag. 2651 */ 2652 2653 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode, 2654 int scale, uint64_t max, 2655 float_status *s) 2656 { 2657 int orig_flags = get_float_exception_flags(s); 2658 FloatParts64 p = round_to_int(in, rmode, scale, s); 2659 uint64_t r; 2660 2661 switch (p.cls) { 2662 case float_class_snan: 2663 case float_class_qnan: 2664 s->float_exception_flags = orig_flags | float_flag_invalid; 2665 return max; 2666 case float_class_inf: 2667 s->float_exception_flags = orig_flags | float_flag_invalid; 2668 return p.sign ? 0 : max; 2669 case float_class_zero: 2670 return 0; 2671 case float_class_normal: 2672 if (p.sign) { 2673 s->float_exception_flags = orig_flags | float_flag_invalid; 2674 return 0; 2675 } 2676 2677 if (p.exp <= DECOMPOSED_BINARY_POINT) { 2678 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 2679 } else { 2680 s->float_exception_flags = orig_flags | float_flag_invalid; 2681 return max; 2682 } 2683 2684 /* For uint64 this will never trip, but if p.exp is too large 2685 * to shift a decomposed fraction we shall have exited via the 2686 * 3rd leg above. 
2687 */ 2688 if (r > max) { 2689 s->float_exception_flags = orig_flags | float_flag_invalid; 2690 return max; 2691 } 2692 return r; 2693 default: 2694 g_assert_not_reached(); 2695 } 2696 } 2697 2698 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale, 2699 float_status *s) 2700 { 2701 FloatParts64 p; 2702 2703 float16_unpack_canonical(&p, a, s); 2704 return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s); 2705 } 2706 2707 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale, 2708 float_status *s) 2709 { 2710 FloatParts64 p; 2711 2712 float16_unpack_canonical(&p, a, s); 2713 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2714 } 2715 2716 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale, 2717 float_status *s) 2718 { 2719 FloatParts64 p; 2720 2721 float16_unpack_canonical(&p, a, s); 2722 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2723 } 2724 2725 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale, 2726 float_status *s) 2727 { 2728 FloatParts64 p; 2729 2730 float16_unpack_canonical(&p, a, s); 2731 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2732 } 2733 2734 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale, 2735 float_status *s) 2736 { 2737 FloatParts64 p; 2738 2739 float32_unpack_canonical(&p, a, s); 2740 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2741 } 2742 2743 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale, 2744 float_status *s) 2745 { 2746 FloatParts64 p; 2747 2748 float32_unpack_canonical(&p, a, s); 2749 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2750 } 2751 2752 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale, 2753 float_status *s) 2754 { 2755 FloatParts64 p; 2756 2757 float32_unpack_canonical(&p, a, s); 2758 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2759 } 
2760 2761 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale, 2762 float_status *s) 2763 { 2764 FloatParts64 p; 2765 2766 float64_unpack_canonical(&p, a, s); 2767 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2768 } 2769 2770 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale, 2771 float_status *s) 2772 { 2773 FloatParts64 p; 2774 2775 float64_unpack_canonical(&p, a, s); 2776 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2777 } 2778 2779 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale, 2780 float_status *s) 2781 { 2782 FloatParts64 p; 2783 2784 float64_unpack_canonical(&p, a, s); 2785 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2786 } 2787 2788 uint8_t float16_to_uint8(float16 a, float_status *s) 2789 { 2790 return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s); 2791 } 2792 2793 uint16_t float16_to_uint16(float16 a, float_status *s) 2794 { 2795 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2796 } 2797 2798 uint32_t float16_to_uint32(float16 a, float_status *s) 2799 { 2800 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2801 } 2802 2803 uint64_t float16_to_uint64(float16 a, float_status *s) 2804 { 2805 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2806 } 2807 2808 uint16_t float32_to_uint16(float32 a, float_status *s) 2809 { 2810 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2811 } 2812 2813 uint32_t float32_to_uint32(float32 a, float_status *s) 2814 { 2815 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2816 } 2817 2818 uint64_t float32_to_uint64(float32 a, float_status *s) 2819 { 2820 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2821 } 2822 2823 uint16_t float64_to_uint16(float64 a, float_status *s) 2824 { 2825 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2826 } 2827 2828 uint32_t 
float64_to_uint32(float64 a, float_status *s) 2829 { 2830 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2831 } 2832 2833 uint64_t float64_to_uint64(float64 a, float_status *s) 2834 { 2835 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2836 } 2837 2838 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 2839 { 2840 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2841 } 2842 2843 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 2844 { 2845 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2846 } 2847 2848 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 2849 { 2850 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2851 } 2852 2853 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 2854 { 2855 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2856 } 2857 2858 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 2859 { 2860 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2861 } 2862 2863 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 2864 { 2865 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2866 } 2867 2868 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 2869 { 2870 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2871 } 2872 2873 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 2874 { 2875 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2876 } 2877 2878 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 2879 { 2880 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2881 } 2882 2883 /* 2884 * Returns the result of converting the bfloat16 value `a' to 2885 * the unsigned integer format. 
2886 */ 2887 2888 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode, 2889 int scale, float_status *s) 2890 { 2891 FloatParts64 p; 2892 2893 bfloat16_unpack_canonical(&p, a, s); 2894 return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s); 2895 } 2896 2897 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode, 2898 int scale, float_status *s) 2899 { 2900 FloatParts64 p; 2901 2902 bfloat16_unpack_canonical(&p, a, s); 2903 return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s); 2904 } 2905 2906 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode, 2907 int scale, float_status *s) 2908 { 2909 FloatParts64 p; 2910 2911 bfloat16_unpack_canonical(&p, a, s); 2912 return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s); 2913 } 2914 2915 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s) 2916 { 2917 return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 2918 } 2919 2920 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s) 2921 { 2922 return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 2923 } 2924 2925 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s) 2926 { 2927 return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 2928 } 2929 2930 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s) 2931 { 2932 return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 2933 } 2934 2935 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s) 2936 { 2937 return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 2938 } 2939 2940 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s) 2941 { 2942 return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 2943 } 2944 2945 /* 2946 * Integer to float conversions 2947 * 2948 * Returns the result of converting the two's complement integer `a' 2949 * to the floating-point format. 
The conversion is performed according 2950 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2951 */ 2952 2953 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status) 2954 { 2955 FloatParts64 r = { .sign = false }; 2956 2957 if (a == 0) { 2958 r.cls = float_class_zero; 2959 } else { 2960 uint64_t f = a; 2961 int shift; 2962 2963 r.cls = float_class_normal; 2964 if (a < 0) { 2965 f = -f; 2966 r.sign = true; 2967 } 2968 shift = clz64(f); 2969 scale = MIN(MAX(scale, -0x10000), 0x10000); 2970 2971 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 2972 r.frac = f << shift; 2973 } 2974 2975 return r; 2976 } 2977 2978 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 2979 { 2980 FloatParts64 pa = int_to_float(a, scale, status); 2981 return float16_round_pack_canonical(&pa, status); 2982 } 2983 2984 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 2985 { 2986 return int64_to_float16_scalbn(a, scale, status); 2987 } 2988 2989 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 2990 { 2991 return int64_to_float16_scalbn(a, scale, status); 2992 } 2993 2994 float16 int64_to_float16(int64_t a, float_status *status) 2995 { 2996 return int64_to_float16_scalbn(a, 0, status); 2997 } 2998 2999 float16 int32_to_float16(int32_t a, float_status *status) 3000 { 3001 return int64_to_float16_scalbn(a, 0, status); 3002 } 3003 3004 float16 int16_to_float16(int16_t a, float_status *status) 3005 { 3006 return int64_to_float16_scalbn(a, 0, status); 3007 } 3008 3009 float16 int8_to_float16(int8_t a, float_status *status) 3010 { 3011 return int64_to_float16_scalbn(a, 0, status); 3012 } 3013 3014 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 3015 { 3016 FloatParts64 pa = int_to_float(a, scale, status); 3017 return float32_round_pack_canonical(&pa, status); 3018 } 3019 3020 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 3021 { 
3022 return int64_to_float32_scalbn(a, scale, status); 3023 } 3024 3025 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 3026 { 3027 return int64_to_float32_scalbn(a, scale, status); 3028 } 3029 3030 float32 int64_to_float32(int64_t a, float_status *status) 3031 { 3032 return int64_to_float32_scalbn(a, 0, status); 3033 } 3034 3035 float32 int32_to_float32(int32_t a, float_status *status) 3036 { 3037 return int64_to_float32_scalbn(a, 0, status); 3038 } 3039 3040 float32 int16_to_float32(int16_t a, float_status *status) 3041 { 3042 return int64_to_float32_scalbn(a, 0, status); 3043 } 3044 3045 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 3046 { 3047 FloatParts64 pa = int_to_float(a, scale, status); 3048 return float64_round_pack_canonical(&pa, status); 3049 } 3050 3051 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 3052 { 3053 return int64_to_float64_scalbn(a, scale, status); 3054 } 3055 3056 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 3057 { 3058 return int64_to_float64_scalbn(a, scale, status); 3059 } 3060 3061 float64 int64_to_float64(int64_t a, float_status *status) 3062 { 3063 return int64_to_float64_scalbn(a, 0, status); 3064 } 3065 3066 float64 int32_to_float64(int32_t a, float_status *status) 3067 { 3068 return int64_to_float64_scalbn(a, 0, status); 3069 } 3070 3071 float64 int16_to_float64(int16_t a, float_status *status) 3072 { 3073 return int64_to_float64_scalbn(a, 0, status); 3074 } 3075 3076 /* 3077 * Returns the result of converting the two's complement integer `a' 3078 * to the bfloat16 format. 
3079 */ 3080 3081 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status) 3082 { 3083 FloatParts64 pa = int_to_float(a, scale, status); 3084 return bfloat16_round_pack_canonical(&pa, status); 3085 } 3086 3087 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status) 3088 { 3089 return int64_to_bfloat16_scalbn(a, scale, status); 3090 } 3091 3092 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status) 3093 { 3094 return int64_to_bfloat16_scalbn(a, scale, status); 3095 } 3096 3097 bfloat16 int64_to_bfloat16(int64_t a, float_status *status) 3098 { 3099 return int64_to_bfloat16_scalbn(a, 0, status); 3100 } 3101 3102 bfloat16 int32_to_bfloat16(int32_t a, float_status *status) 3103 { 3104 return int64_to_bfloat16_scalbn(a, 0, status); 3105 } 3106 3107 bfloat16 int16_to_bfloat16(int16_t a, float_status *status) 3108 { 3109 return int64_to_bfloat16_scalbn(a, 0, status); 3110 } 3111 3112 /* 3113 * Unsigned Integer to float conversions 3114 * 3115 * Returns the result of converting the unsigned integer `a' to the 3116 * floating-point format. The conversion is performed according to the 3117 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3118 */ 3119 3120 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status) 3121 { 3122 FloatParts64 r = { .sign = false }; 3123 int shift; 3124 3125 if (a == 0) { 3126 r.cls = float_class_zero; 3127 } else { 3128 scale = MIN(MAX(scale, -0x10000), 0x10000); 3129 shift = clz64(a); 3130 r.cls = float_class_normal; 3131 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3132 r.frac = a << shift; 3133 } 3134 3135 return r; 3136 } 3137 3138 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3139 { 3140 FloatParts64 pa = uint_to_float(a, scale, status); 3141 return float16_round_pack_canonical(&pa, status); 3142 } 3143 3144 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3145 { 3146 return uint64_to_float16_scalbn(a, scale, status); 3147 } 3148 3149 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3150 { 3151 return uint64_to_float16_scalbn(a, scale, status); 3152 } 3153 3154 float16 uint64_to_float16(uint64_t a, float_status *status) 3155 { 3156 return uint64_to_float16_scalbn(a, 0, status); 3157 } 3158 3159 float16 uint32_to_float16(uint32_t a, float_status *status) 3160 { 3161 return uint64_to_float16_scalbn(a, 0, status); 3162 } 3163 3164 float16 uint16_to_float16(uint16_t a, float_status *status) 3165 { 3166 return uint64_to_float16_scalbn(a, 0, status); 3167 } 3168 3169 float16 uint8_to_float16(uint8_t a, float_status *status) 3170 { 3171 return uint64_to_float16_scalbn(a, 0, status); 3172 } 3173 3174 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3175 { 3176 FloatParts64 pa = uint_to_float(a, scale, status); 3177 return float32_round_pack_canonical(&pa, status); 3178 } 3179 3180 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3181 { 3182 return uint64_to_float32_scalbn(a, scale, status); 3183 } 3184 3185 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3186 { 3187 return 
uint64_to_float32_scalbn(a, scale, status); 3188 } 3189 3190 float32 uint64_to_float32(uint64_t a, float_status *status) 3191 { 3192 return uint64_to_float32_scalbn(a, 0, status); 3193 } 3194 3195 float32 uint32_to_float32(uint32_t a, float_status *status) 3196 { 3197 return uint64_to_float32_scalbn(a, 0, status); 3198 } 3199 3200 float32 uint16_to_float32(uint16_t a, float_status *status) 3201 { 3202 return uint64_to_float32_scalbn(a, 0, status); 3203 } 3204 3205 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3206 { 3207 FloatParts64 pa = uint_to_float(a, scale, status); 3208 return float64_round_pack_canonical(&pa, status); 3209 } 3210 3211 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3212 { 3213 return uint64_to_float64_scalbn(a, scale, status); 3214 } 3215 3216 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3217 { 3218 return uint64_to_float64_scalbn(a, scale, status); 3219 } 3220 3221 float64 uint64_to_float64(uint64_t a, float_status *status) 3222 { 3223 return uint64_to_float64_scalbn(a, 0, status); 3224 } 3225 3226 float64 uint32_to_float64(uint32_t a, float_status *status) 3227 { 3228 return uint64_to_float64_scalbn(a, 0, status); 3229 } 3230 3231 float64 uint16_to_float64(uint16_t a, float_status *status) 3232 { 3233 return uint64_to_float64_scalbn(a, 0, status); 3234 } 3235 3236 /* 3237 * Returns the result of converting the unsigned integer `a' to the 3238 * bfloat16 format. 
3239 */ 3240 3241 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3242 { 3243 FloatParts64 pa = uint_to_float(a, scale, status); 3244 return bfloat16_round_pack_canonical(&pa, status); 3245 } 3246 3247 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3248 { 3249 return uint64_to_bfloat16_scalbn(a, scale, status); 3250 } 3251 3252 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3253 { 3254 return uint64_to_bfloat16_scalbn(a, scale, status); 3255 } 3256 3257 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3258 { 3259 return uint64_to_bfloat16_scalbn(a, 0, status); 3260 } 3261 3262 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3263 { 3264 return uint64_to_bfloat16_scalbn(a, 0, status); 3265 } 3266 3267 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3268 { 3269 return uint64_to_bfloat16_scalbn(a, 0, status); 3270 } 3271 3272 /* Float Min/Max */ 3273 /* min() and max() functions. These can't be implemented as 3274 * 'compare and pick one input' because that would mishandle 3275 * NaNs and +0 vs -0. 3276 * 3277 * minnum() and maxnum() functions. These are similar to the min() 3278 * and max() functions but if one of the arguments is a QNaN and 3279 * the other is numerical then the numerical argument is returned. 3280 * SNaNs will get quietened before being returned. 3281 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3282 * and maxNum() operations. min() and max() are the typical min/max 3283 * semantics provided by many CPUs which predate that specification. 3284 * 3285 * minnummag() and maxnummag() functions correspond to minNumMag() 3286 * and minNumMag() from the IEEE-754 2008. 
3287 */ 3288 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin, 3289 bool ieee, bool ismag, float_status *s) 3290 { 3291 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 3292 if (ieee) { 3293 /* Takes two floating-point values `a' and `b', one of 3294 * which is a NaN, and returns the appropriate NaN 3295 * result. If either `a' or `b' is a signaling NaN, 3296 * the invalid exception is raised. 3297 */ 3298 if (is_snan(a.cls) || is_snan(b.cls)) { 3299 return pick_nan(a, b, s); 3300 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 3301 return b; 3302 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 3303 return a; 3304 } 3305 } 3306 return pick_nan(a, b, s); 3307 } else { 3308 int a_exp, b_exp; 3309 3310 switch (a.cls) { 3311 case float_class_normal: 3312 a_exp = a.exp; 3313 break; 3314 case float_class_inf: 3315 a_exp = INT_MAX; 3316 break; 3317 case float_class_zero: 3318 a_exp = INT_MIN; 3319 break; 3320 default: 3321 g_assert_not_reached(); 3322 break; 3323 } 3324 switch (b.cls) { 3325 case float_class_normal: 3326 b_exp = b.exp; 3327 break; 3328 case float_class_inf: 3329 b_exp = INT_MAX; 3330 break; 3331 case float_class_zero: 3332 b_exp = INT_MIN; 3333 break; 3334 default: 3335 g_assert_not_reached(); 3336 break; 3337 } 3338 3339 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 3340 bool a_less = a_exp < b_exp; 3341 if (a_exp == b_exp) { 3342 a_less = a.frac < b.frac; 3343 } 3344 return a_less ^ ismin ? b : a; 3345 } 3346 3347 if (a.sign == b.sign) { 3348 bool a_less = a_exp < b_exp; 3349 if (a_exp == b_exp) { 3350 a_less = a.frac < b.frac; 3351 } 3352 return a.sign ^ a_less ^ ismin ? b : a; 3353 } else { 3354 return a.sign ^ ismin ? 
b : a; 3355 } 3356 } 3357 } 3358 3359 #define MINMAX(sz, name, ismin, isiee, ismag) \ 3360 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 3361 float_status *s) \ 3362 { \ 3363 FloatParts64 pa, pb, pr; \ 3364 float ## sz ## _unpack_canonical(&pa, a, s); \ 3365 float ## sz ## _unpack_canonical(&pb, b, s); \ 3366 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3367 return float ## sz ## _round_pack_canonical(&pr, s); \ 3368 } 3369 3370 MINMAX(16, min, true, false, false) 3371 MINMAX(16, minnum, true, true, false) 3372 MINMAX(16, minnummag, true, true, true) 3373 MINMAX(16, max, false, false, false) 3374 MINMAX(16, maxnum, false, true, false) 3375 MINMAX(16, maxnummag, false, true, true) 3376 3377 MINMAX(32, min, true, false, false) 3378 MINMAX(32, minnum, true, true, false) 3379 MINMAX(32, minnummag, true, true, true) 3380 MINMAX(32, max, false, false, false) 3381 MINMAX(32, maxnum, false, true, false) 3382 MINMAX(32, maxnummag, false, true, true) 3383 3384 MINMAX(64, min, true, false, false) 3385 MINMAX(64, minnum, true, true, false) 3386 MINMAX(64, minnummag, true, true, true) 3387 MINMAX(64, max, false, false, false) 3388 MINMAX(64, maxnum, false, true, false) 3389 MINMAX(64, maxnummag, false, true, true) 3390 3391 #undef MINMAX 3392 3393 #define BF16_MINMAX(name, ismin, isiee, ismag) \ 3394 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \ 3395 { \ 3396 FloatParts64 pa, pb, pr; \ 3397 bfloat16_unpack_canonical(&pa, a, s); \ 3398 bfloat16_unpack_canonical(&pb, b, s); \ 3399 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3400 return bfloat16_round_pack_canonical(&pr, s); \ 3401 } 3402 3403 BF16_MINMAX(min, true, false, false) 3404 BF16_MINMAX(minnum, true, true, false) 3405 BF16_MINMAX(minnummag, true, true, true) 3406 BF16_MINMAX(max, false, false, false) 3407 BF16_MINMAX(maxnum, false, true, false) 3408 BF16_MINMAX(maxnummag, false, true, true) 3409 3410 #undef BF16_MINMAX 3411 3412 /* Floating point compare 
 */
/* Four-way compare of two decomposed values.  With is_quiet set, only a
 * signaling NaN raises float_flag_invalid; otherwise any NaN does.
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign (+0 == -0). */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Same sign, both finite: compare (exp, frac); for negative values
     * the larger magnitude is the smaller value, hence the sign flips.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Softfloat compare entry points for each width. */
#define COMPARE(name, attr, sz)                                         \
    static int attr                                                     \
    name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)  \
    {                                                                   \
        FloatParts64 pa, pb;                                            \
        float ## sz ## _unpack_canonical(&pa, a, s);                    \
        float ## sz ## _unpack_canonical(&pb, b, s);                    \
        return compare_floats(pa, pb, is_quiet, s);                     \
    }

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: the C99 isgreater/isless macros are quiet
 * (non-signaling) comparisons, so they can decide the ordered cases
 * without touching the softfloat flags; anything unordered falls
 * through to the soft path which sets flags as required.
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* As f32_compare above, for float64. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

/* bfloat16 has no hardfloat fast path; always go through softfloat. */
static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
 */
/* scalbn on a decomposed value: NaNs are (possibly) silenced/propagated,
 * zeros and infinities pass through unchanged, and normals simply have
 * N added to the exponent (clamped, see below).
 */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent. Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}

/* Per-format scalbn wrappers: unpack, scale, repack with rounding. */

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    /* Special cases first: NaN propagates, +-0 and +inf return
     * themselves, and any other negative input is invalid.
     */
    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top. Adding room for that is a
     * right shift. If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift. Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt: one result bit per iteration. */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast path: only non-negative zero/normal inputs may use the
 * host sqrtf(); anything else falls back to softfloat for correct
 * NaN/flag behaviour.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

/* As float32_sqrt above, for float64. */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

/* The NaN pattern itself comes from parts_default_nan(); each wrapper
 * only shifts the fraction down into the format's field position.
 */

float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3863 *----------------------------------------------------------------------------*/ 3864 3865 float16 float16_silence_nan(float16 a, float_status *status) 3866 { 3867 FloatParts64 p; 3868 3869 float16_unpack_raw(&p, a); 3870 p.frac <<= float16_params.frac_shift; 3871 parts_silence_nan(&p, status); 3872 p.frac >>= float16_params.frac_shift; 3873 return float16_pack_raw(&p); 3874 } 3875 3876 float32 float32_silence_nan(float32 a, float_status *status) 3877 { 3878 FloatParts64 p; 3879 3880 float32_unpack_raw(&p, a); 3881 p.frac <<= float32_params.frac_shift; 3882 parts_silence_nan(&p, status); 3883 p.frac >>= float32_params.frac_shift; 3884 return float32_pack_raw(&p); 3885 } 3886 3887 float64 float64_silence_nan(float64 a, float_status *status) 3888 { 3889 FloatParts64 p; 3890 3891 float64_unpack_raw(&p, a); 3892 p.frac <<= float64_params.frac_shift; 3893 parts_silence_nan(&p, status); 3894 p.frac >>= float64_params.frac_shift; 3895 return float64_pack_raw(&p); 3896 } 3897 3898 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3899 { 3900 FloatParts64 p; 3901 3902 bfloat16_unpack_raw(&p, a); 3903 p.frac <<= bfloat16_params.frac_shift; 3904 parts_silence_nan(&p, status); 3905 p.frac >>= bfloat16_params.frac_shift; 3906 return bfloat16_pack_raw(&p); 3907 } 3908 3909 float128 float128_silence_nan(float128 a, float_status *status) 3910 { 3911 FloatParts128 p; 3912 3913 float128_unpack_raw(&p, a); 3914 frac_shl(&p, float128_params.frac_shift); 3915 parts_silence_nan(&p, status); 3916 frac_shr(&p, float128_params.frac_shift); 3917 return float128_pack_raw(&p); 3918 } 3919 3920 /*---------------------------------------------------------------------------- 3921 | If `a' is denormal and we are in flush-to-zero mode then set the 3922 | input-denormal exception and return zero. Otherwise just return the value. 
3923 *----------------------------------------------------------------------------*/ 3924 3925 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3926 { 3927 if (p.exp == 0 && p.frac != 0) { 3928 float_raise(float_flag_input_denormal, status); 3929 return true; 3930 } 3931 3932 return false; 3933 } 3934 3935 float16 float16_squash_input_denormal(float16 a, float_status *status) 3936 { 3937 if (status->flush_inputs_to_zero) { 3938 FloatParts64 p; 3939 3940 float16_unpack_raw(&p, a); 3941 if (parts_squash_denormal(p, status)) { 3942 return float16_set_sign(float16_zero, p.sign); 3943 } 3944 } 3945 return a; 3946 } 3947 3948 float32 float32_squash_input_denormal(float32 a, float_status *status) 3949 { 3950 if (status->flush_inputs_to_zero) { 3951 FloatParts64 p; 3952 3953 float32_unpack_raw(&p, a); 3954 if (parts_squash_denormal(p, status)) { 3955 return float32_set_sign(float32_zero, p.sign); 3956 } 3957 } 3958 return a; 3959 } 3960 3961 float64 float64_squash_input_denormal(float64 a, float_status *status) 3962 { 3963 if (status->flush_inputs_to_zero) { 3964 FloatParts64 p; 3965 3966 float64_unpack_raw(&p, a); 3967 if (parts_squash_denormal(p, status)) { 3968 return float64_set_sign(float64_zero, p.sign); 3969 } 3970 } 3971 return a; 3972 } 3973 3974 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3975 { 3976 if (status->flush_inputs_to_zero) { 3977 FloatParts64 p; 3978 3979 bfloat16_unpack_raw(&p, a); 3980 if (parts_squash_denormal(p, status)) { 3981 return bfloat16_set_sign(bfloat16_zero, p.sign); 3982 } 3983 } 3984 return a; 3985 } 3986 3987 /*---------------------------------------------------------------------------- 3988 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3989 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3990 | input. If `zSign' is 1, the input is negated before being converted to an 3991 | integer. Bit 63 of `absZ' must be zero. 
| Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
 *----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* With 7 fraction bits, 0x40 is the half-ULP (tie) bit and 0x7f
     * rounds any non-zero fraction away from zero in that direction.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round-to-odd: only bump when the result bit would be even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range (or sign flipped by the negation): invalid. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
 *----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* absZ1 holds the entire fraction; its top bit is the tie bit. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Carry out of the 64-bit integer part: overflow. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie under nearest-even: clear the low bit. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
 *----------------------------------------------------------------------------*/

static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* Any non-zero negative value cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
 *----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 up to bit 23 (8 = 32 - 24 significand bits). */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller.
| If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* 7 extra fraction bits: 0x40 is the half-ULP tie bit, 0x7f rounds
     * any non-zero fraction away from zero in that direction.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round-to-odd: only bump when the kept LSB would be 0. */
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    if ( 0xFD <= (uint16_t) zExp ) {
        /* Overflow: round-to-odd and the truncating modes return the
         * maximal finite value instead of infinity.
         */
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
 *----------------------------------------------------------------------------*/

static float32
normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
 *----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading 1 up to bit 52 (11 = 64 - 53 significand bits). */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
 *----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
4363 *----------------------------------------------------------------------------*/ 4364 4365 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4366 float_status *status) 4367 { 4368 int8_t roundingMode; 4369 bool roundNearestEven; 4370 int roundIncrement, roundBits; 4371 bool isTiny; 4372 4373 roundingMode = status->float_rounding_mode; 4374 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4375 switch (roundingMode) { 4376 case float_round_nearest_even: 4377 case float_round_ties_away: 4378 roundIncrement = 0x200; 4379 break; 4380 case float_round_to_zero: 4381 roundIncrement = 0; 4382 break; 4383 case float_round_up: 4384 roundIncrement = zSign ? 0 : 0x3ff; 4385 break; 4386 case float_round_down: 4387 roundIncrement = zSign ? 0x3ff : 0; 4388 break; 4389 case float_round_to_odd: 4390 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4391 break; 4392 default: 4393 abort(); 4394 } 4395 roundBits = zSig & 0x3FF; 4396 if ( 0x7FD <= (uint16_t) zExp ) { 4397 if ( ( 0x7FD < zExp ) 4398 || ( ( zExp == 0x7FD ) 4399 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 4400 ) { 4401 bool overflow_to_inf = roundingMode != float_round_to_odd && 4402 roundIncrement != 0; 4403 float_raise(float_flag_overflow | float_flag_inexact, status); 4404 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 4405 } 4406 if ( zExp < 0 ) { 4407 if (status->flush_to_zero) { 4408 float_raise(float_flag_output_denormal, status); 4409 return packFloat64(zSign, 0, 0); 4410 } 4411 isTiny = status->tininess_before_rounding 4412 || (zExp < -1) 4413 || (zSig + roundIncrement < UINT64_C(0x8000000000000000)); 4414 shift64RightJamming( zSig, - zExp, &zSig ); 4415 zExp = 0; 4416 roundBits = zSig & 0x3FF; 4417 if (isTiny && roundBits) { 4418 float_raise(float_flag_underflow, status); 4419 } 4420 if (roundingMode == float_round_to_odd) { 4421 /* 4422 * For round-to-odd case, the roundIncrement depends on 4423 * zSig which just changed. 
4424 */ 4425 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 4426 } 4427 } 4428 } 4429 if (roundBits) { 4430 float_raise(float_flag_inexact, status); 4431 } 4432 zSig = ( zSig + roundIncrement )>>10; 4433 if (!(roundBits ^ 0x200) && roundNearestEven) { 4434 zSig &= ~1; 4435 } 4436 if ( zSig == 0 ) zExp = 0; 4437 return packFloat64( zSign, zExp, zSig ); 4438 4439 } 4440 4441 /*---------------------------------------------------------------------------- 4442 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4443 | and significand `zSig', and returns the proper double-precision floating- 4444 | point value corresponding to the abstract input. This routine is just like 4445 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 4446 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 4447 | floating-point exponent. 4448 *----------------------------------------------------------------------------*/ 4449 4450 static float64 4451 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig, 4452 float_status *status) 4453 { 4454 int8_t shiftCount; 4455 4456 shiftCount = clz64(zSig) - 1; 4457 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 4458 status); 4459 4460 } 4461 4462 /*---------------------------------------------------------------------------- 4463 | Normalizes the subnormal extended double-precision floating-point value 4464 | represented by the denormalized significand `aSig'. The normalized exponent 4465 | and significand are stored at the locations pointed to by `zExpPtr' and 4466 | `zSigPtr', respectively. 
4467 *----------------------------------------------------------------------------*/ 4468 4469 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4470 uint64_t *zSigPtr) 4471 { 4472 int8_t shiftCount; 4473 4474 shiftCount = clz64(aSig); 4475 *zSigPtr = aSig<<shiftCount; 4476 *zExpPtr = 1 - shiftCount; 4477 } 4478 4479 /*---------------------------------------------------------------------------- 4480 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4481 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4482 | and returns the proper extended double-precision floating-point value 4483 | corresponding to the abstract input. Ordinarily, the abstract value is 4484 | rounded and packed into the extended double-precision format, with the 4485 | inexact exception raised if the abstract input cannot be represented 4486 | exactly. However, if the abstract value is too large, the overflow and 4487 | inexact exceptions are raised and an infinity or maximal finite value is 4488 | returned. If the abstract value is too small, the input value is rounded to 4489 | a subnormal number, and the underflow and inexact exceptions are raised if 4490 | the abstract input cannot be represented exactly as a subnormal extended 4491 | double-precision floating-point number. 4492 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4493 | number of bits as single or double precision, respectively. Otherwise, the 4494 | result is rounded to the full precision of the extended double-precision 4495 | format. 4496 | The input significand must be normalized or smaller. If the input 4497 | significand is not normalized, `zExp' must be 0; in that case, the result 4498 | returned is a subnormal number, and it must not require rounding. The 4499 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4500 | Floating-Point Arithmetic. 
4501 *----------------------------------------------------------------------------*/ 4502 4503 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign, 4504 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 4505 float_status *status) 4506 { 4507 int8_t roundingMode; 4508 bool roundNearestEven, increment, isTiny; 4509 int64_t roundIncrement, roundMask, roundBits; 4510 4511 roundingMode = status->float_rounding_mode; 4512 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4513 if ( roundingPrecision == 80 ) goto precision80; 4514 if ( roundingPrecision == 64 ) { 4515 roundIncrement = UINT64_C(0x0000000000000400); 4516 roundMask = UINT64_C(0x00000000000007FF); 4517 } 4518 else if ( roundingPrecision == 32 ) { 4519 roundIncrement = UINT64_C(0x0000008000000000); 4520 roundMask = UINT64_C(0x000000FFFFFFFFFF); 4521 } 4522 else { 4523 goto precision80; 4524 } 4525 zSig0 |= ( zSig1 != 0 ); 4526 switch (roundingMode) { 4527 case float_round_nearest_even: 4528 case float_round_ties_away: 4529 break; 4530 case float_round_to_zero: 4531 roundIncrement = 0; 4532 break; 4533 case float_round_up: 4534 roundIncrement = zSign ? 0 : roundMask; 4535 break; 4536 case float_round_down: 4537 roundIncrement = zSign ? 
roundMask : 0; 4538 break; 4539 default: 4540 abort(); 4541 } 4542 roundBits = zSig0 & roundMask; 4543 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4544 if ( ( 0x7FFE < zExp ) 4545 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 4546 ) { 4547 goto overflow; 4548 } 4549 if ( zExp <= 0 ) { 4550 if (status->flush_to_zero) { 4551 float_raise(float_flag_output_denormal, status); 4552 return packFloatx80(zSign, 0, 0); 4553 } 4554 isTiny = status->tininess_before_rounding 4555 || (zExp < 0 ) 4556 || (zSig0 <= zSig0 + roundIncrement); 4557 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 4558 zExp = 0; 4559 roundBits = zSig0 & roundMask; 4560 if (isTiny && roundBits) { 4561 float_raise(float_flag_underflow, status); 4562 } 4563 if (roundBits) { 4564 float_raise(float_flag_inexact, status); 4565 } 4566 zSig0 += roundIncrement; 4567 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4568 roundIncrement = roundMask + 1; 4569 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4570 roundMask |= roundIncrement; 4571 } 4572 zSig0 &= ~ roundMask; 4573 return packFloatx80( zSign, zExp, zSig0 ); 4574 } 4575 } 4576 if (roundBits) { 4577 float_raise(float_flag_inexact, status); 4578 } 4579 zSig0 += roundIncrement; 4580 if ( zSig0 < roundIncrement ) { 4581 ++zExp; 4582 zSig0 = UINT64_C(0x8000000000000000); 4583 } 4584 roundIncrement = roundMask + 1; 4585 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 4586 roundMask |= roundIncrement; 4587 } 4588 zSig0 &= ~ roundMask; 4589 if ( zSig0 == 0 ) zExp = 0; 4590 return packFloatx80( zSign, zExp, zSig0 ); 4591 precision80: 4592 switch (roundingMode) { 4593 case float_round_nearest_even: 4594 case float_round_ties_away: 4595 increment = ((int64_t)zSig1 < 0); 4596 break; 4597 case float_round_to_zero: 4598 increment = 0; 4599 break; 4600 case float_round_up: 4601 increment = !zSign && zSig1; 4602 break; 4603 case float_round_down: 4604 increment = zSign && zSig1; 4605 break; 4606 default: 4607 abort(); 4608 } 4609 if 
( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 4610 if ( ( 0x7FFE < zExp ) 4611 || ( ( zExp == 0x7FFE ) 4612 && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) ) 4613 && increment 4614 ) 4615 ) { 4616 roundMask = 0; 4617 overflow: 4618 float_raise(float_flag_overflow | float_flag_inexact, status); 4619 if ( ( roundingMode == float_round_to_zero ) 4620 || ( zSign && ( roundingMode == float_round_up ) ) 4621 || ( ! zSign && ( roundingMode == float_round_down ) ) 4622 ) { 4623 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 4624 } 4625 return packFloatx80(zSign, 4626 floatx80_infinity_high, 4627 floatx80_infinity_low); 4628 } 4629 if ( zExp <= 0 ) { 4630 isTiny = status->tininess_before_rounding 4631 || (zExp < 0) 4632 || !increment 4633 || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF)); 4634 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 4635 zExp = 0; 4636 if (isTiny && zSig1) { 4637 float_raise(float_flag_underflow, status); 4638 } 4639 if (zSig1) { 4640 float_raise(float_flag_inexact, status); 4641 } 4642 switch (roundingMode) { 4643 case float_round_nearest_even: 4644 case float_round_ties_away: 4645 increment = ((int64_t)zSig1 < 0); 4646 break; 4647 case float_round_to_zero: 4648 increment = 0; 4649 break; 4650 case float_round_up: 4651 increment = !zSign && zSig1; 4652 break; 4653 case float_round_down: 4654 increment = zSign && zSig1; 4655 break; 4656 default: 4657 abort(); 4658 } 4659 if ( increment ) { 4660 ++zSig0; 4661 if (!(zSig1 << 1) && roundNearestEven) { 4662 zSig0 &= ~1; 4663 } 4664 if ( (int64_t) zSig0 < 0 ) zExp = 1; 4665 } 4666 return packFloatx80( zSign, zExp, zSig0 ); 4667 } 4668 } 4669 if (zSig1) { 4670 float_raise(float_flag_inexact, status); 4671 } 4672 if ( increment ) { 4673 ++zSig0; 4674 if ( zSig0 == 0 ) { 4675 ++zExp; 4676 zSig0 = UINT64_C(0x8000000000000000); 4677 } 4678 else { 4679 if (!(zSig1 << 1) && roundNearestEven) { 4680 zSig0 &= ~1; 4681 } 4682 } 4683 } 4684 else { 4685 if ( zSig0 == 0 ) zExp = 0; 4686 } 4687 return 
packFloatx80( zSign, zExp, zSig0 ); 4688 4689 } 4690 4691 /*---------------------------------------------------------------------------- 4692 | Takes an abstract floating-point value having sign `zSign', exponent 4693 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 4694 | and returns the proper extended double-precision floating-point value 4695 | corresponding to the abstract input. This routine is just like 4696 | `roundAndPackFloatx80' except that the input significand does not have to be 4697 | normalized. 4698 *----------------------------------------------------------------------------*/ 4699 4700 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 4701 bool zSign, int32_t zExp, 4702 uint64_t zSig0, uint64_t zSig1, 4703 float_status *status) 4704 { 4705 int8_t shiftCount; 4706 4707 if ( zSig0 == 0 ) { 4708 zSig0 = zSig1; 4709 zSig1 = 0; 4710 zExp -= 64; 4711 } 4712 shiftCount = clz64(zSig0); 4713 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4714 zExp -= shiftCount; 4715 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 4716 zSig0, zSig1, status); 4717 4718 } 4719 4720 /*---------------------------------------------------------------------------- 4721 | Returns the least-significant 64 fraction bits of the quadruple-precision 4722 | floating-point value `a'. 4723 *----------------------------------------------------------------------------*/ 4724 4725 static inline uint64_t extractFloat128Frac1( float128 a ) 4726 { 4727 4728 return a.low; 4729 4730 } 4731 4732 /*---------------------------------------------------------------------------- 4733 | Returns the most-significant 48 fraction bits of the quadruple-precision 4734 | floating-point value `a'. 
4735 *----------------------------------------------------------------------------*/ 4736 4737 static inline uint64_t extractFloat128Frac0( float128 a ) 4738 { 4739 4740 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4741 4742 } 4743 4744 /*---------------------------------------------------------------------------- 4745 | Returns the exponent bits of the quadruple-precision floating-point value 4746 | `a'. 4747 *----------------------------------------------------------------------------*/ 4748 4749 static inline int32_t extractFloat128Exp( float128 a ) 4750 { 4751 4752 return ( a.high>>48 ) & 0x7FFF; 4753 4754 } 4755 4756 /*---------------------------------------------------------------------------- 4757 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4758 *----------------------------------------------------------------------------*/ 4759 4760 static inline bool extractFloat128Sign(float128 a) 4761 { 4762 return a.high >> 63; 4763 } 4764 4765 /*---------------------------------------------------------------------------- 4766 | Normalizes the subnormal quadruple-precision floating-point value 4767 | represented by the denormalized significand formed by the concatenation of 4768 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4769 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4770 | significand are stored at the location pointed to by `zSig0Ptr', and the 4771 | least significant 64 bits of the normalized significand are stored at the 4772 | location pointed to by `zSig1Ptr'. 
4773 *----------------------------------------------------------------------------*/ 4774 4775 static void 4776 normalizeFloat128Subnormal( 4777 uint64_t aSig0, 4778 uint64_t aSig1, 4779 int32_t *zExpPtr, 4780 uint64_t *zSig0Ptr, 4781 uint64_t *zSig1Ptr 4782 ) 4783 { 4784 int8_t shiftCount; 4785 4786 if ( aSig0 == 0 ) { 4787 shiftCount = clz64(aSig1) - 15; 4788 if ( shiftCount < 0 ) { 4789 *zSig0Ptr = aSig1>>( - shiftCount ); 4790 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4791 } 4792 else { 4793 *zSig0Ptr = aSig1<<shiftCount; 4794 *zSig1Ptr = 0; 4795 } 4796 *zExpPtr = - shiftCount - 63; 4797 } 4798 else { 4799 shiftCount = clz64(aSig0) - 15; 4800 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4801 *zExpPtr = 1 - shiftCount; 4802 } 4803 4804 } 4805 4806 /*---------------------------------------------------------------------------- 4807 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4808 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4809 | floating-point value, returning the result. After being shifted into the 4810 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4811 | added together to form the most significant 32 bits of the result. This 4812 | means that any integer portion of `zSig0' will be added into the exponent. 4813 | Since a properly normalized significand will have an integer portion equal 4814 | to 1, the `zExp' input should be 1 less than the desired result exponent 4815 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4816 | significand. 
4817 *----------------------------------------------------------------------------*/ 4818 4819 static inline float128 4820 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4821 { 4822 float128 z; 4823 4824 z.low = zSig1; 4825 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4826 return z; 4827 } 4828 4829 /*---------------------------------------------------------------------------- 4830 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4831 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4832 | and `zSig2', and returns the proper quadruple-precision floating-point value 4833 | corresponding to the abstract input. Ordinarily, the abstract value is 4834 | simply rounded and packed into the quadruple-precision format, with the 4835 | inexact exception raised if the abstract input cannot be represented 4836 | exactly. However, if the abstract value is too large, the overflow and 4837 | inexact exceptions are raised and an infinity or maximal finite value is 4838 | returned. If the abstract value is too small, the input value is rounded to 4839 | a subnormal number, and the underflow and inexact exceptions are raised if 4840 | the abstract input cannot be represented exactly as a subnormal quadruple- 4841 | precision floating-point number. 4842 | The input significand must be normalized or smaller. If the input 4843 | significand is not normalized, `zExp' must be 0; in that case, the result 4844 | returned is a subnormal number, and it must not require rounding. In the 4845 | usual case that the input significand is normalized, `zExp' must be 1 less 4846 | than the ``true'' floating-point exponent. The handling of underflow and 4847 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4848 *----------------------------------------------------------------------------*/ 4849 4850 static float128 roundAndPackFloat128(bool zSign, int32_t zExp, 4851 uint64_t zSig0, uint64_t zSig1, 4852 uint64_t zSig2, float_status *status) 4853 { 4854 int8_t roundingMode; 4855 bool roundNearestEven, increment, isTiny; 4856 4857 roundingMode = status->float_rounding_mode; 4858 roundNearestEven = ( roundingMode == float_round_nearest_even ); 4859 switch (roundingMode) { 4860 case float_round_nearest_even: 4861 case float_round_ties_away: 4862 increment = ((int64_t)zSig2 < 0); 4863 break; 4864 case float_round_to_zero: 4865 increment = 0; 4866 break; 4867 case float_round_up: 4868 increment = !zSign && zSig2; 4869 break; 4870 case float_round_down: 4871 increment = zSign && zSig2; 4872 break; 4873 case float_round_to_odd: 4874 increment = !(zSig1 & 0x1) && zSig2; 4875 break; 4876 default: 4877 abort(); 4878 } 4879 if ( 0x7FFD <= (uint32_t) zExp ) { 4880 if ( ( 0x7FFD < zExp ) 4881 || ( ( zExp == 0x7FFD ) 4882 && eq128( 4883 UINT64_C(0x0001FFFFFFFFFFFF), 4884 UINT64_C(0xFFFFFFFFFFFFFFFF), 4885 zSig0, 4886 zSig1 4887 ) 4888 && increment 4889 ) 4890 ) { 4891 float_raise(float_flag_overflow | float_flag_inexact, status); 4892 if ( ( roundingMode == float_round_to_zero ) 4893 || ( zSign && ( roundingMode == float_round_up ) ) 4894 || ( ! 
zSign && ( roundingMode == float_round_down ) ) 4895 || (roundingMode == float_round_to_odd) 4896 ) { 4897 return 4898 packFloat128( 4899 zSign, 4900 0x7FFE, 4901 UINT64_C(0x0000FFFFFFFFFFFF), 4902 UINT64_C(0xFFFFFFFFFFFFFFFF) 4903 ); 4904 } 4905 return packFloat128( zSign, 0x7FFF, 0, 0 ); 4906 } 4907 if ( zExp < 0 ) { 4908 if (status->flush_to_zero) { 4909 float_raise(float_flag_output_denormal, status); 4910 return packFloat128(zSign, 0, 0, 0); 4911 } 4912 isTiny = status->tininess_before_rounding 4913 || (zExp < -1) 4914 || !increment 4915 || lt128(zSig0, zSig1, 4916 UINT64_C(0x0001FFFFFFFFFFFF), 4917 UINT64_C(0xFFFFFFFFFFFFFFFF)); 4918 shift128ExtraRightJamming( 4919 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 4920 zExp = 0; 4921 if (isTiny && zSig2) { 4922 float_raise(float_flag_underflow, status); 4923 } 4924 switch (roundingMode) { 4925 case float_round_nearest_even: 4926 case float_round_ties_away: 4927 increment = ((int64_t)zSig2 < 0); 4928 break; 4929 case float_round_to_zero: 4930 increment = 0; 4931 break; 4932 case float_round_up: 4933 increment = !zSign && zSig2; 4934 break; 4935 case float_round_down: 4936 increment = zSign && zSig2; 4937 break; 4938 case float_round_to_odd: 4939 increment = !(zSig1 & 0x1) && zSig2; 4940 break; 4941 default: 4942 abort(); 4943 } 4944 } 4945 } 4946 if (zSig2) { 4947 float_raise(float_flag_inexact, status); 4948 } 4949 if ( increment ) { 4950 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 4951 if ((zSig2 + zSig2 == 0) && roundNearestEven) { 4952 zSig1 &= ~1; 4953 } 4954 } 4955 else { 4956 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 4957 } 4958 return packFloat128( zSign, zExp, zSig0, zSig1 ); 4959 4960 } 4961 4962 /*---------------------------------------------------------------------------- 4963 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4964 | and significand formed by the concatenation of `zSig0' and `zSig1', and 4965 | returns the proper quadruple-precision floating-point 
value corresponding 4966 | to the abstract input. This routine is just like `roundAndPackFloat128' 4967 | except that the input significand has fewer bits and does not have to be 4968 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4969 | point exponent. 4970 *----------------------------------------------------------------------------*/ 4971 4972 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4973 uint64_t zSig0, uint64_t zSig1, 4974 float_status *status) 4975 { 4976 int8_t shiftCount; 4977 uint64_t zSig2; 4978 4979 if ( zSig0 == 0 ) { 4980 zSig0 = zSig1; 4981 zSig1 = 0; 4982 zExp -= 64; 4983 } 4984 shiftCount = clz64(zSig0) - 15; 4985 if ( 0 <= shiftCount ) { 4986 zSig2 = 0; 4987 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4988 } 4989 else { 4990 shift128ExtraRightJamming( 4991 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4992 } 4993 zExp -= shiftCount; 4994 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4995 4996 } 4997 4998 4999 /*---------------------------------------------------------------------------- 5000 | Returns the result of converting the 32-bit two's complement integer `a' 5001 | to the extended double-precision floating-point format. The conversion 5002 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5003 | Arithmetic. 5004 *----------------------------------------------------------------------------*/ 5005 5006 floatx80 int32_to_floatx80(int32_t a, float_status *status) 5007 { 5008 bool zSign; 5009 uint32_t absA; 5010 int8_t shiftCount; 5011 uint64_t zSig; 5012 5013 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5014 zSign = ( a < 0 ); 5015 absA = zSign ? 
- a : a; 5016 shiftCount = clz32(absA) + 32; 5017 zSig = absA; 5018 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 5019 5020 } 5021 5022 /*---------------------------------------------------------------------------- 5023 | Returns the result of converting the 32-bit two's complement integer `a' to 5024 | the quadruple-precision floating-point format. The conversion is performed 5025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5026 *----------------------------------------------------------------------------*/ 5027 5028 float128 int32_to_float128(int32_t a, float_status *status) 5029 { 5030 bool zSign; 5031 uint32_t absA; 5032 int8_t shiftCount; 5033 uint64_t zSig0; 5034 5035 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5036 zSign = ( a < 0 ); 5037 absA = zSign ? - a : a; 5038 shiftCount = clz32(absA) + 17; 5039 zSig0 = absA; 5040 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 5041 5042 } 5043 5044 /*---------------------------------------------------------------------------- 5045 | Returns the result of converting the 64-bit two's complement integer `a' 5046 | to the extended double-precision floating-point format. The conversion 5047 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5048 | Arithmetic. 5049 *----------------------------------------------------------------------------*/ 5050 5051 floatx80 int64_to_floatx80(int64_t a, float_status *status) 5052 { 5053 bool zSign; 5054 uint64_t absA; 5055 int8_t shiftCount; 5056 5057 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 5058 zSign = ( a < 0 ); 5059 absA = zSign ? 
- a : a; 5060 shiftCount = clz64(absA); 5061 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 5062 5063 } 5064 5065 /*---------------------------------------------------------------------------- 5066 | Returns the result of converting the 64-bit two's complement integer `a' to 5067 | the quadruple-precision floating-point format. The conversion is performed 5068 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5069 *----------------------------------------------------------------------------*/ 5070 5071 float128 int64_to_float128(int64_t a, float_status *status) 5072 { 5073 bool zSign; 5074 uint64_t absA; 5075 int8_t shiftCount; 5076 int32_t zExp; 5077 uint64_t zSig0, zSig1; 5078 5079 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 5080 zSign = ( a < 0 ); 5081 absA = zSign ? - a : a; 5082 shiftCount = clz64(absA) + 49; 5083 zExp = 0x406E - shiftCount; 5084 if ( 64 <= shiftCount ) { 5085 zSig1 = 0; 5086 zSig0 = absA; 5087 shiftCount -= 64; 5088 } 5089 else { 5090 zSig1 = absA; 5091 zSig0 = 0; 5092 } 5093 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5094 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5095 5096 } 5097 5098 /*---------------------------------------------------------------------------- 5099 | Returns the result of converting the 64-bit unsigned integer `a' 5100 | to the quadruple-precision floating-point format. The conversion is performed 5101 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5102 *----------------------------------------------------------------------------*/ 5103 5104 float128 uint64_to_float128(uint64_t a, float_status *status) 5105 { 5106 if (a == 0) { 5107 return float128_zero; 5108 } 5109 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 5110 } 5111 5112 /*---------------------------------------------------------------------------- 5113 | Returns the result of converting the single-precision floating-point value 5114 | `a' to the extended double-precision floating-point format. The conversion 5115 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5116 | Arithmetic. 5117 *----------------------------------------------------------------------------*/ 5118 5119 floatx80 float32_to_floatx80(float32 a, float_status *status) 5120 { 5121 bool aSign; 5122 int aExp; 5123 uint32_t aSig; 5124 5125 a = float32_squash_input_denormal(a, status); 5126 aSig = extractFloat32Frac( a ); 5127 aExp = extractFloat32Exp( a ); 5128 aSign = extractFloat32Sign( a ); 5129 if ( aExp == 0xFF ) { 5130 if (aSig) { 5131 floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status), 5132 status); 5133 return floatx80_silence_nan(res, status); 5134 } 5135 return packFloatx80(aSign, 5136 floatx80_infinity_high, 5137 floatx80_infinity_low); 5138 } 5139 if ( aExp == 0 ) { 5140 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 5141 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5142 } 5143 aSig |= 0x00800000; 5144 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 5145 5146 } 5147 5148 /*---------------------------------------------------------------------------- 5149 | Returns the result of converting the single-precision floating-point value 5150 | `a' to the double-precision floating-point format. The conversion is 5151 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5152 | Arithmetic. 
5153 *----------------------------------------------------------------------------*/ 5154 5155 float128 float32_to_float128(float32 a, float_status *status) 5156 { 5157 bool aSign; 5158 int aExp; 5159 uint32_t aSig; 5160 5161 a = float32_squash_input_denormal(a, status); 5162 aSig = extractFloat32Frac( a ); 5163 aExp = extractFloat32Exp( a ); 5164 aSign = extractFloat32Sign( a ); 5165 if ( aExp == 0xFF ) { 5166 if (aSig) { 5167 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 5168 } 5169 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5170 } 5171 if ( aExp == 0 ) { 5172 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5173 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 5174 --aExp; 5175 } 5176 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 5177 5178 } 5179 5180 /*---------------------------------------------------------------------------- 5181 | Returns the remainder of the single-precision floating-point value `a' 5182 | with respect to the corresponding value `b'. The operation is performed 5183 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* a is NaN or Inf; rem(Inf, b) is an invalid operation.  */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* rem(a, Inf) = a for finite a.  */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(a, 0) is invalid.  */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Both operands are now finite, nonzero and normalized; make the
     * implicit integer bit explicit.  */
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 32-bit division step suffices.  */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;   /* |a| < |b|/2: remainder is a */
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce 62 quotient bits per
         * iteration using the 128/64 division estimate.  */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;   /* never overestimate q */
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Step the remainder down past zero, then choose between the last
     * non-negative and first negative candidates: pick the one closer to
     * zero; on a tie, pick the one giving an even quotient (q & 1).  */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1.  -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2.  -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/

/* Taylor-series coefficients 1/n! for n = 1..15, stored as float64
 * constants (identity 2 above).  */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;

    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = +0, 2^+Inf = +Inf.  */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) = 1 exactly; other subnormals fall through.  */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);   /* 2^a = e^(a*ln2) */

    /* Evaluate the Taylor series: r = 1 + sum(x^n / n!).  */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );  /* log2(0) = -Inf */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative value is invalid.  */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        return a;   /* log2(+Inf) = +Inf */
    }

    /* Integer part of the result is the unbiased exponent; generate the
     * fraction bits one at a time by repeatedly squaring the significand
     * and testing for a carry into the next binade.  */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert via the canonical form, then quieten it.  */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias the exponent and shift the 53-bit significand (implicit bit
     * made explicit) into the top of the 64-bit floatx80 fraction.  */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float64_to_float128(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig, zSig0, zSig1;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
        --aExp;   /* compensate for the implicit bit normalization */
    }
    /* Spread the 52-bit fraction across the 112-bit float128 fraction.  */
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );

}


/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    if ( aExp == 0x7FF ) {
        /* a is NaN or Inf; rem(Inf, b) is an invalid operation.  */
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;   /* rem(a, Inf) = a for finite a */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(a, 0) is invalid.  */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Significands with explicit integer bit, left-aligned in 64 bits.  */
    expDiff = aExp - bExp;
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;   /* |a| < |b|/2: remainder is a */
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce 62 quotient bits per iteration using a deliberately
     * low 128/64 division estimate (q is never allowed to overshoot).  */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past zero, then keep the remainder closest to zero; ties go
     * to the even quotient (q & 1).  */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;

    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );  /* log2(0) = -Inf */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative value is invalid.  */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        return a;   /* log2(+Inf) = +Inf */
    }

    /* Integer part is the unbiased exponent; fraction bits are generated
     * one at a time by repeated squaring of the significand.  */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            aSig >>= 1;
            zSig |= i;
        }
5617 } 5618 5619 if ( zSign ) 5620 zSig = -zSig; 5621 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 5622 } 5623 5624 /*---------------------------------------------------------------------------- 5625 | Returns the result of converting the extended double-precision floating- 5626 | point value `a' to the 32-bit two's complement integer format. The 5627 | conversion is performed according to the IEC/IEEE Standard for Binary 5628 | Floating-Point Arithmetic---which means in particular that the conversion 5629 | is rounded according to the current rounding mode. If `a' is a NaN, the 5630 | largest positive integer is returned. Otherwise, if the conversion 5631 | overflows, the largest integer with the same sign as `a' is returned. 5632 *----------------------------------------------------------------------------*/ 5633 5634 int32_t floatx80_to_int32(floatx80 a, float_status *status) 5635 { 5636 bool aSign; 5637 int32_t aExp, shiftCount; 5638 uint64_t aSig; 5639 5640 if (floatx80_invalid_encoding(a)) { 5641 float_raise(float_flag_invalid, status); 5642 return 1 << 31; 5643 } 5644 aSig = extractFloatx80Frac( a ); 5645 aExp = extractFloatx80Exp( a ); 5646 aSign = extractFloatx80Sign( a ); 5647 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5648 shiftCount = 0x4037 - aExp; 5649 if ( shiftCount <= 0 ) shiftCount = 1; 5650 shift64RightJamming( aSig, shiftCount, &aSig ); 5651 return roundAndPackInt32(aSign, aSig, status); 5652 5653 } 5654 5655 /*---------------------------------------------------------------------------- 5656 | Returns the result of converting the extended double-precision floating- 5657 | point value `a' to the 32-bit two's complement integer format. The 5658 | conversion is performed according to the IEC/IEEE Standard for Binary 5659 | Floating-Point Arithmetic, except that the conversion is always rounded 5660 | toward zero. If `a' is a NaN, the largest positive integer is returned. 
5661 | Otherwise, if the conversion overflows, the largest integer with the same 5662 | sign as `a' is returned. 5663 *----------------------------------------------------------------------------*/ 5664 5665 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5666 { 5667 bool aSign; 5668 int32_t aExp, shiftCount; 5669 uint64_t aSig, savedASig; 5670 int32_t z; 5671 5672 if (floatx80_invalid_encoding(a)) { 5673 float_raise(float_flag_invalid, status); 5674 return 1 << 31; 5675 } 5676 aSig = extractFloatx80Frac( a ); 5677 aExp = extractFloatx80Exp( a ); 5678 aSign = extractFloatx80Sign( a ); 5679 if ( 0x401E < aExp ) { 5680 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5681 goto invalid; 5682 } 5683 else if ( aExp < 0x3FFF ) { 5684 if (aExp || aSig) { 5685 float_raise(float_flag_inexact, status); 5686 } 5687 return 0; 5688 } 5689 shiftCount = 0x403E - aExp; 5690 savedASig = aSig; 5691 aSig >>= shiftCount; 5692 z = aSig; 5693 if ( aSign ) z = - z; 5694 if ( ( z < 0 ) ^ aSign ) { 5695 invalid: 5696 float_raise(float_flag_invalid, status); 5697 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5698 } 5699 if ( ( aSig<<shiftCount ) != savedASig ) { 5700 float_raise(float_flag_inexact, status); 5701 } 5702 return z; 5703 5704 } 5705 5706 /*---------------------------------------------------------------------------- 5707 | Returns the result of converting the extended double-precision floating- 5708 | point value `a' to the 64-bit two's complement integer format. The 5709 | conversion is performed according to the IEC/IEEE Standard for Binary 5710 | Floating-Point Arithmetic---which means in particular that the conversion 5711 | is rounded according to the current rounding mode. If `a' is a NaN, 5712 | the largest positive integer is returned. Otherwise, if the conversion 5713 | overflows, the largest integer with the same sign as `a' is returned. 
5714 *----------------------------------------------------------------------------*/ 5715 5716 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5717 { 5718 bool aSign; 5719 int32_t aExp, shiftCount; 5720 uint64_t aSig, aSigExtra; 5721 5722 if (floatx80_invalid_encoding(a)) { 5723 float_raise(float_flag_invalid, status); 5724 return 1ULL << 63; 5725 } 5726 aSig = extractFloatx80Frac( a ); 5727 aExp = extractFloatx80Exp( a ); 5728 aSign = extractFloatx80Sign( a ); 5729 shiftCount = 0x403E - aExp; 5730 if ( shiftCount <= 0 ) { 5731 if ( shiftCount ) { 5732 float_raise(float_flag_invalid, status); 5733 if (!aSign || floatx80_is_any_nan(a)) { 5734 return INT64_MAX; 5735 } 5736 return INT64_MIN; 5737 } 5738 aSigExtra = 0; 5739 } 5740 else { 5741 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5742 } 5743 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5744 5745 } 5746 5747 /*---------------------------------------------------------------------------- 5748 | Returns the result of converting the extended double-precision floating- 5749 | point value `a' to the 64-bit two's complement integer format. The 5750 | conversion is performed according to the IEC/IEEE Standard for Binary 5751 | Floating-Point Arithmetic, except that the conversion is always rounded 5752 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5753 | Otherwise, if the conversion overflows, the largest integer with the same 5754 | sign as `a' is returned. 
5755 *----------------------------------------------------------------------------*/ 5756 5757 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5758 { 5759 bool aSign; 5760 int32_t aExp, shiftCount; 5761 uint64_t aSig; 5762 int64_t z; 5763 5764 if (floatx80_invalid_encoding(a)) { 5765 float_raise(float_flag_invalid, status); 5766 return 1ULL << 63; 5767 } 5768 aSig = extractFloatx80Frac( a ); 5769 aExp = extractFloatx80Exp( a ); 5770 aSign = extractFloatx80Sign( a ); 5771 shiftCount = aExp - 0x403E; 5772 if ( 0 <= shiftCount ) { 5773 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5774 if ( ( a.high != 0xC03E ) || aSig ) { 5775 float_raise(float_flag_invalid, status); 5776 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5777 return INT64_MAX; 5778 } 5779 } 5780 return INT64_MIN; 5781 } 5782 else if ( aExp < 0x3FFF ) { 5783 if (aExp | aSig) { 5784 float_raise(float_flag_inexact, status); 5785 } 5786 return 0; 5787 } 5788 z = aSig>>( - shiftCount ); 5789 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5790 float_raise(float_flag_inexact, status); 5791 } 5792 if ( aSign ) z = - z; 5793 return z; 5794 5795 } 5796 5797 /*---------------------------------------------------------------------------- 5798 | Returns the result of converting the extended double-precision floating- 5799 | point value `a' to the single-precision floating-point format. The 5800 | conversion is performed according to the IEC/IEEE Standard for Binary 5801 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 floatx80_to_float32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert via the canonical form, then quieten it.  */
            float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
                                             status);
            return float32_silence_nan(res, status);
        }
        return packFloat32( aSign, 0xFF, 0 );
    }
    /* Narrow the 64-bit fraction to 31 bits, jamming shifted-out bits,
     * and rebias the exponent for float32.  */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;
    return roundAndPackFloat32(aSign, aExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 floatx80_to_float64(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig, zSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 ) ) {
            /* NaN: convert via the canonical form, then quieten it.  */
            float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
                                             status);
            return float64_silence_nan(res, status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Narrow the 64-bit fraction to 63 bits with jamming and rebias.  */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;
    return roundAndPackFloat64(aSign, aExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the quadruple-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
5870 *----------------------------------------------------------------------------*/ 5871 5872 float128 floatx80_to_float128(floatx80 a, float_status *status) 5873 { 5874 bool aSign; 5875 int aExp; 5876 uint64_t aSig, zSig0, zSig1; 5877 5878 if (floatx80_invalid_encoding(a)) { 5879 float_raise(float_flag_invalid, status); 5880 return float128_default_nan(status); 5881 } 5882 aSig = extractFloatx80Frac( a ); 5883 aExp = extractFloatx80Exp( a ); 5884 aSign = extractFloatx80Sign( a ); 5885 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5886 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5887 status); 5888 return float128_silence_nan(res, status); 5889 } 5890 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5891 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5892 5893 } 5894 5895 /*---------------------------------------------------------------------------- 5896 | Rounds the extended double-precision floating-point value `a' 5897 | to the precision provided by floatx80_rounding_precision and returns the 5898 | result as an extended double-precision floating-point value. 5899 | The operation is performed according to the IEC/IEEE Standard for Binary 5900 | Floating-Point Arithmetic. 5901 *----------------------------------------------------------------------------*/ 5902 5903 floatx80 floatx80_round(floatx80 a, float_status *status) 5904 { 5905 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5906 extractFloatx80Sign(a), 5907 extractFloatx80Exp(a), 5908 extractFloatx80Frac(a), 0, status); 5909 } 5910 5911 /*---------------------------------------------------------------------------- 5912 | Rounds the extended double-precision floating-point value `a' to an integer, 5913 | and returns the result as an extended quadruple-precision floating-point 5914 | value. The operation is performed according to the IEC/IEEE Standard for 5915 | Binary Floating-Point Arithmetic. 
5916 *----------------------------------------------------------------------------*/ 5917 5918 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5919 { 5920 bool aSign; 5921 int32_t aExp; 5922 uint64_t lastBitMask, roundBitsMask; 5923 floatx80 z; 5924 5925 if (floatx80_invalid_encoding(a)) { 5926 float_raise(float_flag_invalid, status); 5927 return floatx80_default_nan(status); 5928 } 5929 aExp = extractFloatx80Exp( a ); 5930 if ( 0x403E <= aExp ) { 5931 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5932 return propagateFloatx80NaN(a, a, status); 5933 } 5934 return a; 5935 } 5936 if ( aExp < 0x3FFF ) { 5937 if ( ( aExp == 0 ) 5938 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5939 return a; 5940 } 5941 float_raise(float_flag_inexact, status); 5942 aSign = extractFloatx80Sign( a ); 5943 switch (status->float_rounding_mode) { 5944 case float_round_nearest_even: 5945 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5946 ) { 5947 return 5948 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5949 } 5950 break; 5951 case float_round_ties_away: 5952 if (aExp == 0x3FFE) { 5953 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5954 } 5955 break; 5956 case float_round_down: 5957 return 5958 aSign ? 5959 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5960 : packFloatx80( 0, 0, 0 ); 5961 case float_round_up: 5962 return 5963 aSign ? 
packFloatx80( 1, 0, 0 ) 5964 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5965 5966 case float_round_to_zero: 5967 break; 5968 default: 5969 g_assert_not_reached(); 5970 } 5971 return packFloatx80( aSign, 0, 0 ); 5972 } 5973 lastBitMask = 1; 5974 lastBitMask <<= 0x403E - aExp; 5975 roundBitsMask = lastBitMask - 1; 5976 z = a; 5977 switch (status->float_rounding_mode) { 5978 case float_round_nearest_even: 5979 z.low += lastBitMask>>1; 5980 if ((z.low & roundBitsMask) == 0) { 5981 z.low &= ~lastBitMask; 5982 } 5983 break; 5984 case float_round_ties_away: 5985 z.low += lastBitMask >> 1; 5986 break; 5987 case float_round_to_zero: 5988 break; 5989 case float_round_up: 5990 if (!extractFloatx80Sign(z)) { 5991 z.low += roundBitsMask; 5992 } 5993 break; 5994 case float_round_down: 5995 if (extractFloatx80Sign(z)) { 5996 z.low += roundBitsMask; 5997 } 5998 break; 5999 default: 6000 abort(); 6001 } 6002 z.low &= ~ roundBitsMask; 6003 if ( z.low == 0 ) { 6004 ++z.high; 6005 z.low = UINT64_C(0x8000000000000000); 6006 } 6007 if (z.low != a.low) { 6008 float_raise(float_flag_inexact, status); 6009 } 6010 return z; 6011 6012 } 6013 6014 /*---------------------------------------------------------------------------- 6015 | Returns the result of adding the absolute values of the extended double- 6016 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 6017 | negated before being returned. `zSign' is ignored if the result is a NaN. 6018 | The addition is performed according to the IEC/IEEE Standard for Binary 6019 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's significand to a.  */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;   /* Inf + finite = Inf */
        }
        if ( bExp == 0 ) --expDiff;   /* subnormals have exponent 1 */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's significand to b.  */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum always carries, except possibly for
         * subnormal operands.  */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* If the top bit is set there was no carry; otherwise shift right
     * one place and restore the explicit integer bit.  */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
6100 *----------------------------------------------------------------------------*/ 6101 6102 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 6103 float_status *status) 6104 { 6105 int32_t aExp, bExp, zExp; 6106 uint64_t aSig, bSig, zSig0, zSig1; 6107 int32_t expDiff; 6108 6109 aSig = extractFloatx80Frac( a ); 6110 aExp = extractFloatx80Exp( a ); 6111 bSig = extractFloatx80Frac( b ); 6112 bExp = extractFloatx80Exp( b ); 6113 expDiff = aExp - bExp; 6114 if ( 0 < expDiff ) goto aExpBigger; 6115 if ( expDiff < 0 ) goto bExpBigger; 6116 if ( aExp == 0x7FFF ) { 6117 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6118 return propagateFloatx80NaN(a, b, status); 6119 } 6120 float_raise(float_flag_invalid, status); 6121 return floatx80_default_nan(status); 6122 } 6123 if ( aExp == 0 ) { 6124 aExp = 1; 6125 bExp = 1; 6126 } 6127 zSig1 = 0; 6128 if ( bSig < aSig ) goto aBigger; 6129 if ( aSig < bSig ) goto bBigger; 6130 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 6131 bExpBigger: 6132 if ( bExp == 0x7FFF ) { 6133 if ((uint64_t)(bSig << 1)) { 6134 return propagateFloatx80NaN(a, b, status); 6135 } 6136 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 6137 floatx80_infinity_low); 6138 } 6139 if ( aExp == 0 ) ++expDiff; 6140 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6141 bBigger: 6142 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 6143 zExp = bExp; 6144 zSign ^= 1; 6145 goto normalizeRoundAndPack; 6146 aExpBigger: 6147 if ( aExp == 0x7FFF ) { 6148 if ((uint64_t)(aSig << 1)) { 6149 return propagateFloatx80NaN(a, b, status); 6150 } 6151 return a; 6152 } 6153 if ( bExp == 0 ) --expDiff; 6154 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 6155 aBigger: 6156 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 6157 zExp = aExp; 6158 normalizeRoundAndPack: 6159 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6160 zSign, zExp, zSig0, zSig1, status); 6161 } 6162 6163 
/*---------------------------------------------------------------------------- 6164 | Returns the result of adding the extended double-precision floating-point 6165 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6166 | Standard for Binary Floating-Point Arithmetic. 6167 *----------------------------------------------------------------------------*/ 6168 6169 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6170 { 6171 bool aSign, bSign; 6172 6173 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6174 float_raise(float_flag_invalid, status); 6175 return floatx80_default_nan(status); 6176 } 6177 aSign = extractFloatx80Sign( a ); 6178 bSign = extractFloatx80Sign( b ); 6179 if ( aSign == bSign ) { 6180 return addFloatx80Sigs(a, b, aSign, status); 6181 } 6182 else { 6183 return subFloatx80Sigs(a, b, aSign, status); 6184 } 6185 6186 } 6187 6188 /*---------------------------------------------------------------------------- 6189 | Returns the result of subtracting the extended double-precision floating- 6190 | point values `a' and `b'. The operation is performed according to the 6191 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Like signs subtract the magnitudes; unlike signs add them.  */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( bExp | bSig ) == 0 ) goto invalid;   /* Inf * 0 is invalid */
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            /* 0 * Inf is invalid.  */
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    /* 64x64 -> 128-bit product; normalize one bit left if the product of
     * the two normalized significands is below 2^127.  */
    zExp = aExp + bExp - 0x3FFE;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity is an invalid operation. */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* finite / infinity yields a signed zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; anything else / 0 is a division by zero. */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Pre-scale the dividend so the quotient estimate fits in 64 bits. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    /* The estimate may be slightly too large; correct it downward until
       the remainder is non-negative. */
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    /* Refine the low quotient word only when it is small enough to
       affect rounding. */
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any nonzero remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
| if 'mod' is false; if 'mod' is true, return the remainder based on truncating
| the quotient toward zero instead. '*quotient' is set to the low 64 bits of
| the absolute value of the integer quotient.
*----------------------------------------------------------------------------*/

floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
                         float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff, aExpOrig;
    uint64_t aSig0, aSig1, bSig;
    uint64_t q, term0, term1, alternateASig0, alternateASig1;

    *quotient = 0;
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExpOrig = aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    if ( aExp == 0x7FFF ) {
        /* NaN operands propagate; remainder of an infinity is invalid. */
        if (    (uint64_t) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if (aExp == 0 && aSig0 >> 63) {
            /*
             * Pseudo-denormal argument must be returned in normalized
             * form.
             */
            return packFloatx80(aSign, 1, aSig0);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: the remainder is `a' itself (truncating mode or
           far smaller magnitude). */
        if ( mod || expDiff < -1 ) {
            if (aExp == 1 && aExpOrig == 0) {
                /*
                 * Pseudo-denormal argument must be returned in
                 * normalized form.
                 */
                return packFloatx80(aSign, aExp, aSig0);
            }
            return a;
        }
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    *quotient = q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Long division, 62 quotient bits per iteration; the estimate is
       deliberately lowered by 2 so it never overshoots here. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
        *quotient <<= 62;
        *quotient += q;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial iteration: produce the remaining expDiff bits and
           correct the quotient upward while the remainder stays >= b. */
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
        if (expDiff < 64) {
            *quotient <<= expDiff;
        } else {
            *quotient = 0;
        }
        *quotient += q;
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    if (!mod) {
        /* IEEE remainder: round the quotient to nearest-even by possibly
           taking the complementary remainder (b - r) with flipped sign. */
        sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
        if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
             || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                  && ( q & 1 ) )
           ) {
            aSig0 = alternateASig0;
            aSig1 = alternateASig1;
            zSign = ! zSign;
            ++*quotient;
        }
    }
    return
        normalizeRoundAndPackFloatx80(
            80, zSign, bExp + expDiff, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the extended double-precision floating-point value
| `a' with respect to the corresponding value `b'.
The operation is performed 6495 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6496 *----------------------------------------------------------------------------*/ 6497 6498 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6499 { 6500 uint64_t quotient; 6501 return floatx80_modrem(a, b, false, "ient, status); 6502 } 6503 6504 /*---------------------------------------------------------------------------- 6505 | Returns the remainder of the extended double-precision floating-point value 6506 | `a' with respect to the corresponding value `b', with the quotient truncated 6507 | toward zero. 6508 *----------------------------------------------------------------------------*/ 6509 6510 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6511 { 6512 uint64_t quotient; 6513 return floatx80_modrem(a, b, true, "ient, status); 6514 } 6515 6516 /*---------------------------------------------------------------------------- 6517 | Returns the square root of the extended double-precision floating-point 6518 | value `a'. The operation is performed according to the IEC/IEEE Standard 6519 | for Binary Floating-Point Arithmetic. 6520 *----------------------------------------------------------------------------*/ 6521 6522 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6523 { 6524 bool aSign; 6525 int32_t aExp, zExp; 6526 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6527 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6528 6529 if (floatx80_invalid_encoding(a)) { 6530 float_raise(float_flag_invalid, status); 6531 return floatx80_default_nan(status); 6532 } 6533 aSig0 = extractFloatx80Frac( a ); 6534 aExp = extractFloatx80Exp( a ); 6535 aSign = extractFloatx80Sign( a ); 6536 if ( aExp == 0x7FFF ) { 6537 if ((uint64_t)(aSig0 << 1)) { 6538 return propagateFloatx80NaN(a, a, status); 6539 } 6540 if ( ! 
aSign ) return a; 6541 goto invalid; 6542 } 6543 if ( aSign ) { 6544 if ( ( aExp | aSig0 ) == 0 ) return a; 6545 invalid: 6546 float_raise(float_flag_invalid, status); 6547 return floatx80_default_nan(status); 6548 } 6549 if ( aExp == 0 ) { 6550 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6551 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6552 } 6553 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6554 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6555 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6556 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6557 doubleZSig0 = zSig0<<1; 6558 mul64To128( zSig0, zSig0, &term0, &term1 ); 6559 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6560 while ( (int64_t) rem0 < 0 ) { 6561 --zSig0; 6562 doubleZSig0 -= 2; 6563 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6564 } 6565 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6566 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6567 if ( zSig1 == 0 ) zSig1 = 1; 6568 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6569 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6570 mul64To128( zSig1, zSig1, &term2, &term3 ); 6571 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6572 while ( (int64_t) rem1 < 0 ) { 6573 --zSig1; 6574 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6575 term3 |= 1; 6576 term2 |= doubleZSig0; 6577 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6578 } 6579 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6580 } 6581 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6582 zSig0 |= doubleZSig0; 6583 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6584 0, zExp, zSig0, zSig1, status); 6585 } 6586 6587 /*---------------------------------------------------------------------------- 6588 | Returns the result of converting the quadruple-precision floating-point 6589 | value `a' to the 32-bit two's complement integer format. 
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Treat NaNs as positive so they saturate to the largest integer. */
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
    /* Make the integer bit explicit for normal numbers. */
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    /* Fold the low fraction word into a sticky bit. */
    aSig0 |= ( aSig1 != 0 );
    shiftCount = 0x4028 - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
    return roundAndPackInt32(aSign, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    aSig0 |= ( aSig1 != 0 );
    if ( 0x401E < aExp ) {
        /* Magnitude >= 2^31 (or NaN): overflow / invalid. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    else if ( aExp < 0x3FFF ) {
        /* Magnitude < 1 truncates to zero; nonzero values are inexact. */
        if (aExp || aSig0) {
            float_raise(float_flag_inexact, status);
        }
        return 0;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    if ( aSign ) z = - z;
    /* A sign mismatch after negation means the value overflowed int32. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? INT32_MIN : INT32_MAX;
    }
    /* Any bits shifted out mean the truncation was inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  Otherwise, if the conversion overflows, the
| largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = 0x402F - aExp;
    if ( shiftCount <= 0 ) {
        if ( 0x403E < aExp ) {
            /* Magnitude too large (or NaN/inf): saturate.  Only exactly
               -2^63 maps to INT64_MIN. */
            float_raise(float_flag_invalid, status);
            if (    ! aSign
                 || (    ( aExp == 0x7FFF )
                      && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
                    )
               ) {
                return INT64_MAX;
            }
            return INT64_MIN;
        }
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
    }
    else {
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
    }
    return roundAndPackInt64(aSign, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
            /* Values in [-2^63, -2^63 + epsilon) truncate to INT64_MIN;
               anything else this large overflows. */
            if (    ( a.high == UINT64_C(0xC03E000000000000) )
                 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
                if (aSig1) {
                    float_raise(float_flag_inexact, status);
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return INT64_MAX;
                }
            }
            return INT64_MIN;
        }
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude < 1 truncates to zero. */
            if ( aExp | aSig0 | aSig1 ) {
                float_raise(float_flag_inexact, status);
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            float_raise(float_flag_inexact, status);
        }
    }
    if ( aSign ) z = - z;
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point value
| `a' to the 64-bit unsigned integer format.
The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode.  If `a' is a NaN, the largest
| positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float128_to_uint64(float128 a, float_status *status)
{
    bool aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig0, aSig1;

    aSig0 = extractFloat128Frac0(a);
    aSig1 = extractFloat128Frac1(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    if (aSign && (aExp > 0x3FFE)) {
        /* Negative with magnitude >= 1 cannot be represented; negative
           NaNs saturate high, other negatives to zero. */
        float_raise(float_flag_invalid, status);
        if (float128_is_any_nan(a)) {
            return UINT64_MAX;
        } else {
            return 0;
        }
    }
    if (aExp) {
        /* Make the integer bit explicit for normal numbers. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shiftCount = 0x402F - aExp;
    if (shiftCount <= 0) {
        if (0x403E < aExp) {
            /* Too large (covers +inf and positive NaNs): overflow. */
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
    } else {
        shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
    }
    return roundAndPackUint64(aSign, aSig0, aSig1, status);
}

uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
{
    uint64_t v;
    signed char current_rounding_mode = status->float_rounding_mode;

    /* Temporarily force round-to-zero, then restore the caller's mode. */
    set_float_rounding_mode(float_round_to_zero, status);
    v = float128_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);

    return v;
}

/*---------------------------------------------------------------------------- 6832 | Returns the result of converting the quadruple-precision floating-point 6833 | value `a' to the 32-bit unsigned integer format. The conversion 6834 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6835 | Arithmetic except that the conversion is always rounded toward zero. 6836 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6837 | if the conversion overflows, the largest unsigned integer is returned. 6838 | If 'a' is negative, the value is rounded and zero is returned; negative 6839 | values that do not round to zero will raise the inexact exception. 6840 *----------------------------------------------------------------------------*/ 6841 6842 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6843 { 6844 uint64_t v; 6845 uint32_t res; 6846 int old_exc_flags = get_float_exception_flags(status); 6847 6848 v = float128_to_uint64_round_to_zero(a, status); 6849 if (v > 0xffffffff) { 6850 res = 0xffffffff; 6851 } else { 6852 return v; 6853 } 6854 set_float_exception_flags(old_exc_flags, status); 6855 float_raise(float_flag_invalid, status); 6856 return res; 6857 } 6858 6859 /*---------------------------------------------------------------------------- 6860 | Returns the result of converting the quadruple-precision floating-point value 6861 | `a' to the 32-bit unsigned integer format. The conversion is 6862 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6863 | Arithmetic---which means in particular that the conversion is rounded 6864 | according to the current rounding mode. If `a' is a NaN, the largest 6865 | positive integer is returned. If the conversion overflows, the 6866 | largest unsigned integer is returned. If 'a' is negative, the value is 6867 | rounded and zero is returned; negative values that do not round to zero 6868 | will raise the inexact exception. 
6869 *----------------------------------------------------------------------------*/ 6870 6871 uint32_t float128_to_uint32(float128 a, float_status *status) 6872 { 6873 uint64_t v; 6874 uint32_t res; 6875 int old_exc_flags = get_float_exception_flags(status); 6876 6877 v = float128_to_uint64(a, status); 6878 if (v > 0xffffffff) { 6879 res = 0xffffffff; 6880 } else { 6881 return v; 6882 } 6883 set_float_exception_flags(old_exc_flags, status); 6884 float_raise(float_flag_invalid, status); 6885 return res; 6886 } 6887 6888 /*---------------------------------------------------------------------------- 6889 | Returns the result of converting the quadruple-precision floating-point 6890 | value `a' to the single-precision floating-point format. The conversion 6891 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6892 | Arithmetic. 6893 *----------------------------------------------------------------------------*/ 6894 6895 float32 float128_to_float32(float128 a, float_status *status) 6896 { 6897 bool aSign; 6898 int32_t aExp; 6899 uint64_t aSig0, aSig1; 6900 uint32_t zSig; 6901 6902 aSig1 = extractFloat128Frac1( a ); 6903 aSig0 = extractFloat128Frac0( a ); 6904 aExp = extractFloat128Exp( a ); 6905 aSign = extractFloat128Sign( a ); 6906 if ( aExp == 0x7FFF ) { 6907 if ( aSig0 | aSig1 ) { 6908 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6909 } 6910 return packFloat32( aSign, 0xFF, 0 ); 6911 } 6912 aSig0 |= ( aSig1 != 0 ); 6913 shift64RightJamming( aSig0, 18, &aSig0 ); 6914 zSig = aSig0; 6915 if ( aExp || zSig ) { 6916 zSig |= 0x40000000; 6917 aExp -= 0x3F81; 6918 } 6919 return roundAndPackFloat32(aSign, aExp, zSig, status); 6920 6921 } 6922 6923 /*---------------------------------------------------------------------------- 6924 | Returns the result of converting the quadruple-precision floating-point 6925 | value `a' to the double-precision floating-point format. 
The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float64 float128_to_float64(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
        }
        return packFloat64( aSign, 0x7FF, 0 );
    }
    /* Align the fraction, folding shifted-out low bits into a sticky bit. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    aSig0 |= ( aSig1 != 0 );
    if ( aExp || aSig0 ) {
        aSig0 |= UINT64_C(0x4000000000000000);  /* explicit integer bit */
        aExp -= 0x3C01;                         /* rebias for double */
    }
    return roundAndPackFloat64(aSign, aExp, aSig0, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the extended double-precision floating-point format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float128_to_floatx80(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t aSig0, aSig1;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) {
            /* Converted NaNs are quietened before being returned. */
            floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    else {
        /* Make the integer bit explicit for normal numbers. */
        aSig0 |= UINT64_C(0x0001000000000000);
    }
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);

}

/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* Fraction bits to discard all lie in the low word (or nowhere). */
        if ( 0x406F <= aExp ) {
            /* Already integral (or NaN/inf). */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* lastBitMask selects the lowest integer-valued bit; it is zero
           when that bit is the lsb of the high word. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    /* Exact tie: clear the lsb of high for even. */
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_to_odd:
            /*
             * Note that if lastBitMask == 0, the last bit is the lsb
             * of high, and roundBitsMask == -1.
             */
            if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* |a| < 1: the result is 0 or +-1 depending on the mode. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            float_raise(float_flag_inexact, status);
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
            case float_round_nearest_even:
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
            case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
            case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );

            case float_round_to_odd:
                return packFloat128(aSign, 0x3FFF, 0, 0);

            case float_round_to_zero:
                break;
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Fraction bits to discard span the high word; a.low is entirely
           fractional and only contributes stickiness. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        case float_round_to_odd:
            if ((z.high
& lastBitMask) == 0) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Raise inexact iff any bits were actually discarded. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Subnormal `b': exponent is effectively 1. */
            --expDiff;
        }
        else {
            bSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= UINT64_C(0x0001000000000000);
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift required. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both subnormal: the sum is exact and stays subnormal or
               just reaches the normal range. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Sum of two normals with equal exponents always carries out. */
        zSig0 |= UINT64_C(0x0002000000000000);
        zExp = aExp;
        goto shiftRight1;
    }
    aSig0 |= UINT64_C(0x0001000000000000);
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both significands left by 14 to keep guard bits for the
     * normalization that follows the subtraction.
     */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf - Inf is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Compare magnitudes to decide the sign of the difference. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero result is -0 only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;   /* subnormal: encoded exponent 0 means true exponent 1 */
    }
    else {
        aSig0 |= UINT64_C(0x4000000000000000);   /* implicit 1 (pre-shifted) */
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= UINT64_C(0x4000000000000000);
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;   /* result takes b's effective sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= UINT64_C(0x4000000000000000);
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= UINT64_C(0x4000000000000000);
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Compensate for the 14-bit pre-shift applied on entry. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_add(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign == bSign ) {
        /* Same signs: magnitudes add. */
        return addFloat128Sigs(a, b, aSign, status);
    }
    else {
        /* Opposite signs: magnitudes subtract. */
        return subFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sub(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign;

    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign == bSign ) {
        /* Same signs: magnitudes subtract. */
        return subFloat128Sigs(a, b, aSign, status);
    }
    else {
        /* Opposite signs: magnitudes add. */
        return addFloat128Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the quadruple-precision floating-point
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;   /* Inf * 0 */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
            /* 0 * Inf is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= UINT64_C(0x0001000000000000);   /* implicit leading 1 for a */
    /* Pre-shift b so the 256-bit product lands with the binary point in
     * the position roundAndPackFloat128 expects; b's implicit 1 is folded
     * in below by adding a's significand once (a * (b<<16 + 1<<64...)).
     */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zSig2 |= ( zSig3 != 0 );   /* fold lowest word into the sticky bit */
    if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
        /* Product carried past bit 49: renormalize one position. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            goto invalid;   /* Inf / Inf */
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );   /* finite / Inf == 0 */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* 0 / 0 is invalid. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            /* Nonzero / 0: signal division by zero, return Inf. */
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Position both significands (with implicit 1) for the estimate. */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Ensure the quotient is < 1 before estimating. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High quotient word: estimate, then correct downward while the
     * remainder is negative.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low quotient word; refine only when it is close enough to a
     * rounding boundary for the error of the estimate to matter.
     */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    bool aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        if ( ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;   /* rem(Inf, x) is invalid */
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;   /* rem(finite, Inf) == a */
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* rem(x, 0) is invalid. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;   /* |a| < |b|/2: a is already the remainder */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce a modulo b, 61 quotient bits at a time.  The estimate is
     * biased low by 4 so the partial remainder never goes negative here.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    /* Final partial reduction for the remaining (< 64) quotient bits. */
    if ( -64 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the remainder goes negative, remembering the last
     * non-negative value; then pick whichever of the two is nearer to
     * zero (ties to even quotient), per round-to-nearest remainder.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1,
        (uint64_t *)&sigMean0, &sigMean1 );
    if ( ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    bool aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;   /* sqrt(+Inf) == +Inf */
        goto invalid;
    }
    if ( aSign ) {
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;   /* sqrt(-0) == -0 */
        /* sqrt of a negative number is invalid. */
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);   /* implicit leading 1 */
    /* First root approximation, then one Newton-style refinement. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Correct the high root word while the remainder a - z^2 is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low root word; refine only near a rounding boundary. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );   /* sticky bit */
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Compares the extended double-precision values `a' and `b'.  If `is_quiet'
| is false, any NaN operand raises the invalid exception; if true, only
| signaling NaNs (or invalid encodings) do.
*----------------------------------------------------------------------------*/

static inline FloatRelation
floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case: +0 and -0 compare equal */
            return float_relation_equal;
        } else {
            /* Different signs: positive operand is greater. */
            return 1 - (2 * aSign);
        }
    } else {
        /* Normalize pseudo-denormals before comparison.  */
        if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
            ++a.high;
        }
        if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
            ++b.high;
        }
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Magnitude order flips for negative operands. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
                                     float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

/*----------------------------------------------------------------------------
| Compares the quadruple-precision values `a' and `b'; `is_quiet' selects
| whether quiet NaNs raise the invalid exception.
*----------------------------------------------------------------------------*/

static inline FloatRelation
float128_compare_internal(float128 a, float128 b, bool is_quiet,
                          float_status *status)
{
    bool aSign, bSign;

    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
            return float_relation_equal;
        } else {
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

FloatRelation float128_compare(float128 a, float128 b, float_status *status)
{
    return float128_compare_internal(a, b, 0, status);
}

FloatRelation
float128_compare_quiet(float128 a, float128 b, 7799 float_status *status) 7800 { 7801 return float128_compare_internal(a, b, 1, status); 7802 } 7803 7804 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7805 { 7806 bool aSign; 7807 int32_t aExp; 7808 uint64_t aSig; 7809 7810 if (floatx80_invalid_encoding(a)) { 7811 float_raise(float_flag_invalid, status); 7812 return floatx80_default_nan(status); 7813 } 7814 aSig = extractFloatx80Frac( a ); 7815 aExp = extractFloatx80Exp( a ); 7816 aSign = extractFloatx80Sign( a ); 7817 7818 if ( aExp == 0x7FFF ) { 7819 if ( aSig<<1 ) { 7820 return propagateFloatx80NaN(a, a, status); 7821 } 7822 return a; 7823 } 7824 7825 if (aExp == 0) { 7826 if (aSig == 0) { 7827 return a; 7828 } 7829 aExp++; 7830 } 7831 7832 if (n > 0x10000) { 7833 n = 0x10000; 7834 } else if (n < -0x10000) { 7835 n = -0x10000; 7836 } 7837 7838 aExp += n; 7839 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7840 aSign, aExp, aSig, 0, status); 7841 } 7842 7843 float128 float128_scalbn(float128 a, int n, float_status *status) 7844 { 7845 bool aSign; 7846 int32_t aExp; 7847 uint64_t aSig0, aSig1; 7848 7849 aSig1 = extractFloat128Frac1( a ); 7850 aSig0 = extractFloat128Frac0( a ); 7851 aExp = extractFloat128Exp( a ); 7852 aSign = extractFloat128Sign( a ); 7853 if ( aExp == 0x7FFF ) { 7854 if ( aSig0 | aSig1 ) { 7855 return propagateFloat128NaN(a, a, status); 7856 } 7857 return a; 7858 } 7859 if (aExp != 0) { 7860 aSig0 |= UINT64_C(0x0001000000000000); 7861 } else if (aSig0 == 0 && aSig1 == 0) { 7862 return a; 7863 } else { 7864 aExp++; 7865 } 7866 7867 if (n > 0x10000) { 7868 n = 0x10000; 7869 } else if (n < -0x10000) { 7870 n = -0x10000; 7871 } 7872 7873 aExp += n - 1; 7874 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7875 , status); 7876 7877 } 7878 7879 static void __attribute__((constructor)) softfloat_init(void) 7880 { 7881 union_float64 ua, ub, uc, ur; 7882 7883 if (QEMU_NO_HARDFLOAT) { 7884 return; 
7885 } 7886 /* 7887 * Test that the host's FMA is not obviously broken. For example, 7888 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7889 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7890 */ 7891 ua.s = 0x0020000000000001ULL; 7892 ub.s = 0x3ca0000000000000ULL; 7893 uc.s = 0x0020000000000000ULL; 7894 ur.h = fma(ua.h, ub.h, uc.h); 7895 if (ur.s != 0x0020000000000001ULL) { 7896 force_soft_fma = true; 7897 } 7898 } 7899