1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include <math.h> 87 #include "qemu/bitops.h" 88 #include "fpu/softfloat.h" 89 90 /* We only need stdlib for abort() */ 91 92 /*---------------------------------------------------------------------------- 93 | Primitive arithmetic functions, including multi-word arithmetic, and 94 | division and square root approximations. (Can be specialized to target if 95 | desired.) 96 *----------------------------------------------------------------------------*/ 97 #include "fpu/softfloat-macros.h" 98 99 /* 100 * Hardfloat 101 * 102 * Fast emulation of guest FP instructions is challenging for two reasons. 103 * First, FP instruction semantics are similar but not identical, particularly 104 * when handling NaNs. Second, emulating at reasonable speed the guest FP 105 * exception flags is not trivial: reading the host's flags register with a 106 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp], 107 * and trapping on every FP exception is not fast nor pleasant to work with. 108 * 109 * We address these challenges by leveraging the host FPU for a subset of the 110 * operations. 
To do this we expand on the idea presented in this paper: 111 * 112 * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a 113 * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615. 114 * 115 * The idea is thus to leverage the host FPU to (1) compute FP operations 116 * and (2) identify whether FP exceptions occurred while avoiding 117 * expensive exception flag register accesses. 118 * 119 * An important optimization shown in the paper is that given that exception 120 * flags are rarely cleared by the guest, we can avoid recomputing some flags. 121 * This is particularly useful for the inexact flag, which is very frequently 122 * raised in floating-point workloads. 123 * 124 * We optimize the code further by deferring to soft-fp whenever FP exception 125 * detection might get hairy. Two examples: (1) when at least one operand is 126 * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result 127 * and the result is < the minimum normal. 
 */
/*
 * Flush a denormal input operand to a signed zero, raising the
 * input-denormal exception. This variant does NOT consult
 * flush_inputs_to_zero; callers must have checked it already.
 */
#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
    static inline void name(soft_t *a, float_status *s)                 \
    {                                                                   \
        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
                                     soft_t ## _is_neg(*a));            \
            float_raise(float_flag_input_denormal, s);                  \
        }                                                               \
    }

GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
#undef GEN_INPUT_FLUSH__NOCHECK

/* Flush one input operand, only if the status requests flush-to-zero. */
#define GEN_INPUT_FLUSH1(name, soft_t)                  \
    static inline void name(soft_t *a, float_status *s) \
    {                                                   \
        if (likely(!s->flush_inputs_to_zero)) {         \
            return;                                     \
        }                                               \
        soft_t ## _input_flush__nocheck(a, s);          \
    }

GEN_INPUT_FLUSH1(float32_input_flush1, float32)
GEN_INPUT_FLUSH1(float64_input_flush1, float64)
#undef GEN_INPUT_FLUSH1

/* As above, for two input operands. */
#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
    }

GEN_INPUT_FLUSH2(float32_input_flush2, float32)
GEN_INPUT_FLUSH2(float64_input_flush2, float64)
#undef GEN_INPUT_FLUSH2

/* As above, for three input operands. */
#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
    {                                                                   \
        if (likely(!s->flush_inputs_to_zero)) {                         \
            return;                                                     \
        }                                                               \
        soft_t ## _input_flush__nocheck(a, s);                          \
        soft_t ## _input_flush__nocheck(b, s);                          \
        soft_t ## _input_flush__nocheck(c, s);                          \
    }

GEN_INPUT_FLUSH3(float32_input_flush3, float32)
GEN_INPUT_FLUSH3(float64_input_flush3, float64)
#undef GEN_INPUT_FLUSH3

/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs and float size
 * gets its own value.
 */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F32_USE_FP 0
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F32_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F32_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif

/*
 * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
 * float{32,64}_is_infinity when !USE_FP.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
 */
#if defined(__x86_64__) || defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF 1
#else
# define QEMU_HARDFLOAT_USE_ISINF 0
#endif

/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set.
 */
#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif

/*
 * Hardfloat is usable only when the inexact flag is already set (so we
 * never need to detect a new inexact result) and the rounding mode is
 * nearest-even. Otherwise fall back to softfloat.
 */
static inline bool can_use_fpu(const float_status *s)
{
    if (QEMU_NO_HARDFLOAT) {
        return false;
    }
    return likely(s->float_exception_flags & float_flag_inexact &&
                  s->float_rounding_mode == float_round_nearest_even);
}

/*
 * Hardfloat generation functions. Each operation can have two flavors:
 * either using softfloat primitives (e.g.
float32_is_zero_or_normal) for
 * most condition checks, or native ones (e.g. fpclassify).
 *
 * The flavor is chosen by the callers. Instead of using macros, we rely on the
 * compiler to propagate constants and inline everything into the callers.
 *
 * We only generate functions for operations with two inputs, since only
 * these are common enough to justify consolidating them into common code.
 */

/* View the same 32 bits either as softfloat bits (s) or a host float (h). */
typedef union {
    float32 s;
    float h;
} union_float32;

typedef union {
    float64 s;
    double h;
} union_float64;

/* Pre/post condition checks used by the generic 2-input wrappers below. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat (slow-path) and host (fast-path) implementations of an op. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
typedef float (*hard_f32_op2_fn)(float a, float b);
typedef double (*hard_f64_op2_fn)(double a, double b);

/* 2-input is-zero-or-normal */
static inline bool f32_is_zon2(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        /*
         * Not using a temp variable for consecutive fpclassify calls ends up
         * generating faster code.
         */
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s);
}

static inline bool f64_is_zon2(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s);
}

/* 3-input is-zero-or-normal */
static inline
bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
{
    if (QEMU_HARDFLOAT_3F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float32_is_zero_or_normal(a.s) &&
           float32_is_zero_or_normal(b.s) &&
           float32_is_zero_or_normal(c.s);
}

static inline
bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
{
    if (QEMU_HARDFLOAT_3F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
    }
    return float64_is_zero_or_normal(a.s) &&
           float64_is_zero_or_normal(b.s) &&
           float64_is_zero_or_normal(c.s);
}

static inline bool f32_is_inf(union_float32 a)
{
    /* See QEMU_HARDFLOAT_USE_ISINF above for why both flavors exist. */
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float32_is_infinity(a.s);
}

static inline bool f64_is_inf(union_float64 a)
{
    if (QEMU_HARDFLOAT_USE_ISINF) {
        return isinf(a.h);
    }
    return float64_is_infinity(a.s);
}

/*
 * Generic 2-input wrapper: run the host op `hard' when the fast path is
 * safe, otherwise (or when the result needs exact flag accounting, e.g. a
 * possibly-subnormal result) fall back to the softfloat op `soft'.
 * `pre'/`post' are the caller-chosen condition checks described above.
 */
static inline float32
float32_gen2(float32 xa, float32 xb, float_status *s,
             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
             f32_check_fn pre, f32_check_fn post)
{
    union_float32 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f32_is_inf(ur))) {
        /* Hard op overflowed: record the flag the host could not. */
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
        /* Result may be tiny: redo in softfloat to get the flags right. */
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

static inline float64
float64_gen2(float64 xa, float64 xb, float_status *s,
             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
             f64_check_fn pre, f64_check_fn post)
{
    union_float64 ua, ub, ur;

    ua.s = xa;
    ub.s = xb;

    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (unlikely(!pre(ua, ub))) {
        goto soft;
    }

    ur.h = hard(ua.h, ub.h);
    if (unlikely(f64_is_inf(ur))) {
        float_raise(float_flag_overflow, s);
    } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
        goto soft;
    }
    return ur.s;

 soft:
    return soft(ua.s, ub.s, s);
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint32_t extractFloat32Frac(float32 a)
{
    return float32_val(a) & 0x007FFFFF;
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat32Exp(float32 a)
{
    return (float32_val(a) >> 23) & 0xFF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the single-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat32Sign(float32 a)
{
    return float32_val(a) >> 31;
}

/*----------------------------------------------------------------------------
| Returns the fraction bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat64Frac(float64 a)
{
    return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
}

/*----------------------------------------------------------------------------
| Returns the exponent bits of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline int extractFloat64Exp(float64 a)
{
    return (float64_val(a) >> 52) & 0x7FF;
}

/*----------------------------------------------------------------------------
| Returns the sign bit of the double-precision floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline bool extractFloat64Sign(float64 a)
{
    return float64_val(a) >> 63;
}

/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;

/* Bit-mask form of a FloatClass, for testing several classes at once. */
#define float_cmask(bit)  (1u << (bit))

enum {
    float_cmask_zero    = float_cmask(float_class_zero),
    float_cmask_normal  = float_cmask(float_class_normal),
    float_cmask_inf     = float_cmask(float_class_inf),
    float_cmask_qnan    = float_cmask(float_class_qnan),
    float_cmask_snan    = float_cmask(float_class_snan),

    float_cmask_infzero = float_cmask_zero | float_cmask_inf,
    float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
};


/* Simple helpers for checking if, or what kind of, NaN we have */
static inline __attribute__((unused)) bool is_nan(FloatClass c)
{
    /* Relies on qnan/snan being the two largest enumerators. */
    return unlikely(c >= float_class_qnan);
}

static inline __attribute__((unused)) bool is_snan(FloatClass c)
{
    return c == float_class_snan;
}

static inline __attribute__((unused)) bool is_qnan(FloatClass c)
{
    return c == float_class_qnan;
}

/*
 * Structure holding all of the decomposed parts of a float.
 * The exponent is unbiased and the fraction is normalized.
 *
 * The fraction words are stored in big-endian word ordering,
 * so that truncation from a larger format to a smaller format
 * can be done simply by ignoring subsequent elements.
 */

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    union {
        /* Routines that know the structure may reference the singular name. */
        uint64_t frac;
        /*
         * Routines expanded with multiple structures reference "hi" and "lo"
         * depending on the operation.  In FloatParts64, "hi" and "lo" are
         * both the same word and aliased here.
         */
        uint64_t frac_hi;
        uint64_t frac_lo;
    };
} FloatParts64;

typedef struct {
    FloatClass cls;
    bool sign;
    int32_t exp;
    uint64_t frac_hi;
    uint64_t frac_lo;
} FloatParts128;

/* These apply to the most significant word of each FloatPartsN. */
#define DECOMPOSED_BINARY_POINT  63
#define DECOMPOSED_IMPLICIT_BIT  (1ull << DECOMPOSED_BINARY_POINT)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based the size of fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/* Expand fields based on the size of exponent and fraction */
#define FLOAT_PARAMS(E, F)                              \
    .exp_size       = E,                                \
    .exp_bias       = ((1 << E) - 1) >> 1,              \
    .exp_max        = (1 << E) - 1,                     \
    .frac_size      = F,                                \
    .frac_shift     = (-F - 1) & 63,                    \
    .frac_lsb       = 1ull << ((-F - 1) & 63),          \
    .frac_lsbm1     = 1ull << ((-F - 2) & 63),          \
    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,    \
    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1

static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* ARM Alternative Half Precision: same layout, no inf/NaN encodings. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

static const FloatFmt bfloat16_params = {
    FLOAT_PARAMS(8, 7)
};

static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};

static const FloatFmt float128_params = {
    FLOAT_PARAMS(15, 112)
};

/* Unpack a float to parts, but do not canonicalize.  */
static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;

    /* Layout, MSB down: sign | exponent | fraction. */
    *r = (FloatParts64) {
        .cls = float_class_unclassified,
        .sign = extract64(raw, f_size + e_size, 1),
        .exp = extract64(raw, f_size, e_size),
        .frac = extract64(raw, 0, f_size)
    };
}

static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
{
    unpack_raw64(p, &float16_params, f);
}

static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
{
    unpack_raw64(p, &bfloat16_params, f);
}

static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
{
    unpack_raw64(p, &float32_params, f);
}

static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
{
    unpack_raw64(p, &float64_params, f);
}

static void float128_unpack_raw(FloatParts128 *p, float128 f)
{
    /* The high 64-bit word holds sign, exponent and the top fraction bits. */
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;

    *p = (FloatParts128) {
        .cls = float_class_unclassified,
        .sign = extract64(f.high, f_size + e_size, 1),
        .exp = extract64(f.high, f_size, e_size),
        .frac_hi = extract64(f.high, 0, f_size),
        .frac_lo = f.low,
    };
}

/* Pack a float from parts, but do not canonicalize.  */
static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
{
    const int f_size = fmt->frac_size;
    const int e_size = fmt->exp_size;
    uint64_t ret;

    ret = (uint64_t)p->sign << (f_size + e_size);
    ret = deposit64(ret, f_size, e_size, p->exp);
    ret = deposit64(ret, 0, f_size, p->frac);
    return ret;
}

static inline float16 float16_pack_raw(const FloatParts64 *p)
{
    return make_float16(pack_raw64(p, &float16_params));
}

static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
{
    return pack_raw64(p, &bfloat16_params);
}

static inline float32 float32_pack_raw(const FloatParts64 *p)
{
    return make_float32(pack_raw64(p, &float32_params));
}

static inline float64 float64_pack_raw(const FloatParts64 *p)
{
    return make_float64(pack_raw64(p, &float64_params));
}

static float128 float128_pack_raw(const FloatParts128 *p)
{
    const int f_size = float128_params.frac_size - 64;
    const int e_size = float128_params.exp_size;
    uint64_t hi;

    hi = (uint64_t)p->sign << (f_size + e_size);
    hi = deposit64(hi, f_size, e_size, p->exp);
    hi = deposit64(hi, 0, f_size, p->frac_hi);
    return make_float128(hi, p->frac_lo);
}

/*----------------------------------------------------------------------------
| Functions and definitions to determine:  (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output.  These details are target-
| specific.
*----------------------------------------------------------------------------*/
#include "softfloat-specialize.c.inc"

/*
 * Dispatch on the pointer type P: FloatParts128* selects the parts128_
 * variant, anything else the parts64_ variant.
 */
#define PARTS_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)

#define parts_default_nan(P, S)  PARTS_GENERIC_64_128(default_nan, P)(P, S)
#define parts_silence_nan(P, S)  PARTS_GENERIC_64_128(silence_nan, P)(P, S)

static void parts64_return_nan(FloatParts64 *a, float_status *s);
static void parts128_return_nan(FloatParts128 *a, float_status *s);

#define parts_return_nan(P, S)   PARTS_GENERIC_64_128(return_nan, P)(P, S)

static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
                                      float_status *s);
static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
                                        float_status *s);

#define parts_pick_nan(A, B, S)  PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)

static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
                                             FloatParts64 *c, float_status *s,
                                             int ab_mask, int abc_mask);
static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
                                               FloatParts128 *b,
                                               FloatParts128 *c,
                                               float_status *s,
                                               int ab_mask, int abc_mask);

#define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
    PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)

static void parts64_canonicalize(FloatParts64 *p, float_status *status,
                                 const FloatFmt *fmt);
static void parts128_canonicalize(FloatParts128 *p, float_status *status,
                                  const FloatFmt *fmt);

#define parts_canonicalize(A, S, F) \
    PARTS_GENERIC_64_128(canonicalize, A)(A, S, F)

static void parts64_uncanon(FloatParts64 *p, float_status *status,
                            const FloatFmt *fmt);
static void parts128_uncanon(FloatParts128 *p, float_status *status,
                             const FloatFmt *fmt);

#define parts_uncanon(A, S, F) \
    PARTS_GENERIC_64_128(uncanon, A)(A, S, F)

/*
 * Helper
functions for softfloat-parts.c.inc, per-size operations.
 */

/* Dispatch on pointer type, as PARTS_GENERIC_64_128 above. */
#define FRAC_GENERIC_64_128(NAME, P) \
    QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)

/* r->frac = a->frac + c; return true on carry-out. */
static bool frac64_addi(FloatParts64 *r, FloatParts64 *a, uint64_t c)
{
    return uadd64_overflow(a->frac, c, &r->frac);
}

static bool frac128_addi(FloatParts128 *r, FloatParts128 *a, uint64_t c)
{
    /* Add into the low word; propagate the carry into the high word. */
    c = uadd64_overflow(a->frac_lo, c, &r->frac_lo);
    return uadd64_overflow(a->frac_hi, c, &r->frac_hi);
}

#define frac_addi(R, A, C)  FRAC_GENERIC_64_128(addi, R)(R, A, C)

/* Set every fraction bit. */
static void frac64_allones(FloatParts64 *a)
{
    a->frac = -1;
}

static void frac128_allones(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = -1;
}

#define frac_allones(A)  FRAC_GENERIC_64_128(allones, A)(A)

/* Three-way compare of fractions: -1, 0, or 1. */
static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
{
    return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
}

static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
{
    uint64_t ta = a->frac_hi, tb = b->frac_hi;
    if (ta == tb) {
        ta = a->frac_lo, tb = b->frac_lo;
        if (ta == tb) {
            return 0;
        }
    }
    return ta < tb ? -1 : 1;
}

#define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)

static void frac64_clear(FloatParts64 *a)
{
    a->frac = 0;
}

static void frac128_clear(FloatParts128 *a)
{
    a->frac_hi = a->frac_lo = 0;
}

#define frac_clear(A)  FRAC_GENERIC_64_128(clear, A)(A)

/* True if the fraction is zero. */
static bool frac64_eqz(FloatParts64 *a)
{
    return a->frac == 0;
}

static bool frac128_eqz(FloatParts128 *a)
{
    return (a->frac_hi | a->frac_lo) == 0;
}

#define frac_eqz(A)  FRAC_GENERIC_64_128(eqz, A)(A)

/*
 * Shift the fraction left until its most significant bit is set;
 * return the shift count (the full width if the fraction was zero).
 */
static int frac64_normalize(FloatParts64 *a)
{
    if (a->frac) {
        int shift = clz64(a->frac);
        a->frac <<= shift;
        return shift;
    }
    return 64;
}

static int frac128_normalize(FloatParts128 *a)
{
    if (a->frac_hi) {
        int shl = clz64(a->frac_hi);
        if (shl) {
            int shr = 64 - shl;
            a->frac_hi = (a->frac_hi << shl) | (a->frac_lo >> shr);
            a->frac_lo = (a->frac_lo << shl);
        }
        return shl;
    } else if (a->frac_lo) {
        /* High word empty: the low word becomes the new high word. */
        int shl = clz64(a->frac_lo);
        a->frac_hi = (a->frac_lo << shl);
        a->frac_lo = 0;
        return shl + 64;
    }
    return 128;
}

#define frac_normalize(A)  FRAC_GENERIC_64_128(normalize, A)(A)

static void frac64_shl(FloatParts64 *a, int c)
{
    a->frac <<= c;
}

static void frac128_shl(FloatParts128 *a, int c)
{
    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shl(A, C)  FRAC_GENERIC_64_128(shl, A)(A, C)

static void frac64_shr(FloatParts64 *a, int c)
{
    a->frac >>= c;
}

static void frac128_shr(FloatParts128 *a, int c)
{
    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shr(A, C)  FRAC_GENERIC_64_128(shr, A)(A, C)

/* Right shift with jamming: shifted-out bits are OR-ed into the LSB. */
static void frac64_shrjam(FloatParts64 *a, int c)
{
    shift64RightJamming(a->frac, c, &a->frac);
}

static void frac128_shrjam(FloatParts128 *a, int c)
{
    shift128RightJamming(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
}

#define frac_shrjam(A, C)  FRAC_GENERIC_64_128(shrjam, A)(A, C)

/* Instantiate the size-generic parts code for 64-bit fractions... */
#define partsN(NAME)  parts64_##NAME
#define FloatPartsN   FloatParts64

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN
/* ...and again for 128-bit fractions. */
#define partsN(NAME)  parts128_##NAME
#define FloatPartsN   FloatParts128

#include "softfloat-parts.c.inc"

#undef  partsN
#undef  FloatPartsN

/*
 * Pack/unpack routines with a specific FloatFmt.
 */

static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
                                      float_status *s, const FloatFmt *params)
{
    float16_unpack_raw(p, f);
    parts_canonicalize(p, s, params);
}

static void float16_unpack_canonical(FloatParts64 *p, float16 f,
                                     float_status *s)
{
    float16a_unpack_canonical(p, f, s, &float16_params);
}

static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
                                      float_status *s)
{
    bfloat16_unpack_raw(p, f);
    parts_canonicalize(p, s, &bfloat16_params);
}

static float16 float16a_round_pack_canonical(FloatParts64 *p,
                                             float_status *s,
                                             const FloatFmt *params)
{
    parts_uncanon(p, s, params);
    return float16_pack_raw(p);
}

static float16 float16_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    return float16a_round_pack_canonical(p, s, &float16_params);
}

static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
                                              float_status *s)
{
    parts_uncanon(p, s, &bfloat16_params);
    return bfloat16_pack_raw(p);
}

static void float32_unpack_canonical(FloatParts64 *p, float32 f,
                                     float_status *s)
{
    float32_unpack_raw(p, f);
    parts_canonicalize(p, s, &float32_params);
}

static float32 float32_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float32_params);
    return float32_pack_raw(p);
}

static void float64_unpack_canonical(FloatParts64 *p, float64 f,
                                     float_status *s)
{
    float64_unpack_raw(p, f);
    parts_canonicalize(p, s, &float64_params);
}

static float64 float64_round_pack_canonical(FloatParts64 *p,
                                            float_status *s)
{
    parts_uncanon(p, s, &float64_params);
    return float64_pack_raw(p);
}

/*
 * Returns the result of adding or subtracting the values of the
 * floating-point values `a' and `b'. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
 */

static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
                                  float_status *s)
{
    bool a_sign = a.sign;
    /* Folding the subtract flag into b's sign turns a-b into a+(-b). */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Arrange for |a| >= |b|, flipping the result sign if swapped. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                a.cls = float_class_zero;
                /* Exact zero result is -0 only when rounding toward -inf. */
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                int shift = clz64(a.frac);
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* inf - inf is invalid. */
                float_raise(float_flag_invalid, s);
                parts_default_nan(&a, s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the smaller exponent to the larger, jamming lost bits. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }

            if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
                /* Carry out: renormalize by one bit. */
                shift64RightJamming(a.frac, 1, &a.frac);
                a.frac |= DECOMPOSED_IMPLICIT_BIT;
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return *parts_pick_nan(&a, &b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* All class combinations are handled above. */
    g_assert_not_reached();
}

/*
 * Returns the result of adding or subtracting the floating-point
 * values `a' and `b'. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return float16_round_pack_canonical(&pr, status);
}

float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Soft-float fallback for float32 add/sub, used when the hardfloat
   fast path declines the operands. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float32_round_pack_canonical(&pr, status);
}

static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, false, status);
}

static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
{
    return soft_f32_addsub(a, b, true, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, subtract, status);

    return float64_round_pack_canonical(&pr, status);
}

static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, false, status);
}

static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
{
    return soft_f64_addsub(a, b, true, status);
}

/* Hardfloat fast paths: perform the operation with the host FPU. */
static float hard_f32_add(float a, float b)
{
    return a + b;
}

static float hard_f32_sub(float a, float b)
{
    return a - b;
}

static double hard_f64_add(double a, double b)
{
    return a + b;
}

static double hard_f64_sub(double a, double b)
{
    return a - b;
}

/* Post-check for hardfloat add/sub/mul: reject the fast-path result
   when both inputs were zero (the zero-sign rules depend on the guest
   rounding mode, which the host FPU does not track). */
static bool f32_addsubmul_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    }
    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
}

static bool f64_addsubmul_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
    } else {
        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
    }
}

static float32 float32_addsub(float32 a, float32 b, float_status *s,
                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
{
    return float32_gen2(a, b, s, hard, soft,
                        f32_is_zon2, f32_addsubmul_post);
}

static float64 float64_addsub(float64 a, float64 b, float_status *s,
                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
{
    return float64_gen2(a, b, s, hard, soft,
                        f64_is_zon2, f64_addsubmul_post);
}

float32 QEMU_FLATTEN
float32_add(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
}

float32 QEMU_FLATTEN
float32_sub(float32 a, float32 b, float_status *s)
{
    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
}

float64 QEMU_FLATTEN
float64_add(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
}

float64 QEMU_FLATTEN
float64_sub(float64 a, float64 b, float_status *s)
{
    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
}

/*
 * Returns the result of adding or subtracting the bfloat16
 * values `a' and `b'.
 */
bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, false, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = addsub_floats(pa, pb, true, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b'. The operation is performed according to the IEC/IEEE Standard
 * for Binary Floating-Point Arithmetic.
 */

static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t hi, lo;
        int exp = a.exp + b.exp;

        /* 64x64 -> 128-bit product of the fractions; the high half
           carries the result, the low half only contributes stickiness. */
        mul64To128(a.frac, b.frac, &hi, &lo);
        if (hi & DECOMPOSED_IMPLICIT_BIT) {
            exp += 1;
        } else {
            hi <<= 1;
        }
        /* Fold the discarded low bits into the lsb (sticky bit). */
        hi |= (lo != 0);

        /* Re-use a */
        a.exp = exp;
        a.sign = sign;
        a.frac = hi;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* Inf * Zero == NaN */
    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
        (a.cls == float_class_zero && b.cls == float_class_inf)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Multiply by 0 or Inf */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    if (b.cls == float_class_inf || b.cls == float_class_zero) {
        b.sign = sign;
        return b;
    }
    g_assert_not_reached();
}

float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Soft-float fallback for float32 multiply (hardfloat slow path). */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_mul(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_mul(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast paths using the host FPU. */
static float hard_f32_mul(float a, float b)
{
    return a * b;
}

static double hard_f64_mul(double a, double b)
{
    return a * b;
}

float32 QEMU_FLATTEN
float32_mul(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
                        f32_is_zon2, f32_addsubmul_post);
}

float64 QEMU_FLATTEN
float64_mul(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
                        f64_is_zon2, f64_addsubmul_post);
}

/*
 * Returns the result of multiplying the bfloat16
 * values `a' and `b'.
 */

bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = mul_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 */

static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
                                  int flags, float_status *s)
{
    bool inf_zero, p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;
    int ab_mask, abc_mask;

    /* Class masks let the special-case tests below check several
       operand classes with one bitwise test. */
    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
    abc_mask = float_cmask(c.cls) | ab_mask;
    inf_zero = ab_mask == float_cmask_infzero;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (unlikely(abc_mask & float_cmask_anynan)) {
        return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
    }

    if (inf_zero) {
        /* Inf * 0 (with no NaN operand) is invalid. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a*b. */
    if (ab_mask & float_cmask_inf) {
        p_class = float_class_inf;
    } else if (ab_mask & float_cmask_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf - Inf is invalid. */
            float_raise(float_flag_invalid, s);
            parts_default_nan(&c, s);
        } else {
            c.sign ^= sign_flip;
        }
        return c;
    }

    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            if (p_sign != c.sign) {
                /* (+0) + (-0): sign follows the rounding mode. */
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    mul64To128(a.frac, b.frac, &hi, &lo);

    /* Renormalize to the msb. */
    if (hi & DECOMPOSED_IMPLICIT_BIT) {
        p_exp += 1;
    } else {
        shortShift128Left(hi, lo, 1, &hi, &lo);
    }

    /* + add/sub */
    if (c.cls != float_class_zero) {
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Addend dominates: align the product to c's exponent. */
                shift64RightJamming(hi, -exp_diff, &hi);
                p_exp = c.exp;
                if (uadd64_overflow(hi, c.frac, &hi)) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            } else {
                /* Product dominates: align c down into 128 bits and add. */
                uint64_t c_hi, c_lo, over;
                shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
                add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
                if (over) {
                    shift64RightJamming(hi, 1, &hi);
                    hi |= DECOMPOSED_IMPLICIT_BIT;
                    p_exp += 1;
                }
            }
        } else {
            /* Subtraction */
            uint64_t c_hi = c.frac, c_lo = 0;

            if (exp_diff <= 0) {
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                /* Subtract the smaller magnitude from the larger,
                   flipping the sign when c dominates. */
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero sign per rounding mode,
                   then apply any requested result negation. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 63 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift128Left(hi, lo, shift, &hi, &lo);
                p_exp -= shift;
            }
        }
    }
    /* Fold the low half into the sticky bit. */
    hi |= (lo != 0);

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = hi;

    return a;
}

float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
                                    int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    float16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Soft-float fallback for float32 fused multiply-add. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    float32_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
                float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    float64_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return float64_round_pack_canonical(&pr, status);
}

/* When set, always take the soft-float muladd path (testing knob). */
static bool force_soft_fma;
1599 1600 float32 QEMU_FLATTEN 1601 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) 1602 { 1603 union_float32 ua, ub, uc, ur; 1604 1605 ua.s = xa; 1606 ub.s = xb; 1607 uc.s = xc; 1608 1609 if (unlikely(!can_use_fpu(s))) { 1610 goto soft; 1611 } 1612 if (unlikely(flags & float_muladd_halve_result)) { 1613 goto soft; 1614 } 1615 1616 float32_input_flush3(&ua.s, &ub.s, &uc.s, s); 1617 if (unlikely(!f32_is_zon3(ua, ub, uc))) { 1618 goto soft; 1619 } 1620 1621 if (unlikely(force_soft_fma)) { 1622 goto soft; 1623 } 1624 1625 /* 1626 * When (a || b) == 0, there's no need to check for under/over flow, 1627 * since we know the addend is (normal || 0) and the product is 0. 1628 */ 1629 if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { 1630 union_float32 up; 1631 bool prod_sign; 1632 1633 prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); 1634 prod_sign ^= !!(flags & float_muladd_negate_product); 1635 up.s = float32_set_sign(float32_zero, prod_sign); 1636 1637 if (flags & float_muladd_negate_c) { 1638 uc.h = -uc.h; 1639 } 1640 ur.h = up.h + uc.h; 1641 } else { 1642 union_float32 ua_orig = ua; 1643 union_float32 uc_orig = uc; 1644 1645 if (flags & float_muladd_negate_product) { 1646 ua.h = -ua.h; 1647 } 1648 if (flags & float_muladd_negate_c) { 1649 uc.h = -uc.h; 1650 } 1651 1652 ur.h = fmaf(ua.h, ub.h, uc.h); 1653 1654 if (unlikely(f32_is_inf(ur))) { 1655 float_raise(float_flag_overflow, s); 1656 } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { 1657 ua = ua_orig; 1658 uc = uc_orig; 1659 goto soft; 1660 } 1661 } 1662 if (flags & float_muladd_negate_result) { 1663 return float32_chs(ur.s); 1664 } 1665 return ur.s; 1666 1667 soft: 1668 return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); 1669 } 1670 1671 float64 QEMU_FLATTEN 1672 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) 1673 { 1674 union_float64 ua, ub, uc, ur; 1675 1676 ua.s = xa; 1677 ub.s = xb; 1678 uc.s = xc; 1679 1680 if (unlikely(!can_use_fpu(s))) 
{ 1681 goto soft; 1682 } 1683 if (unlikely(flags & float_muladd_halve_result)) { 1684 goto soft; 1685 } 1686 1687 float64_input_flush3(&ua.s, &ub.s, &uc.s, s); 1688 if (unlikely(!f64_is_zon3(ua, ub, uc))) { 1689 goto soft; 1690 } 1691 1692 if (unlikely(force_soft_fma)) { 1693 goto soft; 1694 } 1695 1696 /* 1697 * When (a || b) == 0, there's no need to check for under/over flow, 1698 * since we know the addend is (normal || 0) and the product is 0. 1699 */ 1700 if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { 1701 union_float64 up; 1702 bool prod_sign; 1703 1704 prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); 1705 prod_sign ^= !!(flags & float_muladd_negate_product); 1706 up.s = float64_set_sign(float64_zero, prod_sign); 1707 1708 if (flags & float_muladd_negate_c) { 1709 uc.h = -uc.h; 1710 } 1711 ur.h = up.h + uc.h; 1712 } else { 1713 union_float64 ua_orig = ua; 1714 union_float64 uc_orig = uc; 1715 1716 if (flags & float_muladd_negate_product) { 1717 ua.h = -ua.h; 1718 } 1719 if (flags & float_muladd_negate_c) { 1720 uc.h = -uc.h; 1721 } 1722 1723 ur.h = fma(ua.h, ub.h, uc.h); 1724 1725 if (unlikely(f64_is_inf(ur))) { 1726 float_raise(float_flag_overflow, s); 1727 } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { 1728 ua = ua_orig; 1729 uc = uc_orig; 1730 goto soft; 1731 } 1732 } 1733 if (flags & float_muladd_negate_result) { 1734 return float64_chs(ur.s); 1735 } 1736 return ur.s; 1737 1738 soft: 1739 return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); 1740 } 1741 1742 /* 1743 * Returns the result of multiplying the bfloat16 values `a' 1744 * and `b' then adding 'c', with no intermediate rounding step after the 1745 * multiplication. 
 */

bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
                                      int flags, float_status *status)
{
    FloatParts64 pa, pb, pc, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    bfloat16_unpack_canonical(&pc, c, status);
    pr = muladd_floats(pa, pb, pc, flags, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Returns the result of dividing the floating-point value `a' by the
 * corresponding value `b'. The operation is performed according to
 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
{
    bool sign = a.sign ^ b.sign;

    if (a.cls == float_class_normal && b.cls == float_class_normal) {
        uint64_t n0, n1, q, r;
        int exp = a.exp - b.exp;

        /*
         * We want a 2*N / N-bit division to produce exactly an N-bit
         * result, so that we do not lose any precision and so that we
         * do not have to renormalize afterward.  If A.frac < B.frac,
         * then division would produce an (N-1)-bit result; shift A left
         * by one to produce the an N-bit result, and decrement the
         * exponent to match.
         *
         * The udiv_qrnnd algorithm that we're using requires normalization,
         * i.e. the msb of the denominator must be set, which is already true.
         */
        if (a.frac < b.frac) {
            exp -= 1;
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
        } else {
            shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
        }
        q = udiv_qrnnd(&r, n1, n0, b.frac);

        /* Set lsb if there is a remainder, to set inexact. */
        a.frac = q | (r != 0);
        a.sign = sign;
        a.exp = exp;
        return a;
    }
    /* handle all the NaN cases */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        return *parts_pick_nan(&a, &b, s);
    }
    /* 0/0 or Inf/Inf */
    if (a.cls == b.cls
        &&
        (a.cls == float_class_inf || a.cls == float_class_zero)) {
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    /* Inf / x or 0 / x */
    if (a.cls == float_class_inf || a.cls == float_class_zero) {
        a.sign = sign;
        return a;
    }
    /* Div 0 => Inf */
    if (b.cls == float_class_zero) {
        float_raise(float_flag_divbyzero, s);
        a.cls = float_class_inf;
        a.sign = sign;
        return a;
    }
    /* Div by Inf */
    if (b.cls == float_class_inf) {
        a.cls = float_class_zero;
        a.sign = sign;
        return a;
    }
    g_assert_not_reached();
}

float16 float16_div(float16 a, float16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float16_unpack_canonical(&pa, a, status);
    float16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float16_round_pack_canonical(&pr, status);
}

/* Soft-float fallback for float32 division. */
static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_div(float32 a, float32 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float32_unpack_canonical(&pa, a, status);
    float32_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_div(float64 a, float64 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    float64_unpack_canonical(&pa, a, status);
    float64_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast paths using the host FPU. */
static float hard_f32_div(float a, float b)
{
    return a / b;
}

static double hard_f64_div(double a, double b)
{
    return a / b;
}

/* Pre-check for hardfloat division: dividend must be zero-or-normal
   and the divisor strictly normal, so the host result needs no
   special-case or flag fixups beyond the post-check below. */
static bool f32_div_pre(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
}

static bool f64_div_pre(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
               fpclassify(b.h) == FP_NORMAL;
    }
    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
}

/* Post-check: reject a zero quotient (its sign handling goes through
   the soft path). */
static bool f32_div_post(union_float32 a, union_float32 b)
{
    if (QEMU_HARDFLOAT_2F32_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float32_is_zero(a.s);
}

static bool f64_div_post(union_float64 a, union_float64 b)
{
    if (QEMU_HARDFLOAT_2F64_USE_FP) {
        return fpclassify(a.h) != FP_ZERO;
    }
    return !float64_is_zero(a.s);
}

float32 QEMU_FLATTEN
float32_div(float32 a, float32 b, float_status *s)
{
    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
                        f32_div_pre, f32_div_post);
}

float64 QEMU_FLATTEN
float64_div(float64 a, float64 b, float_status *s)
{
    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
                        f64_div_pre, f64_div_post);
}

/*
 * Returns the result of dividing the bfloat16
 * value `a' by the corresponding value `b'.
 */

bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
{
    FloatParts64 pa, pb, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    bfloat16_unpack_canonical(&pb, b, status);
    pr = div_floats(pa, pb, status);

    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Float to Float conversions
 *
 * Returns the result of converting one float format to another. The
 * conversion is performed according to the IEC/IEEE Standard for
 * Binary Floating-Point Arithmetic.
 *
 * The float_to_float helper only needs to take care of raising
 * invalid exceptions and handling the conversion on NaNs.
 */

static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
                                   float_status *s)
{
    if (dstf->arm_althp) {
        /* ARM alternative half-precision has no NaN or Inf encodings;
           both map to substitute values with Invalid raised. */
        switch (a.cls) {
        case float_class_qnan:
        case float_class_snan:
            /* There is no NaN in the destination format.  Raise Invalid
             * and return a zero with the sign of the input NaN.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_zero;
            a.frac = 0;
            a.exp = 0;
            break;

        case float_class_inf:
            /* There is no Inf in the destination format.  Raise Invalid
             * and return the maximum normal with the correct sign.
             */
            float_raise(float_flag_invalid, s);
            a.cls = float_class_normal;
            a.exp = dstf->exp_max;
            a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
            break;

        default:
            break;
        }
    } else if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
    }
    return a;
}

float32 float16_to_float32(float16 a, bool ieee, float_status *s)
{
    /* @ieee selects IEEE half-precision vs the ARM alternative format. */
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 float16_to_float64(float16 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float16a_unpack_canonical(&pa, a, s, fmt16);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float16 float32_to_float16(float32 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

/* Soft-float slow path for float32 -> float64 widening. */
static float64 QEMU_SOFTFLOAT_ATTR
soft_float32_to_float64(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

float64 float32_to_float64(float32 a, float_status *s)
{
    if (likely(float32_is_normal(a))) {
        /* Widening conversion can never produce inexact results.  */
        union_float32 uf;
        union_float64 ud;
        uf.s = a;
        ud.h = uf.h;
        return ud.s;
    } else if (float32_is_zero(a)) {
        return float64_set_sign(float64_zero, float32_is_neg(a));
    } else {
        /* NaN, Inf, or subnormal: take the soft path. */
        return soft_float32_to_float64(a, s);
    }
}

float16 float64_to_float16(float64 a, bool ieee, float_status *s)
{
    const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, fmt16, s);
    return float16a_round_pack_canonical(&pr, s, fmt16);
}

float32 float64_to_float32(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float32 bfloat16_to_float32(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float32_params, s);
    return float32_round_pack_canonical(&pr, s);
}

float64 bfloat16_to_float64(bfloat16 a, float_status *s)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &float64_params, s);
    return float64_round_pack_canonical(&pr, s);
}

bfloat16 float32_to_bfloat16(float32 a, float_status *s)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

bfloat16 float64_to_bfloat16(float64 a, float_status *s)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, s);
    pr = float_to_float(pa, &bfloat16_params, s);
    return bfloat16_round_pack_canonical(&pr, s);
}

/*
 * Rounds the floating-point value `a' to an integer, and returns the
 * result as a floating-point value. The operation is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic.
2104 */ 2105 2106 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode, 2107 int scale, float_status *s) 2108 { 2109 switch (a.cls) { 2110 case float_class_qnan: 2111 case float_class_snan: 2112 parts_return_nan(&a, s); 2113 break; 2114 2115 case float_class_zero: 2116 case float_class_inf: 2117 /* already "integral" */ 2118 break; 2119 2120 case float_class_normal: 2121 scale = MIN(MAX(scale, -0x10000), 0x10000); 2122 a.exp += scale; 2123 2124 if (a.exp >= DECOMPOSED_BINARY_POINT) { 2125 /* already integral */ 2126 break; 2127 } 2128 if (a.exp < 0) { 2129 bool one; 2130 /* all fractional */ 2131 float_raise(float_flag_inexact, s); 2132 switch (rmode) { 2133 case float_round_nearest_even: 2134 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 2135 break; 2136 case float_round_ties_away: 2137 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 2138 break; 2139 case float_round_to_zero: 2140 one = false; 2141 break; 2142 case float_round_up: 2143 one = !a.sign; 2144 break; 2145 case float_round_down: 2146 one = a.sign; 2147 break; 2148 case float_round_to_odd: 2149 one = true; 2150 break; 2151 default: 2152 g_assert_not_reached(); 2153 } 2154 2155 if (one) { 2156 a.frac = DECOMPOSED_IMPLICIT_BIT; 2157 a.exp = 0; 2158 } else { 2159 a.cls = float_class_zero; 2160 } 2161 } else { 2162 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 2163 uint64_t frac_lsbm1 = frac_lsb >> 1; 2164 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 2165 uint64_t rnd_mask = rnd_even_mask >> 1; 2166 uint64_t inc; 2167 2168 switch (rmode) { 2169 case float_round_nearest_even: 2170 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 2171 break; 2172 case float_round_ties_away: 2173 inc = frac_lsbm1; 2174 break; 2175 case float_round_to_zero: 2176 inc = 0; 2177 break; 2178 case float_round_up: 2179 inc = a.sign ? 0 : rnd_mask; 2180 break; 2181 case float_round_down: 2182 inc = a.sign ? 
rnd_mask : 0; 2183 break; 2184 case float_round_to_odd: 2185 inc = a.frac & frac_lsb ? 0 : rnd_mask; 2186 break; 2187 default: 2188 g_assert_not_reached(); 2189 } 2190 2191 if (a.frac & rnd_mask) { 2192 float_raise(float_flag_inexact, s); 2193 if (uadd64_overflow(a.frac, inc, &a.frac)) { 2194 a.frac >>= 1; 2195 a.frac |= DECOMPOSED_IMPLICIT_BIT; 2196 a.exp++; 2197 } 2198 a.frac &= ~rnd_mask; 2199 } 2200 } 2201 break; 2202 default: 2203 g_assert_not_reached(); 2204 } 2205 return a; 2206 } 2207 2208 float16 float16_round_to_int(float16 a, float_status *s) 2209 { 2210 FloatParts64 pa, pr; 2211 2212 float16_unpack_canonical(&pa, a, s); 2213 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2214 return float16_round_pack_canonical(&pr, s); 2215 } 2216 2217 float32 float32_round_to_int(float32 a, float_status *s) 2218 { 2219 FloatParts64 pa, pr; 2220 2221 float32_unpack_canonical(&pa, a, s); 2222 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2223 return float32_round_pack_canonical(&pr, s); 2224 } 2225 2226 float64 float64_round_to_int(float64 a, float_status *s) 2227 { 2228 FloatParts64 pa, pr; 2229 2230 float64_unpack_canonical(&pa, a, s); 2231 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2232 return float64_round_pack_canonical(&pr, s); 2233 } 2234 2235 /* 2236 * Rounds the bfloat16 value `a' to an integer, and returns the 2237 * result as a bfloat16 value. 2238 */ 2239 2240 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s) 2241 { 2242 FloatParts64 pa, pr; 2243 2244 bfloat16_unpack_canonical(&pa, a, s); 2245 pr = round_to_int(pa, s->float_rounding_mode, 0, s); 2246 return bfloat16_round_pack_canonical(&pr, s); 2247 } 2248 2249 /* 2250 * Returns the result of converting the floating-point value `a' to 2251 * the two's complement integer format. 
The conversion is performed
 * according to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode.  If `a' is a NaN,
 * the largest positive integer is returned.  Otherwise, if the
 * conversion overflows, the largest integer with the same sign as `a'
 * is returned.
 */

/*
 * Common helper: round `in' to an integer (scaled by 2**scale), then
 * saturate into [min, max].  On NaN, infinity or overflow the invalid
 * flag is raised and any inexact flag raised during rounding is
 * discarded (hence the save/restore of orig_flags).
 */
static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                     int scale, int64_t min, int64_t max,
                                     float_status *s)
{
    uint64_t r;
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        /* NaN converts to the largest positive integer */
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? min : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            /* magnitude too large for a 64-bit shift: force saturation */
            r = UINT64_MAX;
        }
        if (p.sign) {
            /* -(uint64_t) min is the magnitude of min, computed
             * without signed overflow */
            if (r <= -(uint64_t) min) {
                return -r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return min;
            }
        } else {
            if (r <= max) {
                return r;
            } else {
                s->float_exception_flags = orig_flags | float_flag_invalid;
                return max;
            }
        }
    default:
        g_assert_not_reached();
    }
}

/*
 * float -> signed int conversions with explicit rounding mode and
 * scale (the input is multiplied by 2**scale before conversion).
 */

int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                              float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
}

int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

/* Convenience wrappers: current rounding mode, no scaling. */

int8_t float16_to_int8(float16 a, float_status *s)
{
    return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float16_to_int16(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float16_to_int32(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float16_to_int64(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float32_to_int16(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float32_to_int32(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float32_to_int64(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t float64_to_int16(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t float64_to_int32(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t float64_to_int64(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: truncating (round-toward-zero) conversions. */

int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the two's complement integer format.
 */

int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
}

int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
}

int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
                                 float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
}

int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
}

int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
}

int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
}

int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
}

int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
}

int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the floating-point value `a' to
 * the unsigned integer format.
The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point
 * Arithmetic---which means in particular that the conversion is
 * rounded according to the current rounding mode.  If `a' is a NaN,
 * the largest unsigned integer is returned.  Otherwise, if the
 * conversion overflows, the largest unsigned integer is returned.  If
 * the 'a' is negative, the result is rounded and zero is returned;
 * values that do not round to zero will raise the inexact exception
 * flag.
 */

/*
 * Common helper: round `in' to an integer (scaled by 2**scale), then
 * saturate into [0, max].  Negative results, NaNs, infinities and
 * overflow raise invalid and discard any inexact raised during
 * rounding (hence the save/restore of orig_flags).
 */
static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
                                       int scale, uint64_t max,
                                       float_status *s)
{
    int orig_flags = get_float_exception_flags(s);
    FloatParts64 p = round_to_int(in, rmode, scale, s);
    uint64_t r;

    switch (p.cls) {
    case float_class_snan:
    case float_class_qnan:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return max;
    case float_class_inf:
        s->float_exception_flags = orig_flags | float_flag_invalid;
        return p.sign ? 0 : max;
    case float_class_zero:
        return 0;
    case float_class_normal:
        if (p.sign) {
            /* negative value that did not round to zero */
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return 0;
        }

        if (p.exp <= DECOMPOSED_BINARY_POINT) {
            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
        } else {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }

        /* For uint64 this will never trip, but if p.exp is too large
         * to shift a decomposed fraction we shall have exited via the
         * 3rd leg above.
         */
        if (r > max) {
            s->float_exception_flags = orig_flags | float_flag_invalid;
            return max;
        }
        return r;
    default:
        g_assert_not_reached();
    }
}

/*
 * float -> unsigned int conversions with explicit rounding mode and
 * scale (the input is multiplied by 2**scale before conversion).
 */

uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
}

uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float32_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
                                  float_status *s)
{
    FloatParts64 p;

    float64_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

/* Convenience wrappers: current rounding mode, no scaling. */

uint8_t float16_to_uint8(float16 a, float_status *s)
{
    return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float16_to_uint16(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float16_to_uint32(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float16_to_uint64(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float32_to_uint16(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float32_to_uint32(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float32_to_uint64(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t float64_to_uint16(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t float64_to_uint32(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t float64_to_uint64(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

/* Convenience wrappers: truncating (round-toward-zero) conversions. */

uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
{
    return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
{
    return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
{
    return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Returns the result of converting the bfloat16 value `a' to
 * the unsigned integer format.
 */

uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
}

uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
}

uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
                                   int scale, float_status *s)
{
    FloatParts64 p;

    bfloat16_unpack_canonical(&p, a, s);
    return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
}

uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
}

uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
}

uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
}

uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
}

uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
}

uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
{
    return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
}

/*
 * Integer to float conversions
 *
 * Returns the result of converting the two's complement integer `a'
 * to the floating-point format.
The conversion is performed according
 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 */

/*
 * Decompose the signed integer `a' (scaled by 2**scale) into
 * canonical parts.  Exact for any int64 since the 64-bit fraction
 * holds the full magnitude; rounding happens in the per-format
 * round_pack callers below.
 */
static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
{
    FloatParts64 r = { .sign = false };

    if (a == 0) {
        r.cls = float_class_zero;
    } else {
        uint64_t f = a;
        int shift;

        r.cls = float_class_normal;
        if (a < 0) {
            /* take the magnitude in unsigned arithmetic; this is
             * well-defined even for INT64_MIN */
            f = -f;
            r.sign = true;
        }
        /* normalize: shift the leading 1 up to the implicit-bit slot */
        shift = clz64(f);
        scale = MIN(MAX(scale, -0x10000), 0x10000);

        r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
        r.frac = f << shift;
    }

    return r;
}

/* Signed int -> float16; narrower int types widen losslessly first. */

float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float16_round_pack_canonical(&pa, status);
}

float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float16_scalbn(a, scale, status);
}

float16 int64_to_float16(int64_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int32_to_float16(int32_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int16_to_float16(int16_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

float16 int8_to_float16(int8_t a, float_status *status)
{
    return int64_to_float16_scalbn(a, 0, status);
}

/* Signed int -> float32. */

float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float32_round_pack_canonical(&pa, status);
}

float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float32_scalbn(a, scale, status);
}

float32 int64_to_float32(int64_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int32_to_float32(int32_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

float32 int16_to_float32(int16_t a, float_status *status)
{
    return int64_to_float32_scalbn(a, 0, status);
}

/* Signed int -> float64. */

float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return float64_round_pack_canonical(&pa, status);
}

float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_float64_scalbn(a, scale, status);
}

float64 int64_to_float64(int64_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int32_to_float64(int32_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

float64 int16_to_float64(int16_t a, float_status *status)
{
    return int64_to_float64_scalbn(a, 0, status);
}

/*
 * Returns the result of converting the two's complement integer `a'
 * to the bfloat16 format.
 */

bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
{
    FloatParts64 pa = int_to_float(a, scale, status);
    return bfloat16_round_pack_canonical(&pa, status);
}

bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, scale, status);
}

bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
{
    return int64_to_bfloat16_scalbn(a, 0, status);
}

/*
 * Unsigned Integer to float conversions
 *
 * Returns the result of converting the unsigned integer `a' to the
 * floating-point format.  The conversion is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3029 */ 3030 3031 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status) 3032 { 3033 FloatParts64 r = { .sign = false }; 3034 int shift; 3035 3036 if (a == 0) { 3037 r.cls = float_class_zero; 3038 } else { 3039 scale = MIN(MAX(scale, -0x10000), 0x10000); 3040 shift = clz64(a); 3041 r.cls = float_class_normal; 3042 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 3043 r.frac = a << shift; 3044 } 3045 3046 return r; 3047 } 3048 3049 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 3050 { 3051 FloatParts64 pa = uint_to_float(a, scale, status); 3052 return float16_round_pack_canonical(&pa, status); 3053 } 3054 3055 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 3056 { 3057 return uint64_to_float16_scalbn(a, scale, status); 3058 } 3059 3060 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 3061 { 3062 return uint64_to_float16_scalbn(a, scale, status); 3063 } 3064 3065 float16 uint64_to_float16(uint64_t a, float_status *status) 3066 { 3067 return uint64_to_float16_scalbn(a, 0, status); 3068 } 3069 3070 float16 uint32_to_float16(uint32_t a, float_status *status) 3071 { 3072 return uint64_to_float16_scalbn(a, 0, status); 3073 } 3074 3075 float16 uint16_to_float16(uint16_t a, float_status *status) 3076 { 3077 return uint64_to_float16_scalbn(a, 0, status); 3078 } 3079 3080 float16 uint8_to_float16(uint8_t a, float_status *status) 3081 { 3082 return uint64_to_float16_scalbn(a, 0, status); 3083 } 3084 3085 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 3086 { 3087 FloatParts64 pa = uint_to_float(a, scale, status); 3088 return float32_round_pack_canonical(&pa, status); 3089 } 3090 3091 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 3092 { 3093 return uint64_to_float32_scalbn(a, scale, status); 3094 } 3095 3096 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 3097 { 3098 return 
uint64_to_float32_scalbn(a, scale, status); 3099 } 3100 3101 float32 uint64_to_float32(uint64_t a, float_status *status) 3102 { 3103 return uint64_to_float32_scalbn(a, 0, status); 3104 } 3105 3106 float32 uint32_to_float32(uint32_t a, float_status *status) 3107 { 3108 return uint64_to_float32_scalbn(a, 0, status); 3109 } 3110 3111 float32 uint16_to_float32(uint16_t a, float_status *status) 3112 { 3113 return uint64_to_float32_scalbn(a, 0, status); 3114 } 3115 3116 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 3117 { 3118 FloatParts64 pa = uint_to_float(a, scale, status); 3119 return float64_round_pack_canonical(&pa, status); 3120 } 3121 3122 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 3123 { 3124 return uint64_to_float64_scalbn(a, scale, status); 3125 } 3126 3127 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 3128 { 3129 return uint64_to_float64_scalbn(a, scale, status); 3130 } 3131 3132 float64 uint64_to_float64(uint64_t a, float_status *status) 3133 { 3134 return uint64_to_float64_scalbn(a, 0, status); 3135 } 3136 3137 float64 uint32_to_float64(uint32_t a, float_status *status) 3138 { 3139 return uint64_to_float64_scalbn(a, 0, status); 3140 } 3141 3142 float64 uint16_to_float64(uint16_t a, float_status *status) 3143 { 3144 return uint64_to_float64_scalbn(a, 0, status); 3145 } 3146 3147 /* 3148 * Returns the result of converting the unsigned integer `a' to the 3149 * bfloat16 format. 
3150 */ 3151 3152 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status) 3153 { 3154 FloatParts64 pa = uint_to_float(a, scale, status); 3155 return bfloat16_round_pack_canonical(&pa, status); 3156 } 3157 3158 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status) 3159 { 3160 return uint64_to_bfloat16_scalbn(a, scale, status); 3161 } 3162 3163 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status) 3164 { 3165 return uint64_to_bfloat16_scalbn(a, scale, status); 3166 } 3167 3168 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status) 3169 { 3170 return uint64_to_bfloat16_scalbn(a, 0, status); 3171 } 3172 3173 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status) 3174 { 3175 return uint64_to_bfloat16_scalbn(a, 0, status); 3176 } 3177 3178 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status) 3179 { 3180 return uint64_to_bfloat16_scalbn(a, 0, status); 3181 } 3182 3183 /* Float Min/Max */ 3184 /* min() and max() functions. These can't be implemented as 3185 * 'compare and pick one input' because that would mishandle 3186 * NaNs and +0 vs -0. 3187 * 3188 * minnum() and maxnum() functions. These are similar to the min() 3189 * and max() functions but if one of the arguments is a QNaN and 3190 * the other is numerical then the numerical argument is returned. 3191 * SNaNs will get quietened before being returned. 3192 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 3193 * and maxNum() operations. min() and max() are the typical min/max 3194 * semantics provided by many CPUs which predate that specification. 3195 * 3196 * minnummag() and maxnummag() functions correspond to minNumMag() 3197 * and minNumMag() from the IEEE-754 2008. 
3198 */ 3199 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin, 3200 bool ieee, bool ismag, float_status *s) 3201 { 3202 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 3203 if (ieee) { 3204 /* Takes two floating-point values `a' and `b', one of 3205 * which is a NaN, and returns the appropriate NaN 3206 * result. If either `a' or `b' is a signaling NaN, 3207 * the invalid exception is raised. 3208 */ 3209 if (is_snan(a.cls) || is_snan(b.cls)) { 3210 return *parts_pick_nan(&a, &b, s); 3211 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 3212 return b; 3213 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 3214 return a; 3215 } 3216 } 3217 return *parts_pick_nan(&a, &b, s); 3218 } else { 3219 int a_exp, b_exp; 3220 3221 switch (a.cls) { 3222 case float_class_normal: 3223 a_exp = a.exp; 3224 break; 3225 case float_class_inf: 3226 a_exp = INT_MAX; 3227 break; 3228 case float_class_zero: 3229 a_exp = INT_MIN; 3230 break; 3231 default: 3232 g_assert_not_reached(); 3233 break; 3234 } 3235 switch (b.cls) { 3236 case float_class_normal: 3237 b_exp = b.exp; 3238 break; 3239 case float_class_inf: 3240 b_exp = INT_MAX; 3241 break; 3242 case float_class_zero: 3243 b_exp = INT_MIN; 3244 break; 3245 default: 3246 g_assert_not_reached(); 3247 break; 3248 } 3249 3250 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 3251 bool a_less = a_exp < b_exp; 3252 if (a_exp == b_exp) { 3253 a_less = a.frac < b.frac; 3254 } 3255 return a_less ^ ismin ? b : a; 3256 } 3257 3258 if (a.sign == b.sign) { 3259 bool a_less = a_exp < b_exp; 3260 if (a_exp == b_exp) { 3261 a_less = a.frac < b.frac; 3262 } 3263 return a.sign ^ a_less ^ ismin ? b : a; 3264 } else { 3265 return a.sign ^ ismin ? 
b : a; 3266 } 3267 } 3268 } 3269 3270 #define MINMAX(sz, name, ismin, isiee, ismag) \ 3271 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 3272 float_status *s) \ 3273 { \ 3274 FloatParts64 pa, pb, pr; \ 3275 float ## sz ## _unpack_canonical(&pa, a, s); \ 3276 float ## sz ## _unpack_canonical(&pb, b, s); \ 3277 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3278 return float ## sz ## _round_pack_canonical(&pr, s); \ 3279 } 3280 3281 MINMAX(16, min, true, false, false) 3282 MINMAX(16, minnum, true, true, false) 3283 MINMAX(16, minnummag, true, true, true) 3284 MINMAX(16, max, false, false, false) 3285 MINMAX(16, maxnum, false, true, false) 3286 MINMAX(16, maxnummag, false, true, true) 3287 3288 MINMAX(32, min, true, false, false) 3289 MINMAX(32, minnum, true, true, false) 3290 MINMAX(32, minnummag, true, true, true) 3291 MINMAX(32, max, false, false, false) 3292 MINMAX(32, maxnum, false, true, false) 3293 MINMAX(32, maxnummag, false, true, true) 3294 3295 MINMAX(64, min, true, false, false) 3296 MINMAX(64, minnum, true, true, false) 3297 MINMAX(64, minnummag, true, true, true) 3298 MINMAX(64, max, false, false, false) 3299 MINMAX(64, maxnum, false, true, false) 3300 MINMAX(64, maxnummag, false, true, true) 3301 3302 #undef MINMAX 3303 3304 #define BF16_MINMAX(name, ismin, isiee, ismag) \ 3305 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s) \ 3306 { \ 3307 FloatParts64 pa, pb, pr; \ 3308 bfloat16_unpack_canonical(&pa, a, s); \ 3309 bfloat16_unpack_canonical(&pb, b, s); \ 3310 pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 3311 return bfloat16_round_pack_canonical(&pr, s); \ 3312 } 3313 3314 BF16_MINMAX(min, true, false, false) 3315 BF16_MINMAX(minnum, true, true, false) 3316 BF16_MINMAX(minnummag, true, true, true) 3317 BF16_MINMAX(max, false, false, false) 3318 BF16_MINMAX(maxnum, false, true, false) 3319 BF16_MINMAX(maxnummag, false, true, true) 3320 3321 #undef BF16_MINMAX 3322 3323 /* Floating point compare 
 */
static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
                                    float_status *s)
{
    /* NaNs compare unordered.  A signaling NaN always raises invalid;
     * a quiet NaN raises invalid only for the signaling (!is_quiet)
     * comparison.
     */
    if (is_nan(a.cls) || is_nan(b.cls)) {
        if (!is_quiet ||
            a.cls == float_class_snan ||
            b.cls == float_class_snan) {
            float_raise(float_flag_invalid, s);
        }
        return float_relation_unordered;
    }

    /* Zeros compare equal regardless of sign (+0 == -0); a zero
     * against a non-zero is ordered by the non-zero operand's sign.
     */
    if (a.cls == float_class_zero) {
        if (b.cls == float_class_zero) {
            return float_relation_equal;
        }
        return b.sign ? float_relation_greater : float_relation_less;
    } else if (b.cls == float_class_zero) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* The only really important thing about infinity is its sign. If
     * both are infinities the sign marks the smallest of the two.
     */
    if (a.cls == float_class_inf) {
        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
            return float_relation_equal;
        }
        return a.sign ? float_relation_less : float_relation_greater;
    } else if (b.cls == float_class_inf) {
        return b.sign ? float_relation_greater : float_relation_less;
    }

    if (a.sign != b.sign) {
        return a.sign ? float_relation_less : float_relation_greater;
    }

    /* Both finite, non-zero, same sign: compare exponent then
     * fraction, inverting the answer when both are negative.
     */
    if (a.exp == b.exp) {
        if (a.frac == b.frac) {
            return float_relation_equal;
        }
        if (a.sign) {
            return a.frac > b.frac ?
                float_relation_less : float_relation_greater;
        } else {
            return a.frac > b.frac ?
                float_relation_greater : float_relation_less;
        }
    } else {
        if (a.sign) {
            return a.exp > b.exp ? float_relation_less : float_relation_greater;
        } else {
            return a.exp > b.exp ?
                float_relation_greater : float_relation_less;
        }
    }
}

/* Expand the softfloat comparison helper for one float width. */
#define COMPARE(name, attr, sz)                                         \
static int attr                                                         \
name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
{                                                                       \
    FloatParts64 pa, pb;                                                \
    float ## sz ## _unpack_canonical(&pa, a, s);                        \
    float ## sz ## _unpack_canonical(&pb, b, s);                        \
    return compare_floats(pa, pb, is_quiet, s);                         \
}

COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)

#undef COMPARE

FloatRelation float16_compare(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, false, s);
}

FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
{
    return soft_f16_compare(a, b, true, s);
}

/* Hardfloat fast path: use the host's ordered comparison macros for
 * the common ordered cases, falling back to the soft implementation
 * for the unordered case (and to set exception flags).
 */
static FloatRelation QEMU_FLATTEN
f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
{
    union_float32 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float32_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float32_compare(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, false, s);
}

FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
{
    return f32_compare(a, b, true, s);
}

/* As f32_compare above, but for float64. */
static FloatRelation QEMU_FLATTEN
f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
{
    union_float64 ua, ub;

    ua.s = xa;
    ub.s = xb;

    if (QEMU_NO_HARDFLOAT) {
        goto soft;
    }

    float64_input_flush2(&ua.s, &ub.s, s);
    if (isgreaterequal(ua.h, ub.h)) {
        if (isgreater(ua.h, ub.h)) {
            return float_relation_greater;
        }
        return float_relation_equal;
    }
    if (likely(isless(ua.h, ub.h))) {
        return float_relation_less;
    }
    /* The only condition remaining is unordered.
     * Fall through to set flags.
     */
 soft:
    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
}

FloatRelation float64_compare(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, false, s);
}

FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
{
    return f64_compare(a, b, true, s);
}

static FloatRelation QEMU_FLATTEN
soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
{
    FloatParts64 pa, pb;

    bfloat16_unpack_canonical(&pa, a, s);
    bfloat16_unpack_canonical(&pb, b, s);
    return compare_floats(pa, pb, is_quiet, s);
}

FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, false, s);
}

FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
{
    return soft_bf16_compare(a, b, true, s);
}

/* Multiply A by 2 raised to the power N.
 */
static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
{
    /* NaN inputs are propagated (and quietened) as usual. */
    if (unlikely(is_nan(a.cls))) {
        parts_return_nan(&a, s);
    }
    if (a.cls == float_class_normal) {
        /* The largest float type (even though not supported by FloatParts64)
         * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
         * still allows rounding to infinity, without allowing overflow
         * within the int32_t that backs FloatParts64.exp.
         */
        n = MIN(MAX(n, -0x10000), 0x10000);
        a.exp += n;
    }
    return a;
}

float16 float16_scalbn(float16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float16_round_pack_canonical(&pr, status);
}

float32 float32_scalbn(float32 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float32_round_pack_canonical(&pr, status);
}

float64 float64_scalbn(float64 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return float64_round_pack_canonical(&pr, status);
}

bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = scalbn_decomposed(pa, n, status);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simpleness we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean however the calculation is slower than before,
 * especially for 64 bit floats.
 */

static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
{
    uint64_t a_frac, r_frac, s_frac;
    int bit, last_bit;

    /* Special cases first: NaN, +-0, negative, +inf. */
    if (is_nan(a.cls)) {
        parts_return_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_zero) {
        return a;  /* sqrt(+-0) = +-0 */
    }
    if (a.sign) {
        /* sqrt of a negative (non-zero) number: invalid operation. */
        float_raise(float_flag_invalid, s);
        parts_default_nan(&a, s);
        return a;
    }
    if (a.cls == float_class_inf) {
        return a;  /* sqrt(+inf) = +inf */
    }

    assert(a.cls == float_class_normal);

    /* We need two overflow bits at the top.  Adding room for that is a
     * right shift.  If the exponent is odd, we can discard the low bit
     * by multiplying the fraction by 2; that's a left shift.  Combine
     * those and we shift right by 1 if the exponent is odd, otherwise 2.
     */
    a_frac = a.frac >> (2 - (a.exp & 1));
    a.exp >>= 1;

    /* Bit-by-bit computation of sqrt.  */
    r_frac = 0;
    s_frac = 0;

    /* Iterate from implicit bit down to the 3 extra bits to compute a
     * properly rounded result. Remember we've inserted two more bits
     * at the top, so these positions are two less.
     */
    bit = DECOMPOSED_BINARY_POINT - 2;
    last_bit = MAX(p->frac_shift - 4, 0);
    do {
        uint64_t q = 1ULL << bit;
        uint64_t t_frac = s_frac + q;
        if (t_frac <= a_frac) {
            s_frac = t_frac + q;
            a_frac -= t_frac;
            r_frac += q;
        }
        a_frac <<= 1;
    } while (--bit >= last_bit);

    /* Undo the right shift done above. If there is any remaining
     * fraction, the result is inexact. Set the sticky bit.
     */
    a.frac = (r_frac << 2) + (a_frac != 0);

    return a;
}

float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
{
    FloatParts64 pa, pr;

    float16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float16_params);
    return float16_round_pack_canonical(&pr, status);
}

static float32 QEMU_SOFTFLOAT_ATTR
soft_f32_sqrt(float32 a, float_status *status)
{
    FloatParts64 pa, pr;

    float32_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float32_params);
    return float32_round_pack_canonical(&pr, status);
}

static float64 QEMU_SOFTFLOAT_ATTR
soft_f64_sqrt(float64 a, float_status *status)
{
    FloatParts64 pa, pr;

    float64_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &float64_params);
    return float64_round_pack_canonical(&pr, status);
}

/* Hardfloat fast path: host sqrtf() is usable only for non-negative
 * zero-or-normal inputs; anything else falls back to softfloat.
 */
float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
{
    union_float32 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float32_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F32_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
                        float32_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrtf(ua.h);
    return ur.s;

 soft:
    return soft_f32_sqrt(ua.s, s);
}

/* As float32_sqrt above, but for float64 via host sqrt(). */
float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
{
    union_float64 ua, ur;

    ua.s = xa;
    if (unlikely(!can_use_fpu(s))) {
        goto soft;
    }

    float64_input_flush1(&ua.s, s);
    if (QEMU_HARDFLOAT_1F64_USE_FP) {
        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
                       fpclassify(ua.h) == FP_ZERO) ||
                     signbit(ua.h))) {
            goto soft;
        }
    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
                        float64_is_neg(ua.s))) {
        goto soft;
    }
    ur.h = sqrt(ua.h);
    return ur.s;

 soft:
    return soft_f64_sqrt(ua.s, s);
}

bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
{
    FloatParts64 pa, pr;

    bfloat16_unpack_canonical(&pa, a, status);
    pr = sqrt_float(pa, status, &bfloat16_params);
    return bfloat16_round_pack_canonical(&pr, status);
}

/*----------------------------------------------------------------------------
| The pattern for a default generated NaN.
*----------------------------------------------------------------------------*/

float16 float16_default_nan(float_status *status)
{
    FloatParts64 p;

    /* parts_default_nan fills in the canonical (decomposed) position;
     * shift the fraction down into the packed field position.
     */
    parts_default_nan(&p, status);
    p.frac >>= float16_params.frac_shift;
    return float16_pack_raw(&p);
}

float32 float32_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float32_params.frac_shift;
    return float32_pack_raw(&p);
}

float64 float64_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= float64_params.frac_shift;
    return float64_pack_raw(&p);
}

float128 float128_default_nan(float_status *status)
{
    FloatParts128 p;

    parts_default_nan(&p, status);
    frac_shr(&p, float128_params.frac_shift);
    return float128_pack_raw(&p);
}

bfloat16 bfloat16_default_nan(float_status *status)
{
    FloatParts64 p;

    parts_default_nan(&p, status);
    p.frac >>= bfloat16_params.frac_shift;
    return bfloat16_pack_raw(&p);
}

/*----------------------------------------------------------------------------
| Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3774 *----------------------------------------------------------------------------*/ 3775 3776 float16 float16_silence_nan(float16 a, float_status *status) 3777 { 3778 FloatParts64 p; 3779 3780 float16_unpack_raw(&p, a); 3781 p.frac <<= float16_params.frac_shift; 3782 parts_silence_nan(&p, status); 3783 p.frac >>= float16_params.frac_shift; 3784 return float16_pack_raw(&p); 3785 } 3786 3787 float32 float32_silence_nan(float32 a, float_status *status) 3788 { 3789 FloatParts64 p; 3790 3791 float32_unpack_raw(&p, a); 3792 p.frac <<= float32_params.frac_shift; 3793 parts_silence_nan(&p, status); 3794 p.frac >>= float32_params.frac_shift; 3795 return float32_pack_raw(&p); 3796 } 3797 3798 float64 float64_silence_nan(float64 a, float_status *status) 3799 { 3800 FloatParts64 p; 3801 3802 float64_unpack_raw(&p, a); 3803 p.frac <<= float64_params.frac_shift; 3804 parts_silence_nan(&p, status); 3805 p.frac >>= float64_params.frac_shift; 3806 return float64_pack_raw(&p); 3807 } 3808 3809 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status) 3810 { 3811 FloatParts64 p; 3812 3813 bfloat16_unpack_raw(&p, a); 3814 p.frac <<= bfloat16_params.frac_shift; 3815 parts_silence_nan(&p, status); 3816 p.frac >>= bfloat16_params.frac_shift; 3817 return bfloat16_pack_raw(&p); 3818 } 3819 3820 float128 float128_silence_nan(float128 a, float_status *status) 3821 { 3822 FloatParts128 p; 3823 3824 float128_unpack_raw(&p, a); 3825 frac_shl(&p, float128_params.frac_shift); 3826 parts_silence_nan(&p, status); 3827 frac_shr(&p, float128_params.frac_shift); 3828 return float128_pack_raw(&p); 3829 } 3830 3831 /*---------------------------------------------------------------------------- 3832 | If `a' is denormal and we are in flush-to-zero mode then set the 3833 | input-denormal exception and return zero. Otherwise just return the value. 
3834 *----------------------------------------------------------------------------*/ 3835 3836 static bool parts_squash_denormal(FloatParts64 p, float_status *status) 3837 { 3838 if (p.exp == 0 && p.frac != 0) { 3839 float_raise(float_flag_input_denormal, status); 3840 return true; 3841 } 3842 3843 return false; 3844 } 3845 3846 float16 float16_squash_input_denormal(float16 a, float_status *status) 3847 { 3848 if (status->flush_inputs_to_zero) { 3849 FloatParts64 p; 3850 3851 float16_unpack_raw(&p, a); 3852 if (parts_squash_denormal(p, status)) { 3853 return float16_set_sign(float16_zero, p.sign); 3854 } 3855 } 3856 return a; 3857 } 3858 3859 float32 float32_squash_input_denormal(float32 a, float_status *status) 3860 { 3861 if (status->flush_inputs_to_zero) { 3862 FloatParts64 p; 3863 3864 float32_unpack_raw(&p, a); 3865 if (parts_squash_denormal(p, status)) { 3866 return float32_set_sign(float32_zero, p.sign); 3867 } 3868 } 3869 return a; 3870 } 3871 3872 float64 float64_squash_input_denormal(float64 a, float_status *status) 3873 { 3874 if (status->flush_inputs_to_zero) { 3875 FloatParts64 p; 3876 3877 float64_unpack_raw(&p, a); 3878 if (parts_squash_denormal(p, status)) { 3879 return float64_set_sign(float64_zero, p.sign); 3880 } 3881 } 3882 return a; 3883 } 3884 3885 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status) 3886 { 3887 if (status->flush_inputs_to_zero) { 3888 FloatParts64 p; 3889 3890 bfloat16_unpack_raw(&p, a); 3891 if (parts_squash_denormal(p, status)) { 3892 return bfloat16_set_sign(bfloat16_zero, p.sign); 3893 } 3894 } 3895 return a; 3896 } 3897 3898 /*---------------------------------------------------------------------------- 3899 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 3900 | and 7, and returns the properly rounded 32-bit integer corresponding to the 3901 | input. If `zSign' is 1, the input is negated before being converted to an 3902 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input
| is simply rounded to an integer, with the inexact exception raised if the
| input cannot be represented exactly as an integer. However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    int32_t z;

    /* roundIncrement is added to the 7 fraction bits before the
     * truncating shift; 0x40 is a half-ulp (ties), 0x7f rounds away.
     */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        /* Round up only if the integer part is currently even. */
        roundIncrement = absZ & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit to make even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        absZ &= ~1;
    }
    z = absZ;
    if ( zSign ) z = - z;
    /* Overflow if the result doesn't fit in 32 bits, or if the sign of
     * the truncated value disagrees with the requested sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? INT32_MIN : INT32_MAX;
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
| Ordinarily, the fixed-point input is simply rounded to an integer, with
| the inexact exception raised if the input cannot be represented exactly as
| an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest positive or negative integer is
| returned.
*----------------------------------------------------------------------------*/

static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
                                 float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;
    int64_t z;

    /* `increment' decides whether to round the integer part absZ0 up,
     * based on the fraction word absZ1 (its top bit is the half-ulp).
     */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Wrap to zero means the value exceeded 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie under nearest-even: force the result even. */
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }
    z = absZ0;
    if ( zSign ) z = - z;
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise(float_flag_invalid, status);
        return zSign ? INT64_MIN : INT64_MAX;
    }
    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit unsigned integer corresponding to the
| input. Ordinarily, the fixed-point input is simply rounded to an integer,
| with the inexact exception raised if the input cannot be represented exactly
| as an integer. However, if the fixed-point input is too large, the invalid
| exception is raised and the largest unsigned integer is returned.
*----------------------------------------------------------------------------*/

/* NOTE(review): return type is int64_t for historical reasons although the
 * value is an unsigned result (e.g. UINT64_MAX on overflow); callers appear
 * to reinterpret the bits as unsigned -- confirm before changing.
 */
static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
                                  uint64_t absZ1, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = (roundingMode == float_round_nearest_even);
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    case float_round_to_odd:
        increment = !(absZ0 & 1) && absZ1;
        break;
    default:
        abort();
    }
    if (increment) {
        ++absZ0;
        /* Wrap to zero: the rounded value exceeds UINT64_MAX. */
        if (absZ0 == 0) {
            float_raise(float_flag_invalid, status);
            return UINT64_MAX;
        }
        if (!(absZ1 << 1) && roundNearestEven) {
            absZ0 &= ~1;
        }
    }

    /* A negative non-zero value cannot be represented unsigned. */
    if (zSign && absZ0) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (absZ1) {
        float_raise(float_flag_inexact, status);
    }
    return absZ0;
}

/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading set bit up to the implicit-bit position
     * (bit 23; clz32 - 8 accounts for the 8 bits above it).
     */
    shiftCount = clz32(aSig) - 8;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the single-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location. This shifted
| significand must be normalized or smaller.
If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int8_t roundIncrement, roundBits;
    bool isTiny;

    /* roundIncrement is added to the 7 low guard bits before the
     * truncating shift; 0x40 is a half-ulp, 0x7f rounds away from zero.
     */
    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    case float_round_to_odd:
        roundIncrement = zSig & 0x80 ? 0 : 0x7f;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    if ( 0xFD <= (uint16_t) zExp ) {
        /* Exponent at or beyond the overflow boundary, or negative
         * (caught by the unsigned cast): handle overflow/underflow.
         */
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd and the truncating modes overflow to the
             * largest finite value instead of infinity; packFloat32
             * with -1 as significand produces that maximal value.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat32(zSign, 0xFF, -!overflow_to_inf);
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < 0x80000000);
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = zSig & 0x80 ? 0 : 0x7f;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie under nearest-even: clear the low bit to make even. */
    if (!(roundBits ^ 0x40) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float32
normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
                             float_status *status)
{
    int8_t shiftCount;

    /* Shift the leading set bit into bit 30 (one below the sign bit),
     * matching roundAndPackFloat32's shifted-significand convention.
     */
    shiftCount = clz32(zSig) - 1;
    return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value represented
| by the denormalized significand `aSig'. The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shiftCount;

    /* Shift the leading set bit up to the implicit-bit position
     * (bit 52; clz64 - 11 accounts for the 11 bits above it).
     */
    shiftCount = clz64(aSig) - 11;
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}

/*----------------------------------------------------------------------------
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
| double-precision floating-point value, returning the result. After being
| shifted into the proper positions, the three fields are simply added
| together to form the result. This means that any integer portion of `zSig'
| will be added into the exponent. Since a properly normalized significand
| will have an integer portion equal to 1, the `zExp' input should be 1 less
| than the desired result exponent whenever `zSig' is a complete, normalized
| significand.
*----------------------------------------------------------------------------*/

static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
{

    return make_float64(
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. Ordinarily, the abstract
| value is simply rounded and packed into the double-precision format, with
| the inexact exception raised if the abstract input cannot be represented
| exactly. However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned. If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven;
    int roundIncrement, roundBits;
    bool isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The round field is the low 10 bits of zSig; choose the increment that
     * implements the requested rounding direction on that field. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Round-to-odd: only bump when the result bit (bit 10) is even. */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned cast folds negative zExp into the same range check, so
     * this single test catches both overflow and subnormal/underflow. */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* With a zero increment (truncating modes away from the
             * overflow direction) the result saturates at the largest
             * finite number instead of infinity. */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess: before rounding, or clearly below range, or the
             * increment does not carry into the integer bit (bit 63). */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: a round field of exactly half clears the LSB. */
    if (!(roundBits ^ 0x200) && roundNearestEven) {
        zSig &= ~1;
    }
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
*----------------------------------------------------------------------------*/

static float64
 normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
                              float_status *status)
{
    int8_t shiftCount;

    /* Bring the leading 1 up to bit 62 and adjust the exponent to match. */
    shiftCount = clz64(zSig) - 1;
    return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
                               status);

}

/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| represented by the denormalized significand `aSig'.  The normalized exponent
| and significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
4378 *----------------------------------------------------------------------------*/ 4379 4380 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 4381 uint64_t *zSigPtr) 4382 { 4383 int8_t shiftCount; 4384 4385 shiftCount = clz64(aSig); 4386 *zSigPtr = aSig<<shiftCount; 4387 *zExpPtr = 1 - shiftCount; 4388 } 4389 4390 /*---------------------------------------------------------------------------- 4391 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4392 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 4393 | and returns the proper extended double-precision floating-point value 4394 | corresponding to the abstract input. Ordinarily, the abstract value is 4395 | rounded and packed into the extended double-precision format, with the 4396 | inexact exception raised if the abstract input cannot be represented 4397 | exactly. However, if the abstract value is too large, the overflow and 4398 | inexact exceptions are raised and an infinity or maximal finite value is 4399 | returned. If the abstract value is too small, the input value is rounded to 4400 | a subnormal number, and the underflow and inexact exceptions are raised if 4401 | the abstract input cannot be represented exactly as a subnormal extended 4402 | double-precision floating-point number. 4403 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 4404 | number of bits as single or double precision, respectively. Otherwise, the 4405 | result is rounded to the full precision of the extended double-precision 4406 | format. 4407 | The input significand must be normalized or smaller. If the input 4408 | significand is not normalized, `zExp' must be 0; in that case, the result 4409 | returned is a subnormal number, and it must not require rounding. The 4410 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 4411 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced precision: round within zSig0 at the single- or
     * double-precision bit position. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        goto precision80;
    }
    /* Fold any low-part bits into the sticky bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* One unsigned compare catches both zExp <= 0 and zExp too large. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny unless the increment carries past the integer bit. */
            isTiny = status->tininess_before_rounding
                  || (zExp < 0 )
                  || (zSig0 <= zSig0 + roundIncrement);
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                float_raise(float_flag_inexact, status);
            }
            zSig0 += roundIncrement;
            /* A carry into bit 63 re-normalizes the result. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Ties-to-even: widen the mask to also clear the result LSB. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        float_raise(float_flag_inexact, status);
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* Significand carry-out: renormalize to 1.0 * 2^(zExp+1). */
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 64-bit precision: zSig1 holds the round and sticky bits. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes saturate at the largest finite value. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            isTiny = status->tininess_before_rounding
                  || (zExp < 0)
                  || !increment
                  || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                float_raise(float_flag_inexact, status);
            }
            /* Recompute the increment: zSig1 changed in the shift above. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                if (!(zSig1 << 1) && roundNearestEven) {
                    zSig0 &= ~1;
                }
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry-out of the 64-bit significand: renormalize. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            if (!(zSig1 << 1) && roundNearestEven) {
                zSig0 &= ~1;
            }
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       bool zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* If the high part is empty, promote the low part into it. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = clz64(zSig0);
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
4646 *----------------------------------------------------------------------------*/ 4647 4648 static inline uint64_t extractFloat128Frac0( float128 a ) 4649 { 4650 4651 return a.high & UINT64_C(0x0000FFFFFFFFFFFF); 4652 4653 } 4654 4655 /*---------------------------------------------------------------------------- 4656 | Returns the exponent bits of the quadruple-precision floating-point value 4657 | `a'. 4658 *----------------------------------------------------------------------------*/ 4659 4660 static inline int32_t extractFloat128Exp( float128 a ) 4661 { 4662 4663 return ( a.high>>48 ) & 0x7FFF; 4664 4665 } 4666 4667 /*---------------------------------------------------------------------------- 4668 | Returns the sign bit of the quadruple-precision floating-point value `a'. 4669 *----------------------------------------------------------------------------*/ 4670 4671 static inline bool extractFloat128Sign(float128 a) 4672 { 4673 return a.high >> 63; 4674 } 4675 4676 /*---------------------------------------------------------------------------- 4677 | Normalizes the subnormal quadruple-precision floating-point value 4678 | represented by the denormalized significand formed by the concatenation of 4679 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 4680 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 4681 | significand are stored at the location pointed to by `zSig0Ptr', and the 4682 | least significant 64 bits of the normalized significand are stored at the 4683 | location pointed to by `zSig1Ptr'. 
4684 *----------------------------------------------------------------------------*/ 4685 4686 static void 4687 normalizeFloat128Subnormal( 4688 uint64_t aSig0, 4689 uint64_t aSig1, 4690 int32_t *zExpPtr, 4691 uint64_t *zSig0Ptr, 4692 uint64_t *zSig1Ptr 4693 ) 4694 { 4695 int8_t shiftCount; 4696 4697 if ( aSig0 == 0 ) { 4698 shiftCount = clz64(aSig1) - 15; 4699 if ( shiftCount < 0 ) { 4700 *zSig0Ptr = aSig1>>( - shiftCount ); 4701 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 4702 } 4703 else { 4704 *zSig0Ptr = aSig1<<shiftCount; 4705 *zSig1Ptr = 0; 4706 } 4707 *zExpPtr = - shiftCount - 63; 4708 } 4709 else { 4710 shiftCount = clz64(aSig0) - 15; 4711 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 4712 *zExpPtr = 1 - shiftCount; 4713 } 4714 4715 } 4716 4717 /*---------------------------------------------------------------------------- 4718 | Packs the sign `zSign', the exponent `zExp', and the significand formed 4719 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 4720 | floating-point value, returning the result. After being shifted into the 4721 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 4722 | added together to form the most significant 32 bits of the result. This 4723 | means that any integer portion of `zSig0' will be added into the exponent. 4724 | Since a properly normalized significand will have an integer portion equal 4725 | to 1, the `zExp' input should be 1 less than the desired result exponent 4726 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 4727 | significand. 
4728 *----------------------------------------------------------------------------*/ 4729 4730 static inline float128 4731 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1) 4732 { 4733 float128 z; 4734 4735 z.low = zSig1; 4736 z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0; 4737 return z; 4738 } 4739 4740 /*---------------------------------------------------------------------------- 4741 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 4742 | and extended significand formed by the concatenation of `zSig0', `zSig1', 4743 | and `zSig2', and returns the proper quadruple-precision floating-point value 4744 | corresponding to the abstract input. Ordinarily, the abstract value is 4745 | simply rounded and packed into the quadruple-precision format, with the 4746 | inexact exception raised if the abstract input cannot be represented 4747 | exactly. However, if the abstract value is too large, the overflow and 4748 | inexact exceptions are raised and an infinity or maximal finite value is 4749 | returned. If the abstract value is too small, the input value is rounded to 4750 | a subnormal number, and the underflow and inexact exceptions are raised if 4751 | the abstract input cannot be represented exactly as a subnormal quadruple- 4752 | precision floating-point number. 4753 | The input significand must be normalized or smaller. If the input 4754 | significand is not normalized, `zExp' must be 0; in that case, the result 4755 | returned is a subnormal number, and it must not require rounding. In the 4756 | usual case that the input significand is normalized, `zExp' must be 1 less 4757 | than the ``true'' floating-point exponent. The handling of underflow and 4758 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    bool roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the round/sticky bits; decide whether to bump zSig0:zSig1. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Round-to-odd: only bump when the result LSB is even. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* Unsigned compare folds negative zExp into the same out-of-range test. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         UINT64_C(0x0001FFFFFFFFFFFF),
                         UINT64_C(0xFFFFFFFFFFFFFFFF),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Truncating modes saturate at the largest finite number. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        UINT64_C(0x0000FFFFFFFFFFFF),
                        UINT64_C(0xFFFFFFFFFFFFFFFF)
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny unless rounding would carry up to the smallest normal. */
            isTiny = status->tininess_before_rounding
                  || (zExp < -1)
                  || !increment
                  || lt128(zSig0, zSig1,
                           UINT64_C(0x0001FFFFFFFFFFFF),
                           UINT64_C(0xFFFFFFFFFFFFFFFF));
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Recompute the increment: zSig1/zSig2 changed in the shift. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        float_raise(float_flag_inexact, status);
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Ties-to-even: exact half (no sticky below the round bit). */
        if ((zSig2 + zSig2 == 0) && roundNearestEven) {
            zSig1 &= ~1;
        }
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point
value corresponding 4877 | to the abstract input. This routine is just like `roundAndPackFloat128' 4878 | except that the input significand has fewer bits and does not have to be 4879 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 4880 | point exponent. 4881 *----------------------------------------------------------------------------*/ 4882 4883 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp, 4884 uint64_t zSig0, uint64_t zSig1, 4885 float_status *status) 4886 { 4887 int8_t shiftCount; 4888 uint64_t zSig2; 4889 4890 if ( zSig0 == 0 ) { 4891 zSig0 = zSig1; 4892 zSig1 = 0; 4893 zExp -= 64; 4894 } 4895 shiftCount = clz64(zSig0) - 15; 4896 if ( 0 <= shiftCount ) { 4897 zSig2 = 0; 4898 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 4899 } 4900 else { 4901 shift128ExtraRightJamming( 4902 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 4903 } 4904 zExp -= shiftCount; 4905 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 4906 4907 } 4908 4909 4910 /*---------------------------------------------------------------------------- 4911 | Returns the result of converting the 32-bit two's complement integer `a' 4912 | to the extended double-precision floating-point format. The conversion 4913 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4914 | Arithmetic. 4915 *----------------------------------------------------------------------------*/ 4916 4917 floatx80 int32_to_floatx80(int32_t a, float_status *status) 4918 { 4919 bool zSign; 4920 uint32_t absA; 4921 int8_t shiftCount; 4922 uint64_t zSig; 4923 4924 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4925 zSign = ( a < 0 ); 4926 absA = zSign ? 
- a : a; 4927 shiftCount = clz32(absA) + 32; 4928 zSig = absA; 4929 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 4930 4931 } 4932 4933 /*---------------------------------------------------------------------------- 4934 | Returns the result of converting the 32-bit two's complement integer `a' to 4935 | the quadruple-precision floating-point format. The conversion is performed 4936 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4937 *----------------------------------------------------------------------------*/ 4938 4939 float128 int32_to_float128(int32_t a, float_status *status) 4940 { 4941 bool zSign; 4942 uint32_t absA; 4943 int8_t shiftCount; 4944 uint64_t zSig0; 4945 4946 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4947 zSign = ( a < 0 ); 4948 absA = zSign ? - a : a; 4949 shiftCount = clz32(absA) + 17; 4950 zSig0 = absA; 4951 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 4952 4953 } 4954 4955 /*---------------------------------------------------------------------------- 4956 | Returns the result of converting the 64-bit two's complement integer `a' 4957 | to the extended double-precision floating-point format. The conversion 4958 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4959 | Arithmetic. 4960 *----------------------------------------------------------------------------*/ 4961 4962 floatx80 int64_to_floatx80(int64_t a, float_status *status) 4963 { 4964 bool zSign; 4965 uint64_t absA; 4966 int8_t shiftCount; 4967 4968 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 4969 zSign = ( a < 0 ); 4970 absA = zSign ? 
- a : a; 4971 shiftCount = clz64(absA); 4972 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 4973 4974 } 4975 4976 /*---------------------------------------------------------------------------- 4977 | Returns the result of converting the 64-bit two's complement integer `a' to 4978 | the quadruple-precision floating-point format. The conversion is performed 4979 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4980 *----------------------------------------------------------------------------*/ 4981 4982 float128 int64_to_float128(int64_t a, float_status *status) 4983 { 4984 bool zSign; 4985 uint64_t absA; 4986 int8_t shiftCount; 4987 int32_t zExp; 4988 uint64_t zSig0, zSig1; 4989 4990 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 4991 zSign = ( a < 0 ); 4992 absA = zSign ? - a : a; 4993 shiftCount = clz64(absA) + 49; 4994 zExp = 0x406E - shiftCount; 4995 if ( 64 <= shiftCount ) { 4996 zSig1 = 0; 4997 zSig0 = absA; 4998 shiftCount -= 64; 4999 } 5000 else { 5001 zSig1 = absA; 5002 zSig0 = 0; 5003 } 5004 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 5005 return packFloat128( zSign, zExp, zSig0, zSig1 ); 5006 5007 } 5008 5009 /*---------------------------------------------------------------------------- 5010 | Returns the result of converting the 64-bit unsigned integer `a' 5011 | to the quadruple-precision floating-point format. The conversion is performed 5012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 uint64_to_float128(uint64_t a, float_status *status)
{
    if (a == 0) {
        return float128_zero;
    }
    /* 0x406E is the biased exponent for a significand whose leading 1 would
     * be the LSB of the high word; normalization fixes up the rest. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float32_to_floatx80(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            /* NaN payload is propagated, then silenced for the x80 format. */
            floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Make the integer bit explicit, rebias (0x3FFF - 0x7F = 0x3F80), and
     * left-justify the 24-bit significand in 64 bits. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );

}

/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

float128 float32_to_float128(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
        }
        return packFloat128( aSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        /* normalize leaves an explicit integer bit; drop the exponent by one
         * so packing (which adds it back) yields the right value. */
        --aExp;
    }
    /* Rebias and shift the 23 fraction bits up to bits 47..25 of the high
     * word of the quad significand. */
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );

}

/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* Special cases: NaN operands propagate; rem(inf, x) and rem(x, 0)
     * are invalid; rem(x, inf) is x. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one (at most 32-bit) division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* |a| < |b|/2: the remainder is a itself. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large gap: peel off 62 quotient bits per iteration using the
         * 128/64 division estimate (which may overshoot by at most 2). */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* The estimate may leave a negative partial remainder; step back until
     * it goes negative, then pick the half that rounds to nearest-even. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1.  -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2.  -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
 *----------------------------------------------------------------------------*/

/* Reciprocal factorials 1/n! for n = 1..15, as float64 bit patterns, used as
 * the Taylor-series coefficients of e^x in float32_exp2() below. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};

float32 float32_exp2(float32 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint32_t aSig;
    float64 r, x, xn;
    int i;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0xFF) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* 2^-Inf = +0, 2^+Inf = +Inf. */
        return (aSign) ? float32_zero : a;
    }
    if (aExp == 0) {
        /* 2^(+/-0) = 1; subnormal inputs fall through to the series below. */
        if (aSig == 0) return float32_one;
    }

    float_raise(float_flag_inexact, status);

    /* ******************************* */
    /* using float64 for approximation */
    /* ******************************* */
    x = float32_to_float64(a, status);
    x = float64_mul(x, float64_ln2, status);    /* 2^a = e^(a*ln 2) */

    /* Evaluate the Taylor series of e^x term by term. */
    xn = x;
    r = float64_one;
    for (i = 0 ; i < 15 ; i++) {
        float64 f;

        f = float64_mul(xn, float32_exp2_coefficients[i], status);
        r = float64_add(r, f, status);

        xn = float64_mul(xn, x, status);    /* xn = x^(i+2) for next term */
    }

    return float64_to_float32(r, status);
}

/*----------------------------------------------------------------------------
| Returns the binary log of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float32 float32_log2(float32 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint32_t aSig, zSig, i;

    a = float32_squash_input_denormal(a, status);
    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative value is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part of the result is the unbiased exponent; the fractional
     * part is generated below one bit per iteration by repeated squaring
     * of the significand. */
    aExp -= 0x7F;
    aSig |= 0x00800000;
    zSign = aExp < 0;
    zSig = aExp << 23;

    for (i = 1 << 22; i > 0; i >>= 1) {
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
        if ( aSig & 0x01000000 ) {
            /* Squared value reached 2: this bit of log2 is 1; renormalize. */
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;

    return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the extended double-precision floating-point format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 float64_to_floatx80(float64 a, float_status *status)
{
    bool aSign;
    int aExp;
    uint64_t aSig;

    a = float64_squash_input_denormal(a, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            /* NaN: convert via the canonical form, then quieten. */
            floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
                                               status);
            return floatx80_silence_nan(res, status);
        }
        return packFloatx80(aSign,
                            floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias the exponent and left-justify the 53-bit significand (with its
     * now-explicit integer bit) in the 64-bit floatx80 fraction. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the double-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5369 *----------------------------------------------------------------------------*/ 5370 5371 float128 float64_to_float128(float64 a, float_status *status) 5372 { 5373 bool aSign; 5374 int aExp; 5375 uint64_t aSig, zSig0, zSig1; 5376 5377 a = float64_squash_input_denormal(a, status); 5378 aSig = extractFloat64Frac( a ); 5379 aExp = extractFloat64Exp( a ); 5380 aSign = extractFloat64Sign( a ); 5381 if ( aExp == 0x7FF ) { 5382 if (aSig) { 5383 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 5384 } 5385 return packFloat128( aSign, 0x7FFF, 0, 0 ); 5386 } 5387 if ( aExp == 0 ) { 5388 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 5389 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 5390 --aExp; 5391 } 5392 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 5393 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 5394 5395 } 5396 5397 5398 /*---------------------------------------------------------------------------- 5399 | Returns the remainder of the double-precision floating-point value `a' 5400 | with respect to the corresponding value `b'. The operation is performed 5401 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    bool aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* rem(NaN, x) and rem(x, NaN) propagate the NaN; rem(Inf, x) is invalid. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* rem(x, Inf) = x for finite x. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(x, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* rem(0, y) = 0 (sign of a preserved). */
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the implicit integer bit explicit and left-justify. */
    aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
    bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: a is already the remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce 62 quotient bits per pass via the 128/64 division estimate. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Subtract 2 so the estimate never exceeds the true quotient. */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past the exact quotient; aSig goes negative on the final pass. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Pick the remainder nearest zero; break ties toward even quotient
     * (IEEE remainder semantics). */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
float64 float64_log2(float64 a, float_status *status)
{
    bool aSign, zSign;
    int aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );

    if ( aExp == 0 ) {
        /* log2(+/-0) = -Inf. */
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( aSign ) {
        /* log2 of a negative value is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero, status);
        }
        /* log2(+Inf) = +Inf. */
        return a;
    }

    /* Integer part is the unbiased exponent; fractional bits are produced
     * one per iteration by repeated squaring of the significand. */
    aExp -= 0x3FF;
    aSig |= UINT64_C(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;
    for (i = 1LL << 51; i > 0; i >>= 1) {
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
        if ( aSig & UINT64_C(0x0020000000000000) ) {
            /* Squared value reached 2: this bit of log2 is 1; renormalize. */
            aSig >>= 1;
            zSig |= i;
        }
    }

    if ( zSign )
        zSig = -zSig;
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic---which means in particular that the conversion
| is rounded according to the current rounding mode.  If `a' is a NaN, the
| largest positive integer is returned.  Otherwise, if the conversion
| overflows, the largest integer with the same sign as `a' is returned.
*----------------------------------------------------------------------------*/

int32_t floatx80_to_int32(floatx80 a, float_status *status)
{
    bool aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        /* NOTE(review): `1 << 31` left-shifts into the sign bit of a signed
         * int, which is formally undefined in standard C; INT32_MIN would be
         * the portable spelling — confirm against project conventions. */
        return 1 << 31;
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* NaN input: force positive so overflow returns the largest positive. */
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    if ( shiftCount <= 0 ) shiftCount = 1;
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32(aSign, aSig, status);

}

/*----------------------------------------------------------------------------
| Returns the result of converting the extended double-precision floating-
| point value `a' to the 32-bit two's complement integer format.  The
| conversion is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic, except that the conversion is always rounded
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
5572 | Otherwise, if the conversion overflows, the largest integer with the same 5573 | sign as `a' is returned. 5574 *----------------------------------------------------------------------------*/ 5575 5576 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 5577 { 5578 bool aSign; 5579 int32_t aExp, shiftCount; 5580 uint64_t aSig, savedASig; 5581 int32_t z; 5582 5583 if (floatx80_invalid_encoding(a)) { 5584 float_raise(float_flag_invalid, status); 5585 return 1 << 31; 5586 } 5587 aSig = extractFloatx80Frac( a ); 5588 aExp = extractFloatx80Exp( a ); 5589 aSign = extractFloatx80Sign( a ); 5590 if ( 0x401E < aExp ) { 5591 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 5592 goto invalid; 5593 } 5594 else if ( aExp < 0x3FFF ) { 5595 if (aExp || aSig) { 5596 float_raise(float_flag_inexact, status); 5597 } 5598 return 0; 5599 } 5600 shiftCount = 0x403E - aExp; 5601 savedASig = aSig; 5602 aSig >>= shiftCount; 5603 z = aSig; 5604 if ( aSign ) z = - z; 5605 if ( ( z < 0 ) ^ aSign ) { 5606 invalid: 5607 float_raise(float_flag_invalid, status); 5608 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5609 } 5610 if ( ( aSig<<shiftCount ) != savedASig ) { 5611 float_raise(float_flag_inexact, status); 5612 } 5613 return z; 5614 5615 } 5616 5617 /*---------------------------------------------------------------------------- 5618 | Returns the result of converting the extended double-precision floating- 5619 | point value `a' to the 64-bit two's complement integer format. The 5620 | conversion is performed according to the IEC/IEEE Standard for Binary 5621 | Floating-Point Arithmetic---which means in particular that the conversion 5622 | is rounded according to the current rounding mode. If `a' is a NaN, 5623 | the largest positive integer is returned. Otherwise, if the conversion 5624 | overflows, the largest integer with the same sign as `a' is returned. 
5625 *----------------------------------------------------------------------------*/ 5626 5627 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5628 { 5629 bool aSign; 5630 int32_t aExp, shiftCount; 5631 uint64_t aSig, aSigExtra; 5632 5633 if (floatx80_invalid_encoding(a)) { 5634 float_raise(float_flag_invalid, status); 5635 return 1ULL << 63; 5636 } 5637 aSig = extractFloatx80Frac( a ); 5638 aExp = extractFloatx80Exp( a ); 5639 aSign = extractFloatx80Sign( a ); 5640 shiftCount = 0x403E - aExp; 5641 if ( shiftCount <= 0 ) { 5642 if ( shiftCount ) { 5643 float_raise(float_flag_invalid, status); 5644 if (!aSign || floatx80_is_any_nan(a)) { 5645 return INT64_MAX; 5646 } 5647 return INT64_MIN; 5648 } 5649 aSigExtra = 0; 5650 } 5651 else { 5652 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5653 } 5654 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5655 5656 } 5657 5658 /*---------------------------------------------------------------------------- 5659 | Returns the result of converting the extended double-precision floating- 5660 | point value `a' to the 64-bit two's complement integer format. The 5661 | conversion is performed according to the IEC/IEEE Standard for Binary 5662 | Floating-Point Arithmetic, except that the conversion is always rounded 5663 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5664 | Otherwise, if the conversion overflows, the largest integer with the same 5665 | sign as `a' is returned. 
5666 *----------------------------------------------------------------------------*/ 5667 5668 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5669 { 5670 bool aSign; 5671 int32_t aExp, shiftCount; 5672 uint64_t aSig; 5673 int64_t z; 5674 5675 if (floatx80_invalid_encoding(a)) { 5676 float_raise(float_flag_invalid, status); 5677 return 1ULL << 63; 5678 } 5679 aSig = extractFloatx80Frac( a ); 5680 aExp = extractFloatx80Exp( a ); 5681 aSign = extractFloatx80Sign( a ); 5682 shiftCount = aExp - 0x403E; 5683 if ( 0 <= shiftCount ) { 5684 aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF); 5685 if ( ( a.high != 0xC03E ) || aSig ) { 5686 float_raise(float_flag_invalid, status); 5687 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5688 return INT64_MAX; 5689 } 5690 } 5691 return INT64_MIN; 5692 } 5693 else if ( aExp < 0x3FFF ) { 5694 if (aExp | aSig) { 5695 float_raise(float_flag_inexact, status); 5696 } 5697 return 0; 5698 } 5699 z = aSig>>( - shiftCount ); 5700 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5701 float_raise(float_flag_inexact, status); 5702 } 5703 if ( aSign ) z = - z; 5704 return z; 5705 5706 } 5707 5708 /*---------------------------------------------------------------------------- 5709 | Returns the result of converting the extended double-precision floating- 5710 | point value `a' to the single-precision floating-point format. The 5711 | conversion is performed according to the IEC/IEEE Standard for Binary 5712 | Floating-Point Arithmetic. 
5713 *----------------------------------------------------------------------------*/ 5714 5715 float32 floatx80_to_float32(floatx80 a, float_status *status) 5716 { 5717 bool aSign; 5718 int32_t aExp; 5719 uint64_t aSig; 5720 5721 if (floatx80_invalid_encoding(a)) { 5722 float_raise(float_flag_invalid, status); 5723 return float32_default_nan(status); 5724 } 5725 aSig = extractFloatx80Frac( a ); 5726 aExp = extractFloatx80Exp( a ); 5727 aSign = extractFloatx80Sign( a ); 5728 if ( aExp == 0x7FFF ) { 5729 if ( (uint64_t) ( aSig<<1 ) ) { 5730 float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status), 5731 status); 5732 return float32_silence_nan(res, status); 5733 } 5734 return packFloat32( aSign, 0xFF, 0 ); 5735 } 5736 shift64RightJamming( aSig, 33, &aSig ); 5737 if ( aExp || aSig ) aExp -= 0x3F81; 5738 return roundAndPackFloat32(aSign, aExp, aSig, status); 5739 5740 } 5741 5742 /*---------------------------------------------------------------------------- 5743 | Returns the result of converting the extended double-precision floating- 5744 | point value `a' to the double-precision floating-point format. The 5745 | conversion is performed according to the IEC/IEEE Standard for Binary 5746 | Floating-Point Arithmetic. 
5747 *----------------------------------------------------------------------------*/ 5748 5749 float64 floatx80_to_float64(floatx80 a, float_status *status) 5750 { 5751 bool aSign; 5752 int32_t aExp; 5753 uint64_t aSig, zSig; 5754 5755 if (floatx80_invalid_encoding(a)) { 5756 float_raise(float_flag_invalid, status); 5757 return float64_default_nan(status); 5758 } 5759 aSig = extractFloatx80Frac( a ); 5760 aExp = extractFloatx80Exp( a ); 5761 aSign = extractFloatx80Sign( a ); 5762 if ( aExp == 0x7FFF ) { 5763 if ( (uint64_t) ( aSig<<1 ) ) { 5764 float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status), 5765 status); 5766 return float64_silence_nan(res, status); 5767 } 5768 return packFloat64( aSign, 0x7FF, 0 ); 5769 } 5770 shift64RightJamming( aSig, 1, &zSig ); 5771 if ( aExp || aSig ) aExp -= 0x3C01; 5772 return roundAndPackFloat64(aSign, aExp, zSig, status); 5773 5774 } 5775 5776 /*---------------------------------------------------------------------------- 5777 | Returns the result of converting the extended double-precision floating- 5778 | point value `a' to the quadruple-precision floating-point format. The 5779 | conversion is performed according to the IEC/IEEE Standard for Binary 5780 | Floating-Point Arithmetic. 
5781 *----------------------------------------------------------------------------*/ 5782 5783 float128 floatx80_to_float128(floatx80 a, float_status *status) 5784 { 5785 bool aSign; 5786 int aExp; 5787 uint64_t aSig, zSig0, zSig1; 5788 5789 if (floatx80_invalid_encoding(a)) { 5790 float_raise(float_flag_invalid, status); 5791 return float128_default_nan(status); 5792 } 5793 aSig = extractFloatx80Frac( a ); 5794 aExp = extractFloatx80Exp( a ); 5795 aSign = extractFloatx80Sign( a ); 5796 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5797 float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status), 5798 status); 5799 return float128_silence_nan(res, status); 5800 } 5801 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5802 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5803 5804 } 5805 5806 /*---------------------------------------------------------------------------- 5807 | Rounds the extended double-precision floating-point value `a' 5808 | to the precision provided by floatx80_rounding_precision and returns the 5809 | result as an extended double-precision floating-point value. 5810 | The operation is performed according to the IEC/IEEE Standard for Binary 5811 | Floating-Point Arithmetic. 5812 *----------------------------------------------------------------------------*/ 5813 5814 floatx80 floatx80_round(floatx80 a, float_status *status) 5815 { 5816 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5817 extractFloatx80Sign(a), 5818 extractFloatx80Exp(a), 5819 extractFloatx80Frac(a), 0, status); 5820 } 5821 5822 /*---------------------------------------------------------------------------- 5823 | Rounds the extended double-precision floating-point value `a' to an integer, 5824 | and returns the result as an extended quadruple-precision floating-point 5825 | value. The operation is performed according to the IEC/IEEE Standard for 5826 | Binary Floating-Point Arithmetic. 
5827 *----------------------------------------------------------------------------*/ 5828 5829 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5830 { 5831 bool aSign; 5832 int32_t aExp; 5833 uint64_t lastBitMask, roundBitsMask; 5834 floatx80 z; 5835 5836 if (floatx80_invalid_encoding(a)) { 5837 float_raise(float_flag_invalid, status); 5838 return floatx80_default_nan(status); 5839 } 5840 aExp = extractFloatx80Exp( a ); 5841 if ( 0x403E <= aExp ) { 5842 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5843 return propagateFloatx80NaN(a, a, status); 5844 } 5845 return a; 5846 } 5847 if ( aExp < 0x3FFF ) { 5848 if ( ( aExp == 0 ) 5849 && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) { 5850 return a; 5851 } 5852 float_raise(float_flag_inexact, status); 5853 aSign = extractFloatx80Sign( a ); 5854 switch (status->float_rounding_mode) { 5855 case float_round_nearest_even: 5856 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5857 ) { 5858 return 5859 packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5860 } 5861 break; 5862 case float_round_ties_away: 5863 if (aExp == 0x3FFE) { 5864 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000)); 5865 } 5866 break; 5867 case float_round_down: 5868 return 5869 aSign ? 5870 packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000)) 5871 : packFloatx80( 0, 0, 0 ); 5872 case float_round_up: 5873 return 5874 aSign ? 
packFloatx80( 1, 0, 0 ) 5875 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000)); 5876 5877 case float_round_to_zero: 5878 break; 5879 default: 5880 g_assert_not_reached(); 5881 } 5882 return packFloatx80( aSign, 0, 0 ); 5883 } 5884 lastBitMask = 1; 5885 lastBitMask <<= 0x403E - aExp; 5886 roundBitsMask = lastBitMask - 1; 5887 z = a; 5888 switch (status->float_rounding_mode) { 5889 case float_round_nearest_even: 5890 z.low += lastBitMask>>1; 5891 if ((z.low & roundBitsMask) == 0) { 5892 z.low &= ~lastBitMask; 5893 } 5894 break; 5895 case float_round_ties_away: 5896 z.low += lastBitMask >> 1; 5897 break; 5898 case float_round_to_zero: 5899 break; 5900 case float_round_up: 5901 if (!extractFloatx80Sign(z)) { 5902 z.low += roundBitsMask; 5903 } 5904 break; 5905 case float_round_down: 5906 if (extractFloatx80Sign(z)) { 5907 z.low += roundBitsMask; 5908 } 5909 break; 5910 default: 5911 abort(); 5912 } 5913 z.low &= ~ roundBitsMask; 5914 if ( z.low == 0 ) { 5915 ++z.high; 5916 z.low = UINT64_C(0x8000000000000000); 5917 } 5918 if (z.low != a.low) { 5919 float_raise(float_flag_inexact, status); 5920 } 5921 return z; 5922 5923 } 5924 5925 /*---------------------------------------------------------------------------- 5926 | Returns the result of adding the absolute values of the extended double- 5927 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5928 | negated before being returned. `zSign' is ignored if the result is a NaN. 5929 | The addition is performed according to the IEC/IEEE Standard for Binary 5930 | Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    int32_t expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b's fraction to a's. */
        if ( aExp == 0x7FFF ) {
            if ((uint64_t)(aSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;   /* Inf + finite = Inf */
        }
        /* Subnormal b has the same effective exponent as exp 1. */
        if ( bExp == 0 ) --expDiff;
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a's fraction to b's. */
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift needed. */
        if ( aExp == 0x7FFF ) {
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN(a, b, status);
            }
            return a;   /* Inf + Inf (same sign) = Inf */
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
                /* At least one of the values is a pseudo-denormal,
                 * and there is a carry out of the result.  */
                zExp = 1;
                goto shiftRight1;
            }
            if (zSig0 == 0) {
                return packFloatx80(zSign, 0, 0);
            }
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        /* Equal exponents always produce a carry past the integer bit. */
        goto shiftRight1;
    }
    zSig0 = aSig + bSig;
    /* Top bit set means the explicit integer bit survived: no carry. */
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
 shiftRight1:
    /* Carry out of the 64-bit sum: shift right one and bump the exponent,
     * restoring the explicit integer bit. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= UINT64_C(0x8000000000000000);
    ++zExp;
 roundAndPack:
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the extended
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
6011 *----------------------------------------------------------------------------*/ 6012 6013 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign, 6014 float_status *status) 6015 { 6016 int32_t aExp, bExp, zExp; 6017 uint64_t aSig, bSig, zSig0, zSig1; 6018 int32_t expDiff; 6019 6020 aSig = extractFloatx80Frac( a ); 6021 aExp = extractFloatx80Exp( a ); 6022 bSig = extractFloatx80Frac( b ); 6023 bExp = extractFloatx80Exp( b ); 6024 expDiff = aExp - bExp; 6025 if ( 0 < expDiff ) goto aExpBigger; 6026 if ( expDiff < 0 ) goto bExpBigger; 6027 if ( aExp == 0x7FFF ) { 6028 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 6029 return propagateFloatx80NaN(a, b, status); 6030 } 6031 float_raise(float_flag_invalid, status); 6032 return floatx80_default_nan(status); 6033 } 6034 if ( aExp == 0 ) { 6035 aExp = 1; 6036 bExp = 1; 6037 } 6038 zSig1 = 0; 6039 if ( bSig < aSig ) goto aBigger; 6040 if ( aSig < bSig ) goto bBigger; 6041 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 6042 bExpBigger: 6043 if ( bExp == 0x7FFF ) { 6044 if ((uint64_t)(bSig << 1)) { 6045 return propagateFloatx80NaN(a, b, status); 6046 } 6047 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 6048 floatx80_infinity_low); 6049 } 6050 if ( aExp == 0 ) ++expDiff; 6051 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 6052 bBigger: 6053 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 6054 zExp = bExp; 6055 zSign ^= 1; 6056 goto normalizeRoundAndPack; 6057 aExpBigger: 6058 if ( aExp == 0x7FFF ) { 6059 if ((uint64_t)(aSig << 1)) { 6060 return propagateFloatx80NaN(a, b, status); 6061 } 6062 return a; 6063 } 6064 if ( bExp == 0 ) --expDiff; 6065 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 6066 aBigger: 6067 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 6068 zExp = aExp; 6069 normalizeRoundAndPack: 6070 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 6071 zSign, zExp, zSig0, zSig1, status); 6072 } 6073 6074 
/*---------------------------------------------------------------------------- 6075 | Returns the result of adding the extended double-precision floating-point 6076 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6077 | Standard for Binary Floating-Point Arithmetic. 6078 *----------------------------------------------------------------------------*/ 6079 6080 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 6081 { 6082 bool aSign, bSign; 6083 6084 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6085 float_raise(float_flag_invalid, status); 6086 return floatx80_default_nan(status); 6087 } 6088 aSign = extractFloatx80Sign( a ); 6089 bSign = extractFloatx80Sign( b ); 6090 if ( aSign == bSign ) { 6091 return addFloatx80Sigs(a, b, aSign, status); 6092 } 6093 else { 6094 return subFloatx80Sigs(a, b, aSign, status); 6095 } 6096 6097 } 6098 6099 /*---------------------------------------------------------------------------- 6100 | Returns the result of subtracting the extended double-precision floating- 6101 | point values `a' and `b'. The operation is performed according to the 6102 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
 *----------------------------------------------------------------------------*/

floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    /* Same signs subtract the magnitudes; opposite signs add them. */
    if ( aSign == bSign ) {
        return subFloatx80Sigs(a, b, aSign, status);
    }
    else {
        return addFloatx80Sigs(a, b, aSign, status);
    }

}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    bool aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* Inf * 0 is invalid; Inf * finite = Inf. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* 0 * Inf is invalid. */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                            floatx80_infinity_low);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Full 64x64 -> 128-bit product of the significands. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* Product of two normalized significands is in [1, 4); renormalize
     * into [2, 4) if the top bit is clear. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}

/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6190 *----------------------------------------------------------------------------*/ 6191 6192 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 6193 { 6194 bool aSign, bSign, zSign; 6195 int32_t aExp, bExp, zExp; 6196 uint64_t aSig, bSig, zSig0, zSig1; 6197 uint64_t rem0, rem1, rem2, term0, term1, term2; 6198 6199 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6200 float_raise(float_flag_invalid, status); 6201 return floatx80_default_nan(status); 6202 } 6203 aSig = extractFloatx80Frac( a ); 6204 aExp = extractFloatx80Exp( a ); 6205 aSign = extractFloatx80Sign( a ); 6206 bSig = extractFloatx80Frac( b ); 6207 bExp = extractFloatx80Exp( b ); 6208 bSign = extractFloatx80Sign( b ); 6209 zSign = aSign ^ bSign; 6210 if ( aExp == 0x7FFF ) { 6211 if ((uint64_t)(aSig << 1)) { 6212 return propagateFloatx80NaN(a, b, status); 6213 } 6214 if ( bExp == 0x7FFF ) { 6215 if ((uint64_t)(bSig << 1)) { 6216 return propagateFloatx80NaN(a, b, status); 6217 } 6218 goto invalid; 6219 } 6220 return packFloatx80(zSign, floatx80_infinity_high, 6221 floatx80_infinity_low); 6222 } 6223 if ( bExp == 0x7FFF ) { 6224 if ((uint64_t)(bSig << 1)) { 6225 return propagateFloatx80NaN(a, b, status); 6226 } 6227 return packFloatx80( zSign, 0, 0 ); 6228 } 6229 if ( bExp == 0 ) { 6230 if ( bSig == 0 ) { 6231 if ( ( aExp | aSig ) == 0 ) { 6232 invalid: 6233 float_raise(float_flag_invalid, status); 6234 return floatx80_default_nan(status); 6235 } 6236 float_raise(float_flag_divbyzero, status); 6237 return packFloatx80(zSign, floatx80_infinity_high, 6238 floatx80_infinity_low); 6239 } 6240 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6241 } 6242 if ( aExp == 0 ) { 6243 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 6244 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 6245 } 6246 zExp = aExp - bExp + 0x3FFE; 6247 rem1 = 0; 6248 if ( bSig <= aSig ) { 6249 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 6250 ++zExp; 6251 } 6252 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 6253 mul64To128( bSig, zSig0, &term0, &term1 ); 6254 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 6255 while ( (int64_t) rem0 < 0 ) { 6256 --zSig0; 6257 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 6258 } 6259 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 6260 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 6261 mul64To128( bSig, zSig1, &term1, &term2 ); 6262 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6263 while ( (int64_t) rem1 < 0 ) { 6264 --zSig1; 6265 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 6266 } 6267 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 6268 } 6269 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6270 zSign, zExp, zSig0, zSig1, status); 6271 } 6272 6273 /*---------------------------------------------------------------------------- 6274 | Returns the remainder of the extended double-precision floating-point value 6275 | `a' with respect to the corresponding value `b'. The operation is performed 6276 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic, 6277 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating 6278 | the quotient toward zero instead. '*quotient' is set to the low 64 bits of 6279 | the absolute value of the integer quotient. 
6280 *----------------------------------------------------------------------------*/ 6281 6282 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient, 6283 float_status *status) 6284 { 6285 bool aSign, zSign; 6286 int32_t aExp, bExp, expDiff, aExpOrig; 6287 uint64_t aSig0, aSig1, bSig; 6288 uint64_t q, term0, term1, alternateASig0, alternateASig1; 6289 6290 *quotient = 0; 6291 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6292 float_raise(float_flag_invalid, status); 6293 return floatx80_default_nan(status); 6294 } 6295 aSig0 = extractFloatx80Frac( a ); 6296 aExpOrig = aExp = extractFloatx80Exp( a ); 6297 aSign = extractFloatx80Sign( a ); 6298 bSig = extractFloatx80Frac( b ); 6299 bExp = extractFloatx80Exp( b ); 6300 if ( aExp == 0x7FFF ) { 6301 if ( (uint64_t) ( aSig0<<1 ) 6302 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 6303 return propagateFloatx80NaN(a, b, status); 6304 } 6305 goto invalid; 6306 } 6307 if ( bExp == 0x7FFF ) { 6308 if ((uint64_t)(bSig << 1)) { 6309 return propagateFloatx80NaN(a, b, status); 6310 } 6311 if (aExp == 0 && aSig0 >> 63) { 6312 /* 6313 * Pseudo-denormal argument must be returned in normalized 6314 * form. 6315 */ 6316 return packFloatx80(aSign, 1, aSig0); 6317 } 6318 return a; 6319 } 6320 if ( bExp == 0 ) { 6321 if ( bSig == 0 ) { 6322 invalid: 6323 float_raise(float_flag_invalid, status); 6324 return floatx80_default_nan(status); 6325 } 6326 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 6327 } 6328 if ( aExp == 0 ) { 6329 if ( aSig0 == 0 ) return a; 6330 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6331 } 6332 zSign = aSign; 6333 expDiff = aExp - bExp; 6334 aSig1 = 0; 6335 if ( expDiff < 0 ) { 6336 if ( mod || expDiff < -1 ) { 6337 if (aExp == 1 && aExpOrig == 0) { 6338 /* 6339 * Pseudo-denormal argument must be returned in 6340 * normalized form. 
6341 */ 6342 return packFloatx80(aSign, aExp, aSig0); 6343 } 6344 return a; 6345 } 6346 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 6347 expDiff = 0; 6348 } 6349 *quotient = q = ( bSig <= aSig0 ); 6350 if ( q ) aSig0 -= bSig; 6351 expDiff -= 64; 6352 while ( 0 < expDiff ) { 6353 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6354 q = ( 2 < q ) ? q - 2 : 0; 6355 mul64To128( bSig, q, &term0, &term1 ); 6356 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6357 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 6358 expDiff -= 62; 6359 *quotient <<= 62; 6360 *quotient += q; 6361 } 6362 expDiff += 64; 6363 if ( 0 < expDiff ) { 6364 q = estimateDiv128To64( aSig0, aSig1, bSig ); 6365 q = ( 2 < q ) ? q - 2 : 0; 6366 q >>= 64 - expDiff; 6367 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 6368 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6369 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 6370 while ( le128( term0, term1, aSig0, aSig1 ) ) { 6371 ++q; 6372 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 6373 } 6374 if (expDiff < 64) { 6375 *quotient <<= expDiff; 6376 } else { 6377 *quotient = 0; 6378 } 6379 *quotient += q; 6380 } 6381 else { 6382 term1 = 0; 6383 term0 = bSig; 6384 } 6385 if (!mod) { 6386 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 6387 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6388 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 6389 && ( q & 1 ) ) 6390 ) { 6391 aSig0 = alternateASig0; 6392 aSig1 = alternateASig1; 6393 zSign = ! zSign; 6394 ++*quotient; 6395 } 6396 } 6397 return 6398 normalizeRoundAndPackFloatx80( 6399 80, zSign, bExp + expDiff, aSig0, aSig1, status); 6400 6401 } 6402 6403 /*---------------------------------------------------------------------------- 6404 | Returns the remainder of the extended double-precision floating-point value 6405 | `a' with respect to the corresponding value `b'. 
The operation is performed 6406 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6407 *----------------------------------------------------------------------------*/ 6408 6409 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 6410 { 6411 uint64_t quotient; 6412 return floatx80_modrem(a, b, false, "ient, status); 6413 } 6414 6415 /*---------------------------------------------------------------------------- 6416 | Returns the remainder of the extended double-precision floating-point value 6417 | `a' with respect to the corresponding value `b', with the quotient truncated 6418 | toward zero. 6419 *----------------------------------------------------------------------------*/ 6420 6421 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) 6422 { 6423 uint64_t quotient; 6424 return floatx80_modrem(a, b, true, "ient, status); 6425 } 6426 6427 /*---------------------------------------------------------------------------- 6428 | Returns the square root of the extended double-precision floating-point 6429 | value `a'. The operation is performed according to the IEC/IEEE Standard 6430 | for Binary Floating-Point Arithmetic. 6431 *----------------------------------------------------------------------------*/ 6432 6433 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 6434 { 6435 bool aSign; 6436 int32_t aExp, zExp; 6437 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 6438 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6439 6440 if (floatx80_invalid_encoding(a)) { 6441 float_raise(float_flag_invalid, status); 6442 return floatx80_default_nan(status); 6443 } 6444 aSig0 = extractFloatx80Frac( a ); 6445 aExp = extractFloatx80Exp( a ); 6446 aSign = extractFloatx80Sign( a ); 6447 if ( aExp == 0x7FFF ) { 6448 if ((uint64_t)(aSig0 << 1)) { 6449 return propagateFloatx80NaN(a, a, status); 6450 } 6451 if ( ! 
aSign ) return a; 6452 goto invalid; 6453 } 6454 if ( aSign ) { 6455 if ( ( aExp | aSig0 ) == 0 ) return a; 6456 invalid: 6457 float_raise(float_flag_invalid, status); 6458 return floatx80_default_nan(status); 6459 } 6460 if ( aExp == 0 ) { 6461 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 6462 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 6463 } 6464 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 6465 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 6466 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 6467 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6468 doubleZSig0 = zSig0<<1; 6469 mul64To128( zSig0, zSig0, &term0, &term1 ); 6470 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6471 while ( (int64_t) rem0 < 0 ) { 6472 --zSig0; 6473 doubleZSig0 -= 2; 6474 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6475 } 6476 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6477 if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) { 6478 if ( zSig1 == 0 ) zSig1 = 1; 6479 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6480 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6481 mul64To128( zSig1, zSig1, &term2, &term3 ); 6482 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6483 while ( (int64_t) rem1 < 0 ) { 6484 --zSig1; 6485 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6486 term3 |= 1; 6487 term2 |= doubleZSig0; 6488 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6489 } 6490 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6491 } 6492 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 6493 zSig0 |= doubleZSig0; 6494 return roundAndPackFloatx80(status->floatx80_rounding_precision, 6495 0, zExp, zSig0, zSig1, status); 6496 } 6497 6498 /*---------------------------------------------------------------------------- 6499 | Returns the result of converting the quadruple-precision floating-point 6500 | value `a' to the 32-bit two's complement integer format. 
The conversion 6501 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6502 | Arithmetic---which means in particular that the conversion is rounded 6503 | according to the current rounding mode. If `a' is a NaN, the largest 6504 | positive integer is returned. Otherwise, if the conversion overflows, the 6505 | largest integer with the same sign as `a' is returned. 6506 *----------------------------------------------------------------------------*/ 6507 6508 int32_t float128_to_int32(float128 a, float_status *status) 6509 { 6510 bool aSign; 6511 int32_t aExp, shiftCount; 6512 uint64_t aSig0, aSig1; 6513 6514 aSig1 = extractFloat128Frac1( a ); 6515 aSig0 = extractFloat128Frac0( a ); 6516 aExp = extractFloat128Exp( a ); 6517 aSign = extractFloat128Sign( a ); 6518 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6519 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6520 aSig0 |= ( aSig1 != 0 ); 6521 shiftCount = 0x4028 - aExp; 6522 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6523 return roundAndPackInt32(aSign, aSig0, status); 6524 6525 } 6526 6527 /*---------------------------------------------------------------------------- 6528 | Returns the result of converting the quadruple-precision floating-point 6529 | value `a' to the 32-bit two's complement integer format. The conversion 6530 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6531 | Arithmetic, except that the conversion is always rounded toward zero. If 6532 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6533 | conversion overflows, the largest integer with the same sign as `a' is 6534 | returned. 
6535 *----------------------------------------------------------------------------*/ 6536 6537 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6538 { 6539 bool aSign; 6540 int32_t aExp, shiftCount; 6541 uint64_t aSig0, aSig1, savedASig; 6542 int32_t z; 6543 6544 aSig1 = extractFloat128Frac1( a ); 6545 aSig0 = extractFloat128Frac0( a ); 6546 aExp = extractFloat128Exp( a ); 6547 aSign = extractFloat128Sign( a ); 6548 aSig0 |= ( aSig1 != 0 ); 6549 if ( 0x401E < aExp ) { 6550 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6551 goto invalid; 6552 } 6553 else if ( aExp < 0x3FFF ) { 6554 if (aExp || aSig0) { 6555 float_raise(float_flag_inexact, status); 6556 } 6557 return 0; 6558 } 6559 aSig0 |= UINT64_C(0x0001000000000000); 6560 shiftCount = 0x402F - aExp; 6561 savedASig = aSig0; 6562 aSig0 >>= shiftCount; 6563 z = aSig0; 6564 if ( aSign ) z = - z; 6565 if ( ( z < 0 ) ^ aSign ) { 6566 invalid: 6567 float_raise(float_flag_invalid, status); 6568 return aSign ? INT32_MIN : INT32_MAX; 6569 } 6570 if ( ( aSig0<<shiftCount ) != savedASig ) { 6571 float_raise(float_flag_inexact, status); 6572 } 6573 return z; 6574 6575 } 6576 6577 /*---------------------------------------------------------------------------- 6578 | Returns the result of converting the quadruple-precision floating-point 6579 | value `a' to the 64-bit two's complement integer format. The conversion 6580 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6581 | Arithmetic---which means in particular that the conversion is rounded 6582 | according to the current rounding mode. If `a' is a NaN, the largest 6583 | positive integer is returned. Otherwise, if the conversion overflows, the 6584 | largest integer with the same sign as `a' is returned. 
6585 *----------------------------------------------------------------------------*/ 6586 6587 int64_t float128_to_int64(float128 a, float_status *status) 6588 { 6589 bool aSign; 6590 int32_t aExp, shiftCount; 6591 uint64_t aSig0, aSig1; 6592 6593 aSig1 = extractFloat128Frac1( a ); 6594 aSig0 = extractFloat128Frac0( a ); 6595 aExp = extractFloat128Exp( a ); 6596 aSign = extractFloat128Sign( a ); 6597 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6598 shiftCount = 0x402F - aExp; 6599 if ( shiftCount <= 0 ) { 6600 if ( 0x403E < aExp ) { 6601 float_raise(float_flag_invalid, status); 6602 if ( ! aSign 6603 || ( ( aExp == 0x7FFF ) 6604 && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) ) 6605 ) 6606 ) { 6607 return INT64_MAX; 6608 } 6609 return INT64_MIN; 6610 } 6611 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6612 } 6613 else { 6614 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6615 } 6616 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6617 6618 } 6619 6620 /*---------------------------------------------------------------------------- 6621 | Returns the result of converting the quadruple-precision floating-point 6622 | value `a' to the 64-bit two's complement integer format. The conversion 6623 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6624 | Arithmetic, except that the conversion is always rounded toward zero. 6625 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6626 | the conversion overflows, the largest integer with the same sign as `a' is 6627 | returned. 
6628 *----------------------------------------------------------------------------*/ 6629 6630 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6631 { 6632 bool aSign; 6633 int32_t aExp, shiftCount; 6634 uint64_t aSig0, aSig1; 6635 int64_t z; 6636 6637 aSig1 = extractFloat128Frac1( a ); 6638 aSig0 = extractFloat128Frac0( a ); 6639 aExp = extractFloat128Exp( a ); 6640 aSign = extractFloat128Sign( a ); 6641 if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000); 6642 shiftCount = aExp - 0x402F; 6643 if ( 0 < shiftCount ) { 6644 if ( 0x403E <= aExp ) { 6645 aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF); 6646 if ( ( a.high == UINT64_C(0xC03E000000000000) ) 6647 && ( aSig1 < UINT64_C(0x0002000000000000) ) ) { 6648 if (aSig1) { 6649 float_raise(float_flag_inexact, status); 6650 } 6651 } 6652 else { 6653 float_raise(float_flag_invalid, status); 6654 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6655 return INT64_MAX; 6656 } 6657 } 6658 return INT64_MIN; 6659 } 6660 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6661 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6662 float_raise(float_flag_inexact, status); 6663 } 6664 } 6665 else { 6666 if ( aExp < 0x3FFF ) { 6667 if ( aExp | aSig0 | aSig1 ) { 6668 float_raise(float_flag_inexact, status); 6669 } 6670 return 0; 6671 } 6672 z = aSig0>>( - shiftCount ); 6673 if ( aSig1 6674 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6675 float_raise(float_flag_inexact, status); 6676 } 6677 } 6678 if ( aSign ) z = - z; 6679 return z; 6680 6681 } 6682 6683 /*---------------------------------------------------------------------------- 6684 | Returns the result of converting the quadruple-precision floating-point value 6685 | `a' to the 64-bit unsigned integer format. 
The conversion is 6686 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6687 | Arithmetic---which means in particular that the conversion is rounded 6688 | according to the current rounding mode. If `a' is a NaN, the largest 6689 | positive integer is returned. If the conversion overflows, the 6690 | largest unsigned integer is returned. If 'a' is negative, the value is 6691 | rounded and zero is returned; negative values that do not round to zero 6692 | will raise the inexact exception. 6693 *----------------------------------------------------------------------------*/ 6694 6695 uint64_t float128_to_uint64(float128 a, float_status *status) 6696 { 6697 bool aSign; 6698 int aExp; 6699 int shiftCount; 6700 uint64_t aSig0, aSig1; 6701 6702 aSig0 = extractFloat128Frac0(a); 6703 aSig1 = extractFloat128Frac1(a); 6704 aExp = extractFloat128Exp(a); 6705 aSign = extractFloat128Sign(a); 6706 if (aSign && (aExp > 0x3FFE)) { 6707 float_raise(float_flag_invalid, status); 6708 if (float128_is_any_nan(a)) { 6709 return UINT64_MAX; 6710 } else { 6711 return 0; 6712 } 6713 } 6714 if (aExp) { 6715 aSig0 |= UINT64_C(0x0001000000000000); 6716 } 6717 shiftCount = 0x402F - aExp; 6718 if (shiftCount <= 0) { 6719 if (0x403E < aExp) { 6720 float_raise(float_flag_invalid, status); 6721 return UINT64_MAX; 6722 } 6723 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6724 } else { 6725 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6726 } 6727 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6728 } 6729 6730 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6731 { 6732 uint64_t v; 6733 signed char current_rounding_mode = status->float_rounding_mode; 6734 6735 set_float_rounding_mode(float_round_to_zero, status); 6736 v = float128_to_uint64(a, status); 6737 set_float_rounding_mode(current_rounding_mode, status); 6738 6739 return v; 6740 } 6741 6742 
/*---------------------------------------------------------------------------- 6743 | Returns the result of converting the quadruple-precision floating-point 6744 | value `a' to the 32-bit unsigned integer format. The conversion 6745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6746 | Arithmetic except that the conversion is always rounded toward zero. 6747 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6748 | if the conversion overflows, the largest unsigned integer is returned. 6749 | If 'a' is negative, the value is rounded and zero is returned; negative 6750 | values that do not round to zero will raise the inexact exception. 6751 *----------------------------------------------------------------------------*/ 6752 6753 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6754 { 6755 uint64_t v; 6756 uint32_t res; 6757 int old_exc_flags = get_float_exception_flags(status); 6758 6759 v = float128_to_uint64_round_to_zero(a, status); 6760 if (v > 0xffffffff) { 6761 res = 0xffffffff; 6762 } else { 6763 return v; 6764 } 6765 set_float_exception_flags(old_exc_flags, status); 6766 float_raise(float_flag_invalid, status); 6767 return res; 6768 } 6769 6770 /*---------------------------------------------------------------------------- 6771 | Returns the result of converting the quadruple-precision floating-point value 6772 | `a' to the 32-bit unsigned integer format. The conversion is 6773 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6774 | Arithmetic---which means in particular that the conversion is rounded 6775 | according to the current rounding mode. If `a' is a NaN, the largest 6776 | positive integer is returned. If the conversion overflows, the 6777 | largest unsigned integer is returned. If 'a' is negative, the value is 6778 | rounded and zero is returned; negative values that do not round to zero 6779 | will raise the inexact exception. 
6780 *----------------------------------------------------------------------------*/ 6781 6782 uint32_t float128_to_uint32(float128 a, float_status *status) 6783 { 6784 uint64_t v; 6785 uint32_t res; 6786 int old_exc_flags = get_float_exception_flags(status); 6787 6788 v = float128_to_uint64(a, status); 6789 if (v > 0xffffffff) { 6790 res = 0xffffffff; 6791 } else { 6792 return v; 6793 } 6794 set_float_exception_flags(old_exc_flags, status); 6795 float_raise(float_flag_invalid, status); 6796 return res; 6797 } 6798 6799 /*---------------------------------------------------------------------------- 6800 | Returns the result of converting the quadruple-precision floating-point 6801 | value `a' to the single-precision floating-point format. The conversion 6802 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6803 | Arithmetic. 6804 *----------------------------------------------------------------------------*/ 6805 6806 float32 float128_to_float32(float128 a, float_status *status) 6807 { 6808 bool aSign; 6809 int32_t aExp; 6810 uint64_t aSig0, aSig1; 6811 uint32_t zSig; 6812 6813 aSig1 = extractFloat128Frac1( a ); 6814 aSig0 = extractFloat128Frac0( a ); 6815 aExp = extractFloat128Exp( a ); 6816 aSign = extractFloat128Sign( a ); 6817 if ( aExp == 0x7FFF ) { 6818 if ( aSig0 | aSig1 ) { 6819 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6820 } 6821 return packFloat32( aSign, 0xFF, 0 ); 6822 } 6823 aSig0 |= ( aSig1 != 0 ); 6824 shift64RightJamming( aSig0, 18, &aSig0 ); 6825 zSig = aSig0; 6826 if ( aExp || zSig ) { 6827 zSig |= 0x40000000; 6828 aExp -= 0x3F81; 6829 } 6830 return roundAndPackFloat32(aSign, aExp, zSig, status); 6831 6832 } 6833 6834 /*---------------------------------------------------------------------------- 6835 | Returns the result of converting the quadruple-precision floating-point 6836 | value `a' to the double-precision floating-point format. 
The conversion 6837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6838 | Arithmetic. 6839 *----------------------------------------------------------------------------*/ 6840 6841 float64 float128_to_float64(float128 a, float_status *status) 6842 { 6843 bool aSign; 6844 int32_t aExp; 6845 uint64_t aSig0, aSig1; 6846 6847 aSig1 = extractFloat128Frac1( a ); 6848 aSig0 = extractFloat128Frac0( a ); 6849 aExp = extractFloat128Exp( a ); 6850 aSign = extractFloat128Sign( a ); 6851 if ( aExp == 0x7FFF ) { 6852 if ( aSig0 | aSig1 ) { 6853 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6854 } 6855 return packFloat64( aSign, 0x7FF, 0 ); 6856 } 6857 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6858 aSig0 |= ( aSig1 != 0 ); 6859 if ( aExp || aSig0 ) { 6860 aSig0 |= UINT64_C(0x4000000000000000); 6861 aExp -= 0x3C01; 6862 } 6863 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6864 6865 } 6866 6867 /*---------------------------------------------------------------------------- 6868 | Returns the result of converting the quadruple-precision floating-point 6869 | value `a' to the extended double-precision floating-point format. The 6870 | conversion is performed according to the IEC/IEEE Standard for Binary 6871 | Floating-Point Arithmetic. 
6872 *----------------------------------------------------------------------------*/ 6873 6874 floatx80 float128_to_floatx80(float128 a, float_status *status) 6875 { 6876 bool aSign; 6877 int32_t aExp; 6878 uint64_t aSig0, aSig1; 6879 6880 aSig1 = extractFloat128Frac1( a ); 6881 aSig0 = extractFloat128Frac0( a ); 6882 aExp = extractFloat128Exp( a ); 6883 aSign = extractFloat128Sign( a ); 6884 if ( aExp == 0x7FFF ) { 6885 if ( aSig0 | aSig1 ) { 6886 floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status), 6887 status); 6888 return floatx80_silence_nan(res, status); 6889 } 6890 return packFloatx80(aSign, floatx80_infinity_high, 6891 floatx80_infinity_low); 6892 } 6893 if ( aExp == 0 ) { 6894 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6895 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6896 } 6897 else { 6898 aSig0 |= UINT64_C(0x0001000000000000); 6899 } 6900 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6901 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6902 6903 } 6904 6905 /*---------------------------------------------------------------------------- 6906 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6907 | returns the result as a quadruple-precision floating-point value. The 6908 | operation is performed according to the IEC/IEEE Standard for Binary 6909 | Floating-Point Arithmetic. 
6910 *----------------------------------------------------------------------------*/ 6911 6912 float128 float128_round_to_int(float128 a, float_status *status) 6913 { 6914 bool aSign; 6915 int32_t aExp; 6916 uint64_t lastBitMask, roundBitsMask; 6917 float128 z; 6918 6919 aExp = extractFloat128Exp( a ); 6920 if ( 0x402F <= aExp ) { 6921 if ( 0x406F <= aExp ) { 6922 if ( ( aExp == 0x7FFF ) 6923 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6924 ) { 6925 return propagateFloat128NaN(a, a, status); 6926 } 6927 return a; 6928 } 6929 lastBitMask = 1; 6930 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6931 roundBitsMask = lastBitMask - 1; 6932 z = a; 6933 switch (status->float_rounding_mode) { 6934 case float_round_nearest_even: 6935 if ( lastBitMask ) { 6936 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6937 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6938 } 6939 else { 6940 if ( (int64_t) z.low < 0 ) { 6941 ++z.high; 6942 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6943 } 6944 } 6945 break; 6946 case float_round_ties_away: 6947 if (lastBitMask) { 6948 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6949 } else { 6950 if ((int64_t) z.low < 0) { 6951 ++z.high; 6952 } 6953 } 6954 break; 6955 case float_round_to_zero: 6956 break; 6957 case float_round_up: 6958 if (!extractFloat128Sign(z)) { 6959 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6960 } 6961 break; 6962 case float_round_down: 6963 if (extractFloat128Sign(z)) { 6964 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6965 } 6966 break; 6967 case float_round_to_odd: 6968 /* 6969 * Note that if lastBitMask == 0, the last bit is the lsb 6970 * of high, and roundBitsMask == -1. 6971 */ 6972 if ((lastBitMask ? 
z.low & lastBitMask : z.high & 1) == 0) { 6973 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6974 } 6975 break; 6976 default: 6977 abort(); 6978 } 6979 z.low &= ~ roundBitsMask; 6980 } 6981 else { 6982 if ( aExp < 0x3FFF ) { 6983 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6984 float_raise(float_flag_inexact, status); 6985 aSign = extractFloat128Sign( a ); 6986 switch (status->float_rounding_mode) { 6987 case float_round_nearest_even: 6988 if ( ( aExp == 0x3FFE ) 6989 && ( extractFloat128Frac0( a ) 6990 | extractFloat128Frac1( a ) ) 6991 ) { 6992 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6993 } 6994 break; 6995 case float_round_ties_away: 6996 if (aExp == 0x3FFE) { 6997 return packFloat128(aSign, 0x3FFF, 0, 0); 6998 } 6999 break; 7000 case float_round_down: 7001 return 7002 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 7003 : packFloat128( 0, 0, 0, 0 ); 7004 case float_round_up: 7005 return 7006 aSign ? packFloat128( 1, 0, 0, 0 ) 7007 : packFloat128( 0, 0x3FFF, 0, 0 ); 7008 7009 case float_round_to_odd: 7010 return packFloat128(aSign, 0x3FFF, 0, 0); 7011 7012 case float_round_to_zero: 7013 break; 7014 } 7015 return packFloat128( aSign, 0, 0, 0 ); 7016 } 7017 lastBitMask = 1; 7018 lastBitMask <<= 0x402F - aExp; 7019 roundBitsMask = lastBitMask - 1; 7020 z.low = 0; 7021 z.high = a.high; 7022 switch (status->float_rounding_mode) { 7023 case float_round_nearest_even: 7024 z.high += lastBitMask>>1; 7025 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 7026 z.high &= ~ lastBitMask; 7027 } 7028 break; 7029 case float_round_ties_away: 7030 z.high += lastBitMask>>1; 7031 break; 7032 case float_round_to_zero: 7033 break; 7034 case float_round_up: 7035 if (!extractFloat128Sign(z)) { 7036 z.high |= ( a.low != 0 ); 7037 z.high += roundBitsMask; 7038 } 7039 break; 7040 case float_round_down: 7041 if (extractFloat128Sign(z)) { 7042 z.high |= (a.low != 0); 7043 z.high += roundBitsMask; 7044 } 7045 break; 7046 case float_round_to_odd: 7047 if ((z.high 
& lastBitMask) == 0) { 7048 z.high |= (a.low != 0); 7049 z.high += roundBitsMask; 7050 } 7051 break; 7052 default: 7053 abort(); 7054 } 7055 z.high &= ~ roundBitsMask; 7056 } 7057 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 7058 float_raise(float_flag_inexact, status); 7059 } 7060 return z; 7061 7062 } 7063 7064 /*---------------------------------------------------------------------------- 7065 | Returns the result of adding the absolute values of the quadruple-precision 7066 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 7067 | before being returned. `zSign' is ignored if the result is a NaN. 7068 | The addition is performed according to the IEC/IEEE Standard for Binary 7069 | Floating-Point Arithmetic. 7070 *----------------------------------------------------------------------------*/ 7071 7072 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign, 7073 float_status *status) 7074 { 7075 int32_t aExp, bExp, zExp; 7076 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7077 int32_t expDiff; 7078 7079 aSig1 = extractFloat128Frac1( a ); 7080 aSig0 = extractFloat128Frac0( a ); 7081 aExp = extractFloat128Exp( a ); 7082 bSig1 = extractFloat128Frac1( b ); 7083 bSig0 = extractFloat128Frac0( b ); 7084 bExp = extractFloat128Exp( b ); 7085 expDiff = aExp - bExp; 7086 if ( 0 < expDiff ) { 7087 if ( aExp == 0x7FFF ) { 7088 if (aSig0 | aSig1) { 7089 return propagateFloat128NaN(a, b, status); 7090 } 7091 return a; 7092 } 7093 if ( bExp == 0 ) { 7094 --expDiff; 7095 } 7096 else { 7097 bSig0 |= UINT64_C(0x0001000000000000); 7098 } 7099 shift128ExtraRightJamming( 7100 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 7101 zExp = aExp; 7102 } 7103 else if ( expDiff < 0 ) { 7104 if ( bExp == 0x7FFF ) { 7105 if (bSig0 | bSig1) { 7106 return propagateFloat128NaN(a, b, status); 7107 } 7108 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7109 } 7110 if ( aExp == 0 ) { 7111 ++expDiff; 7112 } 7113 else { 7114 aSig0 |= 
UINT64_C(0x0001000000000000); 7115 } 7116 shift128ExtraRightJamming( 7117 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 7118 zExp = bExp; 7119 } 7120 else { 7121 if ( aExp == 0x7FFF ) { 7122 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7123 return propagateFloat128NaN(a, b, status); 7124 } 7125 return a; 7126 } 7127 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7128 if ( aExp == 0 ) { 7129 if (status->flush_to_zero) { 7130 if (zSig0 | zSig1) { 7131 float_raise(float_flag_output_denormal, status); 7132 } 7133 return packFloat128(zSign, 0, 0, 0); 7134 } 7135 return packFloat128( zSign, 0, zSig0, zSig1 ); 7136 } 7137 zSig2 = 0; 7138 zSig0 |= UINT64_C(0x0002000000000000); 7139 zExp = aExp; 7140 goto shiftRight1; 7141 } 7142 aSig0 |= UINT64_C(0x0001000000000000); 7143 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7144 --zExp; 7145 if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack; 7146 ++zExp; 7147 shiftRight1: 7148 shift128ExtraRightJamming( 7149 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7150 roundAndPack: 7151 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7152 7153 } 7154 7155 /*---------------------------------------------------------------------------- 7156 | Returns the result of subtracting the absolute values of the quadruple- 7157 | precision floating-point values `a' and `b'. If `zSign' is 1, the 7158 | difference is negated before being returned. `zSign' is ignored if the 7159 | result is a NaN. The subtraction is performed according to the IEC/IEEE 7160 | Standard for Binary Floating-Point Arithmetic. 
7161 *----------------------------------------------------------------------------*/ 7162 7163 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign, 7164 float_status *status) 7165 { 7166 int32_t aExp, bExp, zExp; 7167 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 7168 int32_t expDiff; 7169 7170 aSig1 = extractFloat128Frac1( a ); 7171 aSig0 = extractFloat128Frac0( a ); 7172 aExp = extractFloat128Exp( a ); 7173 bSig1 = extractFloat128Frac1( b ); 7174 bSig0 = extractFloat128Frac0( b ); 7175 bExp = extractFloat128Exp( b ); 7176 expDiff = aExp - bExp; 7177 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 7178 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 7179 if ( 0 < expDiff ) goto aExpBigger; 7180 if ( expDiff < 0 ) goto bExpBigger; 7181 if ( aExp == 0x7FFF ) { 7182 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 7183 return propagateFloat128NaN(a, b, status); 7184 } 7185 float_raise(float_flag_invalid, status); 7186 return float128_default_nan(status); 7187 } 7188 if ( aExp == 0 ) { 7189 aExp = 1; 7190 bExp = 1; 7191 } 7192 if ( bSig0 < aSig0 ) goto aBigger; 7193 if ( aSig0 < bSig0 ) goto bBigger; 7194 if ( bSig1 < aSig1 ) goto aBigger; 7195 if ( aSig1 < bSig1 ) goto bBigger; 7196 return packFloat128(status->float_rounding_mode == float_round_down, 7197 0, 0, 0); 7198 bExpBigger: 7199 if ( bExp == 0x7FFF ) { 7200 if (bSig0 | bSig1) { 7201 return propagateFloat128NaN(a, b, status); 7202 } 7203 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 7204 } 7205 if ( aExp == 0 ) { 7206 ++expDiff; 7207 } 7208 else { 7209 aSig0 |= UINT64_C(0x4000000000000000); 7210 } 7211 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7212 bSig0 |= UINT64_C(0x4000000000000000); 7213 bBigger: 7214 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7215 zExp = bExp; 7216 zSign ^= 1; 7217 goto normalizeRoundAndPack; 7218 aExpBigger: 7219 if ( aExp == 0x7FFF ) { 7220 if (aSig0 | aSig1) { 7221 return propagateFloat128NaN(a, b, status); 7222 } 7223 return 
a; 7224 } 7225 if ( bExp == 0 ) { 7226 --expDiff; 7227 } 7228 else { 7229 bSig0 |= UINT64_C(0x4000000000000000); 7230 } 7231 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 7232 aSig0 |= UINT64_C(0x4000000000000000); 7233 aBigger: 7234 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 7235 zExp = aExp; 7236 normalizeRoundAndPack: 7237 --zExp; 7238 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 7239 status); 7240 7241 } 7242 7243 /*---------------------------------------------------------------------------- 7244 | Returns the result of adding the quadruple-precision floating-point values 7245 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 7246 | for Binary Floating-Point Arithmetic. 7247 *----------------------------------------------------------------------------*/ 7248 7249 float128 float128_add(float128 a, float128 b, float_status *status) 7250 { 7251 bool aSign, bSign; 7252 7253 aSign = extractFloat128Sign( a ); 7254 bSign = extractFloat128Sign( b ); 7255 if ( aSign == bSign ) { 7256 return addFloat128Sigs(a, b, aSign, status); 7257 } 7258 else { 7259 return subFloat128Sigs(a, b, aSign, status); 7260 } 7261 7262 } 7263 7264 /*---------------------------------------------------------------------------- 7265 | Returns the result of subtracting the quadruple-precision floating-point 7266 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7267 | Standard for Binary Floating-Point Arithmetic. 
7268 *----------------------------------------------------------------------------*/ 7269 7270 float128 float128_sub(float128 a, float128 b, float_status *status) 7271 { 7272 bool aSign, bSign; 7273 7274 aSign = extractFloat128Sign( a ); 7275 bSign = extractFloat128Sign( b ); 7276 if ( aSign == bSign ) { 7277 return subFloat128Sigs(a, b, aSign, status); 7278 } 7279 else { 7280 return addFloat128Sigs(a, b, aSign, status); 7281 } 7282 7283 } 7284 7285 /*---------------------------------------------------------------------------- 7286 | Returns the result of multiplying the quadruple-precision floating-point 7287 | values `a' and `b'. The operation is performed according to the IEC/IEEE 7288 | Standard for Binary Floating-Point Arithmetic. 7289 *----------------------------------------------------------------------------*/ 7290 7291 float128 float128_mul(float128 a, float128 b, float_status *status) 7292 { 7293 bool aSign, bSign, zSign; 7294 int32_t aExp, bExp, zExp; 7295 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 7296 7297 aSig1 = extractFloat128Frac1( a ); 7298 aSig0 = extractFloat128Frac0( a ); 7299 aExp = extractFloat128Exp( a ); 7300 aSign = extractFloat128Sign( a ); 7301 bSig1 = extractFloat128Frac1( b ); 7302 bSig0 = extractFloat128Frac0( b ); 7303 bExp = extractFloat128Exp( b ); 7304 bSign = extractFloat128Sign( b ); 7305 zSign = aSign ^ bSign; 7306 if ( aExp == 0x7FFF ) { 7307 if ( ( aSig0 | aSig1 ) 7308 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7309 return propagateFloat128NaN(a, b, status); 7310 } 7311 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 7312 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7313 } 7314 if ( bExp == 0x7FFF ) { 7315 if (bSig0 | bSig1) { 7316 return propagateFloat128NaN(a, b, status); 7317 } 7318 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7319 invalid: 7320 float_raise(float_flag_invalid, status); 7321 return float128_default_nan(status); 7322 } 7323 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7324 } 7325 
if ( aExp == 0 ) { 7326 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7327 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7328 } 7329 if ( bExp == 0 ) { 7330 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7331 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7332 } 7333 zExp = aExp + bExp - 0x4000; 7334 aSig0 |= UINT64_C(0x0001000000000000); 7335 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 7336 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 7337 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 7338 zSig2 |= ( zSig3 != 0 ); 7339 if (UINT64_C( 0x0002000000000000) <= zSig0 ) { 7340 shift128ExtraRightJamming( 7341 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 7342 ++zExp; 7343 } 7344 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7345 7346 } 7347 7348 /*---------------------------------------------------------------------------- 7349 | Returns the result of dividing the quadruple-precision floating-point value 7350 | `a' by the corresponding value `b'. The operation is performed according to 7351 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7352 *----------------------------------------------------------------------------*/ 7353 7354 float128 float128_div(float128 a, float128 b, float_status *status) 7355 { 7356 bool aSign, bSign, zSign; 7357 int32_t aExp, bExp, zExp; 7358 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 7359 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7360 7361 aSig1 = extractFloat128Frac1( a ); 7362 aSig0 = extractFloat128Frac0( a ); 7363 aExp = extractFloat128Exp( a ); 7364 aSign = extractFloat128Sign( a ); 7365 bSig1 = extractFloat128Frac1( b ); 7366 bSig0 = extractFloat128Frac0( b ); 7367 bExp = extractFloat128Exp( b ); 7368 bSign = extractFloat128Sign( b ); 7369 zSign = aSign ^ bSign; 7370 if ( aExp == 0x7FFF ) { 7371 if (aSig0 | aSig1) { 7372 return propagateFloat128NaN(a, b, status); 7373 } 7374 if ( bExp == 0x7FFF ) { 7375 if (bSig0 | bSig1) { 7376 return propagateFloat128NaN(a, b, status); 7377 } 7378 goto invalid; 7379 } 7380 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7381 } 7382 if ( bExp == 0x7FFF ) { 7383 if (bSig0 | bSig1) { 7384 return propagateFloat128NaN(a, b, status); 7385 } 7386 return packFloat128( zSign, 0, 0, 0 ); 7387 } 7388 if ( bExp == 0 ) { 7389 if ( ( bSig0 | bSig1 ) == 0 ) { 7390 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 7391 invalid: 7392 float_raise(float_flag_invalid, status); 7393 return float128_default_nan(status); 7394 } 7395 float_raise(float_flag_divbyzero, status); 7396 return packFloat128( zSign, 0x7FFF, 0, 0 ); 7397 } 7398 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7399 } 7400 if ( aExp == 0 ) { 7401 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 7402 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7403 } 7404 zExp = aExp - bExp + 0x3FFD; 7405 shortShift128Left( 7406 aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 ); 7407 shortShift128Left( 7408 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7409 if ( le128( bSig0, bSig1, 
aSig0, aSig1 ) ) { 7410 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 7411 ++zExp; 7412 } 7413 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7414 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 7415 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 7416 while ( (int64_t) rem0 < 0 ) { 7417 --zSig0; 7418 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 7419 } 7420 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 7421 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 7422 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 7423 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 7424 while ( (int64_t) rem1 < 0 ) { 7425 --zSig1; 7426 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 7427 } 7428 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7429 } 7430 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 7431 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 7432 7433 } 7434 7435 /*---------------------------------------------------------------------------- 7436 | Returns the remainder of the quadruple-precision floating-point value `a' 7437 | with respect to the corresponding value `b'. The operation is performed 7438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7439 *----------------------------------------------------------------------------*/ 7440 7441 float128 float128_rem(float128 a, float128 b, float_status *status) 7442 { 7443 bool aSign, zSign; 7444 int32_t aExp, bExp, expDiff; 7445 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 7446 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 7447 int64_t sigMean0; 7448 7449 aSig1 = extractFloat128Frac1( a ); 7450 aSig0 = extractFloat128Frac0( a ); 7451 aExp = extractFloat128Exp( a ); 7452 aSign = extractFloat128Sign( a ); 7453 bSig1 = extractFloat128Frac1( b ); 7454 bSig0 = extractFloat128Frac0( b ); 7455 bExp = extractFloat128Exp( b ); 7456 if ( aExp == 0x7FFF ) { 7457 if ( ( aSig0 | aSig1 ) 7458 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 7459 return propagateFloat128NaN(a, b, status); 7460 } 7461 goto invalid; 7462 } 7463 if ( bExp == 0x7FFF ) { 7464 if (bSig0 | bSig1) { 7465 return propagateFloat128NaN(a, b, status); 7466 } 7467 return a; 7468 } 7469 if ( bExp == 0 ) { 7470 if ( ( bSig0 | bSig1 ) == 0 ) { 7471 invalid: 7472 float_raise(float_flag_invalid, status); 7473 return float128_default_nan(status); 7474 } 7475 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 7476 } 7477 if ( aExp == 0 ) { 7478 if ( ( aSig0 | aSig1 ) == 0 ) return a; 7479 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7480 } 7481 expDiff = aExp - bExp; 7482 if ( expDiff < -1 ) return a; 7483 shortShift128Left( 7484 aSig0 | UINT64_C(0x0001000000000000), 7485 aSig1, 7486 15 - ( expDiff < 0 ), 7487 &aSig0, 7488 &aSig1 7489 ); 7490 shortShift128Left( 7491 bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 ); 7492 q = le128( bSig0, bSig1, aSig0, aSig1 ); 7493 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7494 expDiff -= 64; 7495 while ( 0 < expDiff ) { 7496 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7497 q = ( 4 < q ) ? 
q - 4 : 0; 7498 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7499 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 7500 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 7501 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 7502 expDiff -= 61; 7503 } 7504 if ( -64 < expDiff ) { 7505 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7506 q = ( 4 < q ) ? q - 4 : 0; 7507 q >>= - expDiff; 7508 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7509 expDiff += 52; 7510 if ( expDiff < 0 ) { 7511 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7512 } 7513 else { 7514 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7515 } 7516 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7517 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7518 } 7519 else { 7520 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7521 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7522 } 7523 do { 7524 alternateASig0 = aSig0; 7525 alternateASig1 = aSig1; 7526 ++q; 7527 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7528 } while ( 0 <= (int64_t) aSig0 ); 7529 add128( 7530 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7531 if ( ( sigMean0 < 0 ) 7532 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7533 aSig0 = alternateASig0; 7534 aSig1 = alternateASig1; 7535 } 7536 zSign = ( (int64_t) aSig0 < 0 ); 7537 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7538 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7539 status); 7540 } 7541 7542 /*---------------------------------------------------------------------------- 7543 | Returns the square root of the quadruple-precision floating-point value `a'. 7544 | The operation is performed according to the IEC/IEEE Standard for Binary 7545 | Floating-Point Arithmetic. 
7546 *----------------------------------------------------------------------------*/ 7547 7548 float128 float128_sqrt(float128 a, float_status *status) 7549 { 7550 bool aSign; 7551 int32_t aExp, zExp; 7552 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7553 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7554 7555 aSig1 = extractFloat128Frac1( a ); 7556 aSig0 = extractFloat128Frac0( a ); 7557 aExp = extractFloat128Exp( a ); 7558 aSign = extractFloat128Sign( a ); 7559 if ( aExp == 0x7FFF ) { 7560 if (aSig0 | aSig1) { 7561 return propagateFloat128NaN(a, a, status); 7562 } 7563 if ( ! aSign ) return a; 7564 goto invalid; 7565 } 7566 if ( aSign ) { 7567 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7568 invalid: 7569 float_raise(float_flag_invalid, status); 7570 return float128_default_nan(status); 7571 } 7572 if ( aExp == 0 ) { 7573 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7574 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7575 } 7576 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7577 aSig0 |= UINT64_C(0x0001000000000000); 7578 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7579 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7580 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7581 doubleZSig0 = zSig0<<1; 7582 mul64To128( zSig0, zSig0, &term0, &term1 ); 7583 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7584 while ( (int64_t) rem0 < 0 ) { 7585 --zSig0; 7586 doubleZSig0 -= 2; 7587 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7588 } 7589 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7590 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7591 if ( zSig1 == 0 ) zSig1 = 1; 7592 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7593 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7594 mul64To128( zSig1, zSig1, &term2, &term3 ); 7595 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7596 while ( (int64_t) rem1 < 0 ) { 7597 --zSig1; 7598 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7599 term3 |= 1; 7600 term2 |= doubleZSig0; 7601 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7602 } 7603 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7604 } 7605 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7606 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7607 7608 } 7609 7610 static inline FloatRelation 7611 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet, 7612 float_status *status) 7613 { 7614 bool aSign, bSign; 7615 7616 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7617 float_raise(float_flag_invalid, status); 7618 return float_relation_unordered; 7619 } 7620 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7621 ( extractFloatx80Frac( a )<<1 ) ) || 7622 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7623 ( extractFloatx80Frac( b )<<1 ) )) { 7624 if (!is_quiet || 7625 floatx80_is_signaling_nan(a, status) || 7626 floatx80_is_signaling_nan(b, status)) { 7627 float_raise(float_flag_invalid, status); 7628 } 7629 return float_relation_unordered; 7630 } 7631 aSign = extractFloatx80Sign( a ); 7632 bSign = extractFloatx80Sign( b ); 7633 if ( aSign != bSign ) { 7634 7635 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7636 ( ( a.low | b.low ) == 0 ) ) { 7637 /* zero case */ 7638 return float_relation_equal; 7639 } else { 7640 return 1 - (2 * aSign); 7641 } 7642 } else { 7643 /* Normalize pseudo-denormals before comparison. 
*/ 7644 if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) { 7645 ++a.high; 7646 } 7647 if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) { 7648 ++b.high; 7649 } 7650 if (a.low == b.low && a.high == b.high) { 7651 return float_relation_equal; 7652 } else { 7653 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7654 } 7655 } 7656 } 7657 7658 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7659 { 7660 return floatx80_compare_internal(a, b, 0, status); 7661 } 7662 7663 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b, 7664 float_status *status) 7665 { 7666 return floatx80_compare_internal(a, b, 1, status); 7667 } 7668 7669 static inline FloatRelation 7670 float128_compare_internal(float128 a, float128 b, bool is_quiet, 7671 float_status *status) 7672 { 7673 bool aSign, bSign; 7674 7675 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7676 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7677 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7678 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7679 if (!is_quiet || 7680 float128_is_signaling_nan(a, status) || 7681 float128_is_signaling_nan(b, status)) { 7682 float_raise(float_flag_invalid, status); 7683 } 7684 return float_relation_unordered; 7685 } 7686 aSign = extractFloat128Sign( a ); 7687 bSign = extractFloat128Sign( b ); 7688 if ( aSign != bSign ) { 7689 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7690 /* zero case */ 7691 return float_relation_equal; 7692 } else { 7693 return 1 - (2 * aSign); 7694 } 7695 } else { 7696 if (a.low == b.low && a.high == b.high) { 7697 return float_relation_equal; 7698 } else { 7699 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7700 } 7701 } 7702 } 7703 7704 FloatRelation float128_compare(float128 a, float128 b, float_status *status) 7705 { 7706 return float128_compare_internal(a, b, 0, status); 7707 } 7708 7709 FloatRelation 
float128_compare_quiet(float128 a, float128 b, 7710 float_status *status) 7711 { 7712 return float128_compare_internal(a, b, 1, status); 7713 } 7714 7715 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7716 { 7717 bool aSign; 7718 int32_t aExp; 7719 uint64_t aSig; 7720 7721 if (floatx80_invalid_encoding(a)) { 7722 float_raise(float_flag_invalid, status); 7723 return floatx80_default_nan(status); 7724 } 7725 aSig = extractFloatx80Frac( a ); 7726 aExp = extractFloatx80Exp( a ); 7727 aSign = extractFloatx80Sign( a ); 7728 7729 if ( aExp == 0x7FFF ) { 7730 if ( aSig<<1 ) { 7731 return propagateFloatx80NaN(a, a, status); 7732 } 7733 return a; 7734 } 7735 7736 if (aExp == 0) { 7737 if (aSig == 0) { 7738 return a; 7739 } 7740 aExp++; 7741 } 7742 7743 if (n > 0x10000) { 7744 n = 0x10000; 7745 } else if (n < -0x10000) { 7746 n = -0x10000; 7747 } 7748 7749 aExp += n; 7750 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7751 aSign, aExp, aSig, 0, status); 7752 } 7753 7754 float128 float128_scalbn(float128 a, int n, float_status *status) 7755 { 7756 bool aSign; 7757 int32_t aExp; 7758 uint64_t aSig0, aSig1; 7759 7760 aSig1 = extractFloat128Frac1( a ); 7761 aSig0 = extractFloat128Frac0( a ); 7762 aExp = extractFloat128Exp( a ); 7763 aSign = extractFloat128Sign( a ); 7764 if ( aExp == 0x7FFF ) { 7765 if ( aSig0 | aSig1 ) { 7766 return propagateFloat128NaN(a, a, status); 7767 } 7768 return a; 7769 } 7770 if (aExp != 0) { 7771 aSig0 |= UINT64_C(0x0001000000000000); 7772 } else if (aSig0 == 0 && aSig1 == 0) { 7773 return a; 7774 } else { 7775 aExp++; 7776 } 7777 7778 if (n > 0x10000) { 7779 n = 0x10000; 7780 } else if (n < -0x10000) { 7781 n = -0x10000; 7782 } 7783 7784 aExp += n - 1; 7785 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7786 , status); 7787 7788 } 7789 7790 static void __attribute__((constructor)) softfloat_init(void) 7791 { 7792 union_float64 ua, ub, uc, ur; 7793 7794 if (QEMU_NO_HARDFLOAT) { 7795 return; 
7796 } 7797 /* 7798 * Test that the host's FMA is not obviously broken. For example, 7799 * glibc < 2.23 can perform an incorrect FMA on certain hosts; see 7800 * https://sourceware.org/bugzilla/show_bug.cgi?id=13304 7801 */ 7802 ua.s = 0x0020000000000001ULL; 7803 ub.s = 0x3ca0000000000000ULL; 7804 uc.s = 0x0020000000000000ULL; 7805 ur.h = fma(ua.h, ub.h, uc.h); 7806 if (ur.s != 0x0020000000000001ULL) { 7807 force_soft_fma = true; 7808 } 7809 } 7810